diff options
Diffstat (limited to 'Microbenchmarks/CPU-DPU')
-rw-r--r-- | Microbenchmarks/CPU-DPU/Makefile | 2 | ||||
-rw-r--r-- | Microbenchmarks/CPU-DPU/host/app.c | 35 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh | 7 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh | 18 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run-alloc-rank.sh | 25 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh | 7 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh | 18 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run-transfer-rank.sh | 27 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/support/timer.h | 60 |
9 files changed, 126 insertions, 73 deletions
diff --git a/Microbenchmarks/CPU-DPU/Makefile b/Microbenchmarks/CPU-DPU/Makefile index 697dfcd..f354e67 100644 --- a/Microbenchmarks/CPU-DPU/Makefile +++ b/Microbenchmarks/CPU-DPU/Makefile @@ -8,7 +8,7 @@ COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} -DDPU_BINARY=${DPU_BINARY} +HOST_FLAGS := ${COMMON_FLAGS} -D_POSIX_C_SOURCE=200809L -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} -DDPU_BINARY=${DPU_BINARY} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TRANSFER} QUIET = @ diff --git a/Microbenchmarks/CPU-DPU/host/app.c b/Microbenchmarks/CPU-DPU/host/app.c index 07431ee..7c402bc 100644 --- a/Microbenchmarks/CPU-DPU/host/app.c +++ b/Microbenchmarks/CPU-DPU/host/app.c @@ -104,9 +104,9 @@ int main(int argc, char **argv) { //printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL); printf("[::] NMC reconfiguration | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" - " | latency_dpu_alloc_us=%f latency_dpu_load_us=%f latency_dpu_get_us=%f\n", + " | latency_dpu_alloc_ns=%lu latency_dpu_load_ns=%lu latency_dpu_get_ns=%lu\n", nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, - timer.time[4], timer.time[5], timer.time[6]); + timer.nanoseconds[4], timer.nanoseconds[5], timer.nanoseconds[6]); // Loop over main kernel for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { @@ -171,32 +171,19 @@ int main(int argc, char **argv) { if (rep >= p.n_warmup) { printf("[::] transfer UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" - " | throughput_dram_mram_MBps=%f throughput_mram_dram_MBps=%f", + " | latency_dram_mram_ns=%lu latency_mram_dram_ns=%lu throughput_dram_mram_Bps=%f throughput_mram_dram_Bps=%f", nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, - transfer_size * sizeof(T) / timer.time[1], - transfer_size * sizeof(T) / timer.time[3]); - printf(" throughput_dram_mram_MOpps=%f throughput_mram_dram_MOpps=%f", - transfer_size / timer.time[1], - transfer_size / timer.time[3]); - printf(" latency_dpu_launch_us=%f\n", - timer.time[2]); + timer.nanoseconds[1], timer.nanoseconds[3], + transfer_size * sizeof(T) * 1e9 / timer.nanoseconds[1], + transfer_size * sizeof(T) * 1e9 / timer.nanoseconds[3]); + printf(" throughput_dram_mram_Opps=%f throughput_mram_dram_Opps=%f", + transfer_size * 1e9 / timer.nanoseconds[1], + transfer_size * 1e9 / timer.nanoseconds[3]); + printf(" latency_dpu_launch_ns=%lu\n", + timer.nanoseconds[2]); } } - // Print timing results - /* - printf("CPU-DPU "); - print(&timer, 1, p.n_reps); - double time_load = timer.time[1] / (1000 * 1); - printf("CPU-DPU Bandwidth (GB/s): %f\n", (input_size * 8)/(time_load*1e6)); - printf("DPU Kernel "); - print(&timer, 2, p.n_reps); - printf("\n"); - printf("DPU-CPU "); - print(&timer, 3, p.n_reps); - double time_retrieve = timer.time[3] / (1000 * 1); - printf("DPU-CPU Bandwidth (GB/s): %f\n", (input_size * 8)/(time_retrieve*1e6)); - */ // Check output bool status = true; #ifdef BROADCASTX diff --git a/Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh b/Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh new file mode 100755 index 0000000..321f14c --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +mkdir -p "$(hostname)-alloc" + +./run-alloc-rank.sh | tee "$(hostname)-alloc/rank-idle.txt" + +xz -v -9 -M 800M "$(hostname)-alloc/rank-idle.txt" diff --git a/Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh b/Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh new file mode 100755 index 0000000..cf90174 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +mkdir -p "$(hostname)-alloc" + +NCORES=$(grep -c '^processor' /proc/cpuinfo) +cleanexit() { + pkill -f "stress -c ${NCORES}" +} + +trap cleanexit TERM INT + +stress -c ${NCORES} & + +./run-alloc-rank.sh | tee "$(hostname)-alloc/rank-stress-c${NCORES}.txt" + +cleanexit + +xz -v -9 -M 800M "$(hostname)-alloc/rank-stress-c${NCORES}.txt" diff --git a/Microbenchmarks/CPU-DPU/run-alloc-rank.sh b/Microbenchmarks/CPU-DPU/run-alloc-rank.sh new file mode 100755 index 0000000..a6907fe --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-alloc-rank.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +set -e + +echo "prim-benchmarks CPU-DPU alloc (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +# runtime exclusive of host_code execution time: 25 seconds per inner loop +# *16 -> about 7 minutes per outer loop +# *163 -> about 18 hours total +for i in 1 4 8 16 32 48 64; do + for j in $(seq 0 16); do + echo $i/64 $j/16 + ./make-size.sh $j + n_nops=$((j * 256)) + if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\'; then + for l in $(seq 1 100); do + bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops || true + done + fi + done +done + +echo "Completed at $(date)" diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh new file mode 100755 index 0000000..89dda03 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +mkdir -p "$(hostname)-transfer" + +./run-transfer-rank.sh | tee "$(hostname)-transfer/rank-idle.txt" + +xz -v -9 -M 800M "$(hostname)-transfer/rank-idle.txt" diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh new file mode 100755 index 0000000..e2f4020 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +mkdir -p "$(hostname)-transfer" + +NCORES=$(grep -c '^processor' /proc/cpuinfo) +cleanexit() { + pkill -f "stress -c ${NCORES}" +} + +trap cleanexit TERM INT + +stress -c ${NCORES} & + +./run-transfer-rank.sh | tee "$(hostname)-transfer/rank-stress-c${NCORES}.txt" + +cleanexit + +xz -v -9 -M 800M "$(hostname)-transfer/rank-stress-c${NCORES}.txt" diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank.sh new file mode 100755 index 0000000..61e9dba --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-transfer-rank.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +set -e + +echo "prim-benchmarks CPU-DPU transfer (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +./make-size.sh 0 + +for i in 1 4 8 16 32 48 64; do + for k in SERIAL PUSH BROADCAST; do + # BROADCAST sends the same data to all DPUs, so data size must not exceed the amount of MRAM available on a single DPU (i.e., 64 MB) + # 8 B ... 64 MB + for l in 1 16 256 4096 65536 262144 1048576 4194304 6291456 8388608; do + make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k + bin/host_code -w 0 -e 100 -x 1 -i $l + done + done + # maximum amount of data + for k in SERIAL PUSH; do + make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k + bin/host_code -w 0 -e 100 -x 1 -i $(( 4194304 * i )) + bin/host_code -w 0 -e 100 -x 1 -i $(( 6291456 * i )) + bin/host_code -w 0 -e 100 -x 1 -i $(( 8388608 * i )) + done +done diff --git a/Microbenchmarks/CPU-DPU/support/timer.h b/Microbenchmarks/CPU-DPU/support/timer.h index 7c24f3b..0a4d6a1 100755 --- a/Microbenchmarks/CPU-DPU/support/timer.h +++ b/Microbenchmarks/CPU-DPU/support/timer.h @@ -1,59 +1,23 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
+#pragma once
+#include <time.h>
-#include <sys/time.h>
+typedef struct Timer {
-typedef struct Timer{
+ struct timespec startTime[7];
+ struct timespec stopTime[7];
+ uint64_t nanoseconds[7];
- struct timeval startTime[10];
- struct timeval stopTime[10];
- double time[10];
-
-}Timer;
+} Timer;
void start(Timer *timer, int i, int rep) {
if(rep == 0) {
- timer->time[i] = 0.0;
+ timer->nanoseconds[i] = 0;
}
- gettimeofday(&timer->startTime[i], NULL);
+ clock_gettime(CLOCK_MONOTONIC, &timer->startTime[i]);
}
void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+ clock_gettime(CLOCK_MONOTONIC, &timer->stopTime[i]);
+ timer->nanoseconds[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000000 +
+ (timer->stopTime[i].tv_nsec - timer->startTime[i].tv_nsec);
}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
|