diff options
| field | value | date |
|---|---|---|
| author | Daniel Friesel <daniel.friesel@uos.de> | 2023-06-22 15:51:20 +0200 |
| committer | Daniel Friesel <daniel.friesel@uos.de> | 2023-06-22 15:51:20 +0200 |
| commit | e1430bd99f71196a0fba5e3edc72512b54628f7e (patch) | |
| tree | 6dd59838c00b58f6a84a740996abdd1031a3683e /Microbenchmarks | |
| parent | 3884cdaff9c0fbd149931f3e4ddf28e4624652e4 (diff) | |
CPU-DPU: measure time for loading binary into dpu
Diffstat (limited to 'Microbenchmarks')
-rw-r--r-- | Microbenchmarks/CPU-DPU/Makefile | 13 | ||||
-rw-r--r-- | Microbenchmarks/CPU-DPU/dpu/size.c | 34 | ||||
-rw-r--r-- | Microbenchmarks/CPU-DPU/host/app.c | 57 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/make-size.sh | 44 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run.sh | 18 | ||||
-rw-r--r-- | Microbenchmarks/CPU-DPU/support/params.h | 10 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/support/timer.h | 6 |
7 files changed, 152 insertions, 30 deletions
diff --git a/Microbenchmarks/CPU-DPU/Makefile b/Microbenchmarks/CPU-DPU/Makefile index f552a6d..697dfcd 100644 --- a/Microbenchmarks/CPU-DPU/Makefile +++ b/Microbenchmarks/CPU-DPU/Makefile @@ -2,13 +2,13 @@ NR_TASKLETS ?= 16 BL ?= 8 NR_DPUS ?= 1 TRANSFER ?= PUSH +DPU_BINARY ?= '"bin/dpu_code"' COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) -DPU_SOURCES := $(wildcard dpu/*.c) COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} +HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} -DDPU_BINARY=${DPU_BINARY} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TRANSFER} QUIET = @ @@ -17,7 +17,7 @@ ifdef verbose QUIET = endif -all: bin/host_code bin/dpu_code +all: bin/host_code bin/dpu_code bin/dpu_size bin: ${QUIET}mkdir -p bin @@ -25,8 +25,11 @@ bin: bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} -bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} +bin/dpu_code: dpu/task.c ${COMMON_INCLUDES} bin + ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ dpu/task.c + +bin/dpu_size: dpu/size.c ${COMMON_INCLUDES} bin + ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ dpu/size.c clean: ${QUIET}rm -rf bin diff --git a/Microbenchmarks/CPU-DPU/dpu/size.c b/Microbenchmarks/CPU-DPU/dpu/size.c new file mode 100644 index 0000000..360ab47 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/dpu/size.c @@ -0,0 +1,34 @@ +/* +* Empty kernel with multiple tasklets +* +*/ +#include <stdint.h> +#include <stdio.h> +#include <defs.h> +#include <mram.h> +#include <alloc.h> +#include <perfcounter.h> +#include <barrier.h> + +#include "../support/common.h" + +__host dpu_arguments_t 
DPU_INPUT_ARGUMENTS; + +extern int main_kernel1(void); + +int (*kernels[nr_kernels])(void) = {main_kernel1}; + +int main(void) { + // Kernel + return kernels[DPU_INPUT_ARGUMENTS.kernel](); +} + +// main_kernel1 +int main_kernel1() { +#if PRINT + unsigned int tasklet_id = me(); + printf("tasklet_id = %u\n", tasklet_id); +#endif +#include "nop.inc" + return 0; +} diff --git a/Microbenchmarks/CPU-DPU/host/app.c b/Microbenchmarks/CPU-DPU/host/app.c index 88148a1..633f080 100644 --- a/Microbenchmarks/CPU-DPU/host/app.c +++ b/Microbenchmarks/CPU-DPU/host/app.c @@ -58,16 +58,37 @@ int main(int argc, char **argv) { struct dpu_set_t dpu_set, dpu; uint32_t nr_of_dpus; - + + char ntpp[24]; + + // Timer declaration + Timer timer; + + snprintf(ntpp, 24, "nrThreadPerPool=%d", p.n_threads); // Allocate DPUs and load binary - DPU_ASSERT(dpu_alloc(NR_DPUS, "nrThreadPerPool=8", &dpu_set)); + start(&timer, 4, 0); + DPU_ASSERT(dpu_alloc(NR_DPUS, ntpp, &dpu_set)); + stop(&timer, 4); + start(&timer, 5, 0); DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + stop(&timer, 5); + start(&timer, 6, 0); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + stop(&timer, 6); //printf("Allocated %d DPU(s)\n", nr_of_dpus); unsigned int i = 0; unsigned int input_size = p.exp == 0 ? 
p.input_size * nr_of_dpus : p.input_size; + //printf("Load input data\n"); + // Input arguments + const unsigned int input_size_dpu = input_size / nr_of_dpus; +#ifdef BROADCAST + const unsigned int transfer_size = input_size_dpu; +#else + const unsigned int transfer_size = input_size; +#endif + // Input/output allocation A = malloc(input_size * sizeof(T)); B = malloc(input_size * sizeof(T)); @@ -79,22 +100,14 @@ int main(int argc, char **argv) { // Create an input file with arbitrary data read_input(A, B, input_size); - // Timer declaration - Timer timer; - //printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL); + printf("[::] NMC reconfiguration | n_dpus=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" + " | latency_dpu_alloc_us=%f latency_dpu_load_us=%f latency_dpu_get_us=%f\n", + nr_of_dpus, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, + timer.time[4], timer.time[5], timer.time[6]); // Loop over main kernel for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { - - //printf("Load input data\n"); - // Input arguments - const unsigned int input_size_dpu = input_size / nr_of_dpus; -#ifdef BROADCAST - const unsigned int transfer_size = input_size_dpu; -#else - const unsigned int transfer_size = input_size; -#endif // Copy input arrays if(rep >= p.n_warmup) start(&timer, 1, 0); @@ -119,7 +132,8 @@ int main(int argc, char **argv) { // Run DPU kernel if(rep >= p.n_warmup) start(&timer, 2, 0); - //DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + // empty kernel -> measure communication overhead + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); if(rep >= p.n_warmup) stop(&timer, 2); @@ -154,18 +168,21 @@ int main(int argc, char **argv) { stop(&timer, 3); if (rep >= p.n_warmup) { - printf("[::] NMC transfer | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%u e_mode=%s" + printf("[::] NMC transfer | n_dpus=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" " | throughput_dram_mram_MBps=%f throughput_mram_dram_MBps=%f", - 
nr_of_dpus, NR_TASKLETS, XSTR(T), transfer_size, transfer_mode, + nr_of_dpus, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, transfer_size * sizeof(T) / timer.time[1], transfer_size * sizeof(T) / timer.time[3]); - printf(" throughput_dram_mram_MOpps=%f throughput_mram_dram_MOpps=%f\n", + printf(" throughput_dram_mram_MOpps=%f throughput_mram_dram_MOpps=%f", transfer_size / timer.time[1], transfer_size / timer.time[3]); + printf(" latency_dpu_launch_us=%f\n", + timer.time[2]); } } // Print timing results + /* printf("CPU-DPU "); print(&timer, 1, p.n_reps); double time_load = timer.time[1] / (1000 * 1); @@ -177,7 +194,7 @@ int main(int argc, char **argv) { print(&timer, 3, p.n_reps); double time_retrieve = timer.time[3] / (1000 * 1); printf("DPU-CPU Bandwidth (GB/s): %f\n", (input_size * 8)/(time_retrieve*1e6)); - + */ // Check output bool status = true; #ifdef BROADCAST @@ -200,7 +217,7 @@ int main(int argc, char **argv) { } #endif if (status) { - printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); + //printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); } else { printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); } diff --git a/Microbenchmarks/CPU-DPU/make-size.sh b/Microbenchmarks/CPU-DPU/make-size.sh new file mode 100755 index 0000000..203d1e4 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/make-size.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +: > dpu/size.inc +: > dpu/nop.inc + +for i in $(seq 1 $1); do + for i in $(seq 1 128); do + echo 'asm volatile("nop");' >> dpu/nop.inc + done + cat >>dpu/size.inc <<EOF +volatile unsigned int data${i}a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 
31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; +volatile unsigned int data${i}b[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; +volatile unsigned int data${i}c[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 
28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; +volatile unsigned int data${i}d[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; +EOF +done diff --git a/Microbenchmarks/CPU-DPU/run.sh b/Microbenchmarks/CPU-DPU/run.sh index 48261ec..8107e2e 100755 --- a/Microbenchmarks/CPU-DPU/run.sh +++ b/Microbenchmarks/CPU-DPU/run.sh @@ -2,7 +2,23 @@ set -e -for i in 1 2 4 8 16 32 64; do +for i in 1 2 4 8 16 32 64 128 256 512; do + for k in SERIAL PUSH BROADCAST; do + for j in $(seq 0 32); do + 
./make-size.sh $j + n_nops=$((j * 128)) + if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\'; then + for l in $(seq 1 30); do + bin/host_code -w 1 -e 0 -N $n_nops + done + fi + done + done +done + +./make-size.sh 0 + +for i in 1 2 4 8 16 32 64 128 256 512; do for j in 1; do for k in SERIAL PUSH BROADCAST; do # 8 B ... 64 MB diff --git a/Microbenchmarks/CPU-DPU/support/params.h b/Microbenchmarks/CPU-DPU/support/params.h index 4618411..1ecf71d 100644 --- a/Microbenchmarks/CPU-DPU/support/params.h +++ b/Microbenchmarks/CPU-DPU/support/params.h @@ -5,6 +5,8 @@ typedef struct Params { unsigned int input_size; + unsigned int n_threads; + unsigned int n_nops; int n_warmup; int n_reps; int exp; @@ -22,24 +24,30 @@ static void usage() { "\n" "\nBenchmark-specific options:" "\n -i <I> input size (default=8K elements)" + "\n -n <N> number of threads per pool (default=8)" + "\n -N <N> number of nops in dpu task (default=0)" "\n"); } struct Params input_params(int argc, char **argv) { struct Params p; p.input_size = 8 << 10; + p.n_threads = 8; + p.n_nops = 0; p.n_warmup = 1; p.n_reps = 3; p.exp = 0; int opt; - while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { + while((opt = getopt(argc, argv, "hi:n:w:e:x:N:")) >= 0) { switch(opt) { case 'h': usage(); exit(0); break; case 'i': p.input_size = atoi(optarg); break; + case 'n': p.n_threads = atoi(optarg); break; + case 'N': p.n_nops = atoi(optarg); break; case 'w': p.n_warmup = atoi(optarg); break; case 'e': p.n_reps = atoi(optarg); break; case 'x': p.exp = atoi(optarg); break; diff --git a/Microbenchmarks/CPU-DPU/support/timer.h b/Microbenchmarks/CPU-DPU/support/timer.h index eedc385..7c24f3b 100755 --- a/Microbenchmarks/CPU-DPU/support/timer.h +++ b/Microbenchmarks/CPU-DPU/support/timer.h @@ -37,9 +37,9 @@ typedef struct Timer{
- struct timeval startTime[4];
- struct timeval stopTime[4];
- double time[4];
+ struct timeval startTime[10];
+ struct timeval stopTime[10];
+ double time[10];
}Timer;
|