From e1430bd99f71196a0fba5e3edc72512b54628f7e Mon Sep 17 00:00:00 2001 From: Daniel Friesel Date: Thu, 22 Jun 2023 15:51:20 +0200 Subject: CPU-DPU: measure time for loading binary into dpu --- Microbenchmarks/CPU-DPU/Makefile | 13 +++++--- Microbenchmarks/CPU-DPU/dpu/size.c | 34 +++++++++++++++++++ Microbenchmarks/CPU-DPU/host/app.c | 57 +++++++++++++++++++++----------- Microbenchmarks/CPU-DPU/make-size.sh | 44 ++++++++++++++++++++++++ Microbenchmarks/CPU-DPU/run.sh | 18 +++++++++- Microbenchmarks/CPU-DPU/support/params.h | 10 +++++- Microbenchmarks/CPU-DPU/support/timer.h | 6 ++-- 7 files changed, 152 insertions(+), 30 deletions(-) create mode 100644 Microbenchmarks/CPU-DPU/dpu/size.c create mode 100755 Microbenchmarks/CPU-DPU/make-size.sh (limited to 'Microbenchmarks/CPU-DPU') diff --git a/Microbenchmarks/CPU-DPU/Makefile b/Microbenchmarks/CPU-DPU/Makefile index f552a6d..697dfcd 100644 --- a/Microbenchmarks/CPU-DPU/Makefile +++ b/Microbenchmarks/CPU-DPU/Makefile @@ -2,13 +2,13 @@ NR_TASKLETS ?= 16 BL ?= 8 NR_DPUS ?= 1 TRANSFER ?= PUSH +DPU_BINARY ?= '"bin/dpu_code"' COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) -DPU_SOURCES := $(wildcard dpu/*.c) COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} +HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} -DDPU_BINARY=${DPU_BINARY} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TRANSFER} QUIET = @ @@ -17,7 +17,7 @@ ifdef verbose QUIET = endif -all: bin/host_code bin/dpu_code +all: bin/host_code bin/dpu_code bin/dpu_size bin: ${QUIET}mkdir -p bin @@ -25,8 +25,11 @@ bin: bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} -bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} +bin/dpu_code: dpu/task.c ${COMMON_INCLUDES} bin + ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ dpu/task.c + +bin/dpu_size: dpu/size.c ${COMMON_INCLUDES} bin + ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ dpu/size.c clean: ${QUIET}rm -rf bin diff --git a/Microbenchmarks/CPU-DPU/dpu/size.c b/Microbenchmarks/CPU-DPU/dpu/size.c new file mode 100644 index 0000000..360ab47 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/dpu/size.c @@ -0,0 +1,34 @@ +/* +* Empty kernel with multiple tasklets +* +*/ +#include +#include +#include +#include +#include +#include +#include + +#include "../support/common.h" + +__host dpu_arguments_t DPU_INPUT_ARGUMENTS; + +extern int main_kernel1(void); + +int (*kernels[nr_kernels])(void) = {main_kernel1}; + +int main(void) { + // Kernel + return kernels[DPU_INPUT_ARGUMENTS.kernel](); +} + +// main_kernel1 +int main_kernel1() { +#if PRINT + unsigned int tasklet_id = me(); + printf("tasklet_id = %u\n", tasklet_id); +#endif +#include "nop.inc" + return 0; +} diff --git a/Microbenchmarks/CPU-DPU/host/app.c b/Microbenchmarks/CPU-DPU/host/app.c index 88148a1..633f080 100644 --- a/Microbenchmarks/CPU-DPU/host/app.c +++ b/Microbenchmarks/CPU-DPU/host/app.c @@ -58,16 +58,37 @@ int main(int argc, char **argv) { struct dpu_set_t dpu_set, dpu; uint32_t nr_of_dpus; - + + char ntpp[24]; + + // Timer declaration + Timer timer; + + snprintf(ntpp, 24, "nrThreadPerPool=%d", p.n_threads); // Allocate DPUs and load binary - DPU_ASSERT(dpu_alloc(NR_DPUS, "nrThreadPerPool=8", &dpu_set)); + start(&timer, 4, 0); + DPU_ASSERT(dpu_alloc(NR_DPUS, ntpp, &dpu_set)); + stop(&timer, 4); + start(&timer, 5, 0); DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + stop(&timer, 5); + start(&timer, 6, 0); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + stop(&timer, 6); //printf("Allocated %d DPU(s)\n", nr_of_dpus); unsigned int i = 0; unsigned int input_size = p.exp == 0 ? p.input_size * nr_of_dpus : p.input_size; + //printf("Load input data\n"); + // Input arguments + const unsigned int input_size_dpu = input_size / nr_of_dpus; +#ifdef BROADCAST + const unsigned int transfer_size = input_size_dpu; +#else + const unsigned int transfer_size = input_size; +#endif + // Input/output allocation A = malloc(input_size * sizeof(T)); B = malloc(input_size * sizeof(T)); @@ -79,22 +100,14 @@ int main(int argc, char **argv) { // Create an input file with arbitrary data read_input(A, B, input_size); - // Timer declaration - Timer timer; - //printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL); + printf("[::] NMC reconfiguration | n_dpus=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" + " | latency_dpu_alloc_us=%f latency_dpu_load_us=%f latency_dpu_get_us=%f\n", + nr_of_dpus, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, + timer.time[4], timer.time[5], timer.time[6]); // Loop over main kernel for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { - - //printf("Load input data\n"); - // Input arguments - const unsigned int input_size_dpu = input_size / nr_of_dpus; -#ifdef BROADCAST - const unsigned int transfer_size = input_size_dpu; -#else - const unsigned int transfer_size = input_size; -#endif // Copy input arrays if(rep >= p.n_warmup) start(&timer, 1, 0); @@ -119,7 +132,8 @@ int main(int argc, char **argv) { // Run DPU kernel if(rep >= p.n_warmup) start(&timer, 2, 0); - //DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + // empty kernel -> measure communication overhead + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); if(rep >= p.n_warmup) stop(&timer, 2); @@ -154,18 +168,21 @@ int main(int argc, char **argv) { stop(&timer, 3); if (rep >= p.n_warmup) { - printf("[::] NMC transfer | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%u e_mode=%s" + printf("[::] NMC transfer | n_dpus=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" " | throughput_dram_mram_MBps=%f throughput_mram_dram_MBps=%f", - nr_of_dpus, NR_TASKLETS, XSTR(T), transfer_size, transfer_mode, + nr_of_dpus, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, transfer_size * sizeof(T) / timer.time[1], transfer_size * sizeof(T) / timer.time[3]); - printf(" throughput_dram_mram_MOpps=%f throughput_mram_dram_MOpps=%f\n", + printf(" throughput_dram_mram_MOpps=%f throughput_mram_dram_MOpps=%f", transfer_size / timer.time[1], transfer_size / timer.time[3]); + printf(" latency_dpu_launch_us=%f\n", + timer.time[2]); } } // Print timing results + /* printf("CPU-DPU "); print(&timer, 1, p.n_reps); double time_load = timer.time[1] / (1000 * 1); @@ -177,7 +194,7 @@ int main(int argc, char **argv) { print(&timer, 3, p.n_reps); double time_retrieve = timer.time[3] / (1000 * 1); printf("DPU-CPU Bandwidth (GB/s): %f\n", (input_size * 8)/(time_retrieve*1e6)); - + */ // Check output bool status = true; #ifdef BROADCAST @@ -200,7 +217,7 @@ int main(int argc, char **argv) { } #endif if (status) { - printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); + //printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); } else { printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); } diff --git a/Microbenchmarks/CPU-DPU/make-size.sh b/Microbenchmarks/CPU-DPU/make-size.sh new file mode 100755 index 0000000..203d1e4 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/make-size.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +: > dpu/size.inc +: > dpu/nop.inc + +for i in $(seq 1 $1); do + for i in $(seq 1 128); do + echo 'asm volatile("nop");' >> dpu/nop.inc + done + cat >>dpu/size.inc < input size (default=8K elements)" + "\n -n number of threads per pool (default=8)" + "\n -N number of nops in dpu task (default=0)" "\n"); } struct Params input_params(int argc, char **argv) { struct Params p; p.input_size = 8 << 10; + p.n_threads = 8; + p.n_nops = 0; p.n_warmup = 1; p.n_reps = 3; p.exp = 0; int opt; - while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { + while((opt = getopt(argc, argv, "hi:n:w:e:x:N:")) >= 0) { switch(opt) { case 'h': usage(); exit(0); break; case 'i': p.input_size = atoi(optarg); break; + case 'n': p.n_threads = atoi(optarg); break; + case 'N': p.n_nops = atoi(optarg); break; case 'w': p.n_warmup = atoi(optarg); break; case 'e': p.n_reps = atoi(optarg); break; case 'x': p.exp = atoi(optarg); break; diff --git a/Microbenchmarks/CPU-DPU/support/timer.h b/Microbenchmarks/CPU-DPU/support/timer.h index eedc385..7c24f3b 100755 --- a/Microbenchmarks/CPU-DPU/support/timer.h +++ b/Microbenchmarks/CPU-DPU/support/timer.h @@ -37,9 +37,9 @@ typedef struct Timer{ - struct timeval startTime[4]; - struct timeval stopTime[4]; - double time[4]; + struct timeval startTime[10]; + struct timeval stopTime[10]; + double time[10]; }Timer; -- cgit v1.2.3