diff options
-rw-r--r-- | Microbenchmarks/STREAM/Makefile | 3 |
-rw-r--r-- | Microbenchmarks/STREAM/host/app.c | 52 |
2 files changed, 46 insertions, 9 deletions
diff --git a/Microbenchmarks/STREAM/Makefile b/Microbenchmarks/STREAM/Makefile index e653af7..048a46e 100644 --- a/Microbenchmarks/STREAM/Makefile +++ b/Microbenchmarks/STREAM/Makefile @@ -10,13 +10,14 @@ WITH_LOAD_OVERHEAD ?= 0 WITH_FREE_OVERHEAD ?= 0 WITH_DPUINFO ?= 0 SDK_SINGLETHREADED ?= 0 +TRANSFER ?= SERIAL DPU_SOURCES = dpu/${OP}.c HOST_SOURCES = $(wildcard host/*.c) COMMON_INCLUDES = support COMMON_FLAGS = -Wall -Wextra -O2 -I${COMMON_INCLUDES} -DNR_DPUS=${NR_DPUS} -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -DT=${T} -D${OP} -D${MEM} -DUNROLL=${UNROLL} -HOST_FLAGS = ${COMMON_FLAGS} -D_POSIX_C_SOURCE=200809L -std=c11 `dpu-pkg-config --cflags --libs dpu` -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} -DSDK_SINGLETHREADED=${SDK_SINGLETHREADED} +HOST_FLAGS = ${COMMON_FLAGS} -D_POSIX_C_SOURCE=200809L -std=c11 `dpu-pkg-config --cflags --libs dpu` -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} -DSDK_SINGLETHREADED=${SDK_SINGLETHREADED} -D${TRANSFER} DPU_FLAGS = ${COMMON_FLAGS} -flto QUIET = @ diff --git a/Microbenchmarks/STREAM/host/app.c b/Microbenchmarks/STREAM/host/app.c index 6ae7829..bd358cf 100644 --- a/Microbenchmarks/STREAM/host/app.c +++ b/Microbenchmarks/STREAM/host/app.c @@ -44,6 +44,14 @@ static T* C; #endif static T* C2; +static const char transfer_mode[] = +#if SERIAL +"SERIAL" +#else +"PUSH" +#endif +; + // Create input arrays static void read_input(T* A, T* B, unsigned int nr_elements) { srand(0); @@ -208,14 +216,25 @@ int main(int argc, char **argv) { dpu_arguments_t input_arguments = {input_size_dpu * sizeof(T), kernel}; DPU_ASSERT(dpu_copy_to(dpu_set, "DPU_INPUT_ARGUMENTS", 0, (const void *)&input_arguments, sizeof(input_arguments))); // Copy input arrays - i = 0; - DPU_FOREACH (dpu_set, dpu) { +#ifdef SERIAL + DPU_FOREACH 
(dpu_set, dpu, i) { DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, 0, bufferA + input_size_dpu * i, input_size_dpu * sizeof(T))); #if defined(add) || defined(triad) DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu * sizeof(T), bufferB + input_size_dpu * i, input_size_dpu * sizeof(T))); #endif - i++; } +#else + DPU_FOREACH (dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu * i)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); +#if defined(add) || defined(triad) + DPU_FOREACH (dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu * i)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu * sizeof(T), input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); +#endif +#endif if(rep >= p.n_warmup) { stop(&timer, 3); } @@ -244,20 +263,37 @@ int main(int argc, char **argv) { if(rep >= p.n_warmup) { start(&timer, 5, 0); } + dpu_results_t results[NR_DPUS]; - i = 0; - DPU_FOREACH (dpu_set, dpu) { + +#ifdef SERIAL + DPU_FOREACH (dpu_set, dpu, i) { // Copy output array #if defined(add) || defined(triad) DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, 2 * input_size_dpu * sizeof(T), bufferC + input_size_dpu * i, input_size_dpu * sizeof(T))); #else DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu * sizeof(T), bufferB + input_size_dpu * i, input_size_dpu * sizeof(T))); #endif - i++; } +#else + DPU_FOREACH (dpu_set, dpu, i) { +#if defined(add) || defined(triad) + DPU_ASSERT(dpu_prepare_xfer(dpu, bufferC + input_size_dpu * i)); +#else + DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu * i)); +#endif + } +#if defined(add) || defined(triad) + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, 2 * input_size_dpu * sizeof(T), input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); +#else + 
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu * sizeof(T), input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); +#endif +#endif + if(rep >= p.n_warmup) { stop(&timer, 5); } + #if PERF i = 0; DPU_FOREACH (dpu_set, dpu) { @@ -327,8 +363,8 @@ int main(int argc, char **argv) { if (status) { printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); - printf("[::] STREAM UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_benchmark=%-6s e_type=%s e_mem=%s b_unroll=%d block_size_B=%d n_elements=%d n_elements_per_dpu=%d b_sdk_singlethreaded=%d ", - NR_DPUS, nr_of_ranks, NR_TASKLETS, benchmark_name, XSTR(T), mem_name, UNROLL, BLOCK_SIZE, input_size, input_size / NR_DPUS, SDK_SINGLETHREADED); + printf("[::] STREAM UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_benchmark=%-6s e_type=%s e_mem=%s e_mode=%s b_unroll=%d block_size_B=%d n_elements=%d n_elements_per_dpu=%d b_sdk_singlethreaded=%d ", + NR_DPUS, nr_of_ranks, NR_TASKLETS, benchmark_name, XSTR(T), mem_name, transfer_mode, UNROLL, BLOCK_SIZE, input_size, input_size / NR_DPUS, SDK_SINGLETHREADED); printf("| latency_alloc_ns=%lu latency_load_ns=%lu latency_cpu_ns=%lu latency_write_ns=%lu latency_kernel_ns=%lu latency_read_ns=%lu latency_free_ns=%lu", timer.nanoseconds[0], timer.nanoseconds[1], |