diff options
| author | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-03-04 14:01:39 +0100 | 
|---|---|---|
| committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-03-04 14:01:39 +0100 | 
| commit | 9c6c6d37c52facc0ee2f289db4e021766768f5c6 (patch) | |
| tree | 95c3800e6f30cb6b57286ed93e30bbe398abb769 /Microbenchmarks | |
| parent | cb66c09298004a6dc6ac88300bee8441d4046a29 (diff) | |
STREAM: support SERIAL (default) and PUSH (new) transfers
Diffstat (limited to 'Microbenchmarks')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | Microbenchmarks/STREAM/Makefile | 3 |
| -rw-r--r-- | Microbenchmarks/STREAM/host/app.c | 52 |
2 files changed, 46 insertions, 9 deletions
| diff --git a/Microbenchmarks/STREAM/Makefile b/Microbenchmarks/STREAM/Makefile index e653af7..048a46e 100644 --- a/Microbenchmarks/STREAM/Makefile +++ b/Microbenchmarks/STREAM/Makefile @@ -10,13 +10,14 @@ WITH_LOAD_OVERHEAD ?= 0  WITH_FREE_OVERHEAD ?= 0  WITH_DPUINFO ?= 0  SDK_SINGLETHREADED ?= 0 +TRANSFER ?= SERIAL  DPU_SOURCES = dpu/${OP}.c  HOST_SOURCES = $(wildcard host/*.c)  COMMON_INCLUDES = support  COMMON_FLAGS = -Wall -Wextra -O2 -I${COMMON_INCLUDES} -DNR_DPUS=${NR_DPUS} -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -DT=${T} -D${OP} -D${MEM} -DUNROLL=${UNROLL} -HOST_FLAGS = ${COMMON_FLAGS} -D_POSIX_C_SOURCE=200809L -std=c11 `dpu-pkg-config --cflags --libs dpu` -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} -DSDK_SINGLETHREADED=${SDK_SINGLETHREADED} +HOST_FLAGS = ${COMMON_FLAGS} -D_POSIX_C_SOURCE=200809L -std=c11 `dpu-pkg-config --cflags --libs dpu` -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} -DSDK_SINGLETHREADED=${SDK_SINGLETHREADED} -D${TRANSFER}  DPU_FLAGS = ${COMMON_FLAGS} -flto  QUIET = @ diff --git a/Microbenchmarks/STREAM/host/app.c b/Microbenchmarks/STREAM/host/app.c index 6ae7829..bd358cf 100644 --- a/Microbenchmarks/STREAM/host/app.c +++ b/Microbenchmarks/STREAM/host/app.c @@ -44,6 +44,14 @@ static T* C;  #endif  static T* C2; +static const char transfer_mode[] = +#if SERIAL +"SERIAL" +#else +"PUSH" +#endif +; +  // Create input arrays  static void read_input(T* A, T* B, unsigned int nr_elements) {      srand(0); @@ -208,14 +216,25 @@ int main(int argc, char **argv) {          dpu_arguments_t input_arguments = {input_size_dpu * sizeof(T), kernel};          DPU_ASSERT(dpu_copy_to(dpu_set, "DPU_INPUT_ARGUMENTS", 0, (const void *)&input_arguments, sizeof(input_arguments)));          // Copy input arrays -        i = 0; -      
  DPU_FOREACH (dpu_set, dpu) { +#ifdef SERIAL +        DPU_FOREACH (dpu_set, dpu, i) {              DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, 0, bufferA + input_size_dpu * i, input_size_dpu * sizeof(T)));  #if defined(add) || defined(triad)              DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu * sizeof(T), bufferB + input_size_dpu * i, input_size_dpu * sizeof(T)));  #endif -            i++;          } +#else +        DPU_FOREACH (dpu_set, dpu, i) { +            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu * i)); +        } +        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); +#if defined(add) || defined(triad) +        DPU_FOREACH (dpu_set, dpu, i) { +            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu * i)); +        } +        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu * sizeof(T), input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); +#endif +#endif          if(rep >= p.n_warmup) {              stop(&timer, 3);          } @@ -244,20 +263,37 @@ int main(int argc, char **argv) {          if(rep >= p.n_warmup) {              start(&timer, 5, 0);          } +          dpu_results_t results[NR_DPUS]; -        i = 0; -        DPU_FOREACH (dpu_set, dpu) { + +#ifdef SERIAL +        DPU_FOREACH (dpu_set, dpu, i) {              // Copy output array  #if defined(add) || defined(triad)              DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, 2 * input_size_dpu * sizeof(T), bufferC + input_size_dpu * i, input_size_dpu * sizeof(T)));  #else              DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu * sizeof(T), bufferB + input_size_dpu * i, input_size_dpu * sizeof(T)));  #endif -            i++;          } +#else +        DPU_FOREACH (dpu_set, dpu, i) { +#if defined(add) || defined(triad) +            DPU_ASSERT(dpu_prepare_xfer(dpu, 
bufferC + input_size_dpu * i)); +#else +            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu * i)); +#endif +        } +#if defined(add) || defined(triad) +        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, 2 * input_size_dpu * sizeof(T), input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); +#else +        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu * sizeof(T), input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); +#endif +#endif +          if(rep >= p.n_warmup) {              stop(&timer, 5);          } +  #if PERF          i = 0;          DPU_FOREACH (dpu_set, dpu) { @@ -327,8 +363,8 @@ int main(int argc, char **argv) {          if (status) {              printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); -            printf("[::] STREAM UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_benchmark=%-6s e_type=%s e_mem=%s b_unroll=%d block_size_B=%d n_elements=%d n_elements_per_dpu=%d b_sdk_singlethreaded=%d ", -                NR_DPUS, nr_of_ranks, NR_TASKLETS, benchmark_name, XSTR(T), mem_name, UNROLL, BLOCK_SIZE, input_size, input_size / NR_DPUS, SDK_SINGLETHREADED); +            printf("[::] STREAM UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_benchmark=%-6s e_type=%s e_mem=%s e_mode=%s b_unroll=%d block_size_B=%d n_elements=%d n_elements_per_dpu=%d b_sdk_singlethreaded=%d ", +                NR_DPUS, nr_of_ranks, NR_TASKLETS, benchmark_name, XSTR(T), mem_name, transfer_mode, UNROLL, BLOCK_SIZE, input_size, input_size / NR_DPUS, SDK_SINGLETHREADED);              printf("| latency_alloc_ns=%lu latency_load_ns=%lu latency_cpu_ns=%lu latency_write_ns=%lu latency_kernel_ns=%lu latency_read_ns=%lu latency_free_ns=%lu",                  timer.nanoseconds[0],                  timer.nanoseconds[1], | 
