diff options
-rw-r--r-- | COUNT/baselines/cpu/Makefile | 27 | ||||
-rw-r--r-- | COUNT/baselines/cpu/app_baseline.c | 22 | ||||
-rw-r--r-- | COUNT/host/app.c | 34 |
3 files changed, 60 insertions, 23 deletions
diff --git a/COUNT/baselines/cpu/Makefile b/COUNT/baselines/cpu/Makefile index 4608944..ede0498 100644 --- a/COUNT/baselines/cpu/Makefile +++ b/COUNT/baselines/cpu/Makefile @@ -1,8 +1,23 @@ -NUMA ?= 0 -FLAGS = +benchmark ?= 1 +debug ?= 0 +native ?= 1 +nop_sync ?= 0 +numa ?= 0 +numa_memcpy ?= 0 -ifeq (${NUMA}, 1) - FLAGS += -lnuma +CFLAGS = +LDFLAGS = + +ifeq (${debug}, 1) + CFLAGS += -g +endif + +ifeq (${native}, 1) + CFLAGS += -march=native +endif + +ifeq (${numa}, 1) + LDFLAGS += -lnuma endif .PHONY: all @@ -11,7 +26,7 @@ all: count TYPE ?= uint64_t count: app_baseline.c - gcc -Wall -Wextra -pedantic -march=native -O2 -o count -fopenmp -DT=${TYPE} -DNUMA=${NUMA} app_baseline.c ${FLAGS} + gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -o count -DT=${TYPE} -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} app_baseline.c -fopenmp ${LDFLAGS} .PHONY: run run: count @@ -19,4 +34,4 @@ run: count .PHONY: clean clean: - rm -f count count_O0 count_O2 + rm -f count diff --git a/COUNT/baselines/cpu/app_baseline.c b/COUNT/baselines/cpu/app_baseline.c index d52257a..4e96276 100644 --- a/COUNT/baselines/cpu/app_baseline.c +++ b/COUNT/baselines/cpu/app_baseline.c @@ -12,7 +12,13 @@ #include <assert.h> #include <stdint.h> #include <omp.h> + +#if WITH_BENCHMARK #include "../../support/timer.h" +#else +#define start(...) +#define stop(...) +#endif #if NUMA #include <numaif.h> @@ -186,13 +192,22 @@ int main(int argc, char **argv) { // Create an input file with arbitrary data. create_test_file(file_size); +#if WITH_BENCHMARK Timer timer; +#endif + +#if NOP_SYNC + for(int rep = 0; rep < 200000; rep++) { + asm volatile("nop" ::); + } +#endif for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { start(&timer, 0, 0); total_count = count_host(file_size, p.n_threads); stop(&timer, 0); +#if WITH_BENCHMARK unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic @@ -213,8 +228,15 @@ int main(int argc, char **argv) { file_size / timer.time[0]); printall(&timer, 0); } +#endif // WITH_BENCHMARK } +#if NOP_SYNC + for(int rep = 0; rep < 200000; rep++) { + asm volatile("nop" ::); + } +#endif + #if NUMA numa_free(A, file_size * sizeof(T)); #else diff --git a/COUNT/host/app.c b/COUNT/host/app.c index 7708f6d..9ea6bea 100644 --- a/COUNT/host/app.c +++ b/COUNT/host/app.c @@ -76,17 +76,17 @@ int main(int argc, char **argv) { // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - timer.time[0] = 0; // alloc + timer.time[TMR_ALLOC] = 0; // alloc #endif #if !WITH_LOAD_OVERHEAD DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + timer.time[TMR_LOAD] = 0; // load #endif #if !WITH_FREE_OVERHEAD - timer.time[6] = 0; // free + timer.time[TMR_FREE] = 0; // free #endif #if ENERGY @@ -122,20 +122,20 @@ int main(int argc, char **argv) { #if WITH_ALLOC_OVERHEAD if(rep >= p.n_warmup) { - start(&timer, 0, 0); + start(&timer, TMR_ALLOC, 0); } DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); if(rep >= p.n_warmup) { - stop(&timer, 0); + stop(&timer, TMR_ALLOC); } #endif #if WITH_LOAD_OVERHEAD if(rep >= p.n_warmup) { - start(&timer, 1, 0); + start(&timer, TMR_LOAD, 0); } DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); if(rep >= p.n_warmup) { - stop(&timer, 1); + stop(&timer, TMR_LOAD); } DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); @@ -161,14 +161,14 @@ int main(int argc, char **argv) { // Compute output on CPU (performance comparison and verification purposes) if(rep >= p.n_warmup) - start(&timer, 2, 0); + start(&timer, TMR_CPU, 0); total_count = count_host(A, input_size); if(rep >= p.n_warmup) - stop(&timer, 2); + stop(&timer, TMR_CPU); printf("Load input data\n"); if(rep >= p.n_warmup) - start(&timer, 3, 0); + start(&timer, TMR_WRITE, 0); // Input arguments const unsigned int input_size_dpu = input_size_dpu_round; unsigned int kernel = 0; @@ -184,19 +184,19 @@ int main(int argc, char **argv) { } DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); if(rep >= p.n_warmup) - stop(&timer, 3); + stop(&timer, TMR_WRITE); printf("Run program on DPU(s) \n"); // Run DPU kernel if(rep >= p.n_warmup) { - start(&timer, 4, 0); + start(&timer, TMR_KERNEL, 0); #if ENERGY DPU_ASSERT(dpu_probe_start(&probe)); #endif } DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); if(rep >= p.n_warmup) { - stop(&timer, 4); + stop(&timer, TMR_KERNEL); #if ENERGY DPU_ASSERT(dpu_probe_stop(&probe)); #endif @@ -220,7 +220,7 @@ int main(int argc, char **argv) { accum = 0; if(rep >= p.n_warmup) - start(&timer, 5, 0); + start(&timer, TMR_READ, 0); // PARALLEL RETRIEVE TRANSFER DPU_FOREACH(dpu_set, dpu, i) { @@ -240,20 +240,20 @@ int main(int argc, char **argv) { accum += results[i].t_count; } if(rep >= p.n_warmup) - stop(&timer, 5); + stop(&timer, TMR_READ); i = 0; #if WITH_ALLOC_OVERHEAD #if WITH_FREE_OVERHEAD if(rep >= p.n_warmup) { - start(&timer, 8, 0); + start(&timer, TMR_FREE, 0); } #endif DPU_ASSERT(dpu_free(dpu_set)); #if WITH_FREE_OVERHEAD if(rep >= p.n_warmup) { - stop(&timer, 8); + stop(&timer, TMR_FREE); } #endif #endif |