diff options

| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | COUNT/baselines/cpu/Makefile | 27 |
| -rw-r--r-- | COUNT/baselines/cpu/app_baseline.c | 22 |
| -rw-r--r-- | COUNT/host/app.c | 34 |

3 files changed, 60 insertions, 23 deletions
diff --git a/COUNT/baselines/cpu/Makefile b/COUNT/baselines/cpu/Makefile index 4608944..ede0498 100644 --- a/COUNT/baselines/cpu/Makefile +++ b/COUNT/baselines/cpu/Makefile @@ -1,8 +1,23 @@ -NUMA ?= 0 -FLAGS = +benchmark ?= 1 +debug ?= 0 +native ?= 1 +nop_sync ?= 0 +numa ?= 0 +numa_memcpy ?= 0 -ifeq (${NUMA}, 1) -	FLAGS += -lnuma +CFLAGS = +LDFLAGS = + +ifeq (${debug}, 1) +	CFLAGS += -g +endif + +ifeq (${native}, 1) +	CFLAGS += -march=native +endif + +ifeq (${numa}, 1) +	LDFLAGS += -lnuma  endif  .PHONY: all @@ -11,7 +26,7 @@ all: count  TYPE ?= uint64_t  count: app_baseline.c -	gcc -Wall -Wextra -pedantic -march=native -O2 -o count -fopenmp -DT=${TYPE} -DNUMA=${NUMA} app_baseline.c ${FLAGS} +	gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -o count -DT=${TYPE} -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} app_baseline.c -fopenmp ${LDFLAGS}  .PHONY: run  run: count @@ -19,4 +34,4 @@ run: count  .PHONY: clean  clean: -	rm -f count count_O0 count_O2 +	rm -f count diff --git a/COUNT/baselines/cpu/app_baseline.c b/COUNT/baselines/cpu/app_baseline.c index d52257a..4e96276 100644 --- a/COUNT/baselines/cpu/app_baseline.c +++ b/COUNT/baselines/cpu/app_baseline.c @@ -12,7 +12,13 @@  #include <assert.h>  #include <stdint.h>  #include <omp.h> + +#if WITH_BENCHMARK  #include "../../support/timer.h" +#else +#define start(...) +#define stop(...) +#endif  #if NUMA  #include <numaif.h> @@ -186,13 +192,22 @@ int main(int argc, char **argv) {      // Create an input file with arbitrary data.      
create_test_file(file_size); +#if WITH_BENCHMARK      Timer timer; +#endif + +#if NOP_SYNC +    for(int rep = 0; rep < 200000; rep++) { +        asm volatile("nop" ::); +    } +#endif      for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {          start(&timer, 0, 0);          total_count = count_host(file_size, p.n_threads);          stop(&timer, 0); +#if WITH_BENCHMARK          unsigned int nr_threads = 0;  #pragma omp parallel  #pragma omp atomic @@ -213,8 +228,15 @@ int main(int argc, char **argv) {                  file_size / timer.time[0]);              printall(&timer, 0);          } +#endif // WITH_BENCHMARK      } +#if NOP_SYNC +    for(int rep = 0; rep < 200000; rep++) { +        asm volatile("nop" ::); +    } +#endif +  #if NUMA      numa_free(A, file_size * sizeof(T));  #else diff --git a/COUNT/host/app.c b/COUNT/host/app.c index 7708f6d..9ea6bea 100644 --- a/COUNT/host/app.c +++ b/COUNT/host/app.c @@ -76,17 +76,17 @@ int main(int argc, char **argv) {      // Allocate DPUs and load binary  #if !WITH_ALLOC_OVERHEAD      DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); -    timer.time[0] = 0; // alloc +    timer.time[TMR_ALLOC] = 0; // alloc  #endif  #if !WITH_LOAD_OVERHEAD      DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));      DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));      DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));      assert(nr_of_dpus == NR_DPUS); -    timer.time[1] = 0; // load +    timer.time[TMR_LOAD] = 0; // load  #endif  #if !WITH_FREE_OVERHEAD -    timer.time[6] = 0; // free +    timer.time[TMR_FREE] = 0; // free  #endif  #if ENERGY @@ -122,20 +122,20 @@ int main(int argc, char **argv) {  #if WITH_ALLOC_OVERHEAD          if(rep >= p.n_warmup) { -            start(&timer, 0, 0); +            start(&timer, TMR_ALLOC, 0);          }          DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));          if(rep >= p.n_warmup) { -            stop(&timer, 0); +            stop(&timer, TMR_ALLOC);          }  #endif  #if 
WITH_LOAD_OVERHEAD          if(rep >= p.n_warmup) { -            start(&timer, 1, 0); +            start(&timer, TMR_LOAD, 0);          }          DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));          if(rep >= p.n_warmup) { -            stop(&timer, 1); +            stop(&timer, TMR_LOAD);          }          DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));          DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); @@ -161,14 +161,14 @@ int main(int argc, char **argv) {          // Compute output on CPU (performance comparison and verification purposes)          if(rep >= p.n_warmup) -            start(&timer, 2, 0); +            start(&timer, TMR_CPU, 0);          total_count = count_host(A, input_size);          if(rep >= p.n_warmup) -            stop(&timer, 2); +            stop(&timer, TMR_CPU);          printf("Load input data\n");          if(rep >= p.n_warmup) -            start(&timer, 3, 0); +            start(&timer, TMR_WRITE, 0);          // Input arguments          const unsigned int input_size_dpu = input_size_dpu_round;          unsigned int kernel = 0; @@ -184,19 +184,19 @@ int main(int argc, char **argv) {          }          DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu * sizeof(T), DPU_XFER_DEFAULT));          if(rep >= p.n_warmup) -            stop(&timer, 3); +            stop(&timer, TMR_WRITE);          printf("Run program on DPU(s) \n");          // Run DPU kernel          if(rep >= p.n_warmup) { -            start(&timer, 4, 0); +            start(&timer, TMR_KERNEL, 0);              #if ENERGY              DPU_ASSERT(dpu_probe_start(&probe));              #endif          }          DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));          if(rep >= p.n_warmup) { -            stop(&timer, 4); +            stop(&timer, TMR_KERNEL);              #if ENERGY              DPU_ASSERT(dpu_probe_stop(&probe));              #endif @@ -220,7 +220,7 @@ int main(int argc, char **argv) {     
     accum = 0;          if(rep >= p.n_warmup) -		    start(&timer, 5, 0); +		    start(&timer, TMR_READ, 0);          // PARALLEL RETRIEVE TRANSFER          DPU_FOREACH(dpu_set, dpu, i) { @@ -240,20 +240,20 @@ int main(int argc, char **argv) {              accum += results[i].t_count;          }          if(rep >= p.n_warmup) -            stop(&timer, 5); +            stop(&timer, TMR_READ);          i = 0;  #if WITH_ALLOC_OVERHEAD  #if WITH_FREE_OVERHEAD          if(rep >= p.n_warmup) { -            start(&timer, 8, 0); +            start(&timer, TMR_FREE, 0);          }  #endif          DPU_ASSERT(dpu_free(dpu_set));  #if WITH_FREE_OVERHEAD          if(rep >= p.n_warmup) { -            stop(&timer, 8); +            stop(&timer, TMR_FREE);          }  #endif  #endif  | 
