summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- COUNT/baselines/cpu/Makefile 27
-rw-r--r-- COUNT/baselines/cpu/app_baseline.c 22
-rw-r--r-- COUNT/host/app.c 34
3 files changed, 60 insertions, 23 deletions
diff --git a/COUNT/baselines/cpu/Makefile b/COUNT/baselines/cpu/Makefile
index 4608944..ede0498 100644
--- a/COUNT/baselines/cpu/Makefile
+++ b/COUNT/baselines/cpu/Makefile
@@ -1,8 +1,23 @@
-NUMA ?= 0
-FLAGS =
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+numa_memcpy ?= 0
-ifeq (${NUMA}, 1)
- FLAGS += -lnuma
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+ CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+ LDFLAGS += -lnuma
endif
.PHONY: all
@@ -11,7 +26,7 @@ all: count
TYPE ?= uint64_t
count: app_baseline.c
- gcc -Wall -Wextra -pedantic -march=native -O2 -o count -fopenmp -DT=${TYPE} -DNUMA=${NUMA} app_baseline.c ${FLAGS}
+ gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -o count -DT=${TYPE} -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} app_baseline.c -fopenmp ${LDFLAGS}
.PHONY: run
run: count
@@ -19,4 +34,4 @@ run: count
.PHONY: clean
clean:
- rm -f count count_O0 count_O2
+ rm -f count
diff --git a/COUNT/baselines/cpu/app_baseline.c b/COUNT/baselines/cpu/app_baseline.c
index d52257a..4e96276 100644
--- a/COUNT/baselines/cpu/app_baseline.c
+++ b/COUNT/baselines/cpu/app_baseline.c
@@ -12,7 +12,13 @@
#include <assert.h>
#include <stdint.h>
#include <omp.h>
+
+#if WITH_BENCHMARK
#include "../../support/timer.h"
+#else
+#define start(...)
+#define stop(...)
+#endif
#if NUMA
#include <numaif.h>
@@ -186,13 +192,22 @@ int main(int argc, char **argv) {
// Create an input file with arbitrary data.
create_test_file(file_size);
+#if WITH_BENCHMARK
Timer timer;
+#endif
+
+#if NOP_SYNC
+ for(int rep = 0; rep < 200000; rep++) {
+ asm volatile("nop" ::);
+ }
+#endif
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
start(&timer, 0, 0);
total_count = count_host(file_size, p.n_threads);
stop(&timer, 0);
+#if WITH_BENCHMARK
unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
@@ -213,8 +228,15 @@ int main(int argc, char **argv) {
file_size / timer.time[0]);
printall(&timer, 0);
}
+#endif // WITH_BENCHMARK
}
+#if NOP_SYNC
+ for(int rep = 0; rep < 200000; rep++) {
+ asm volatile("nop" ::);
+ }
+#endif
+
#if NUMA
numa_free(A, file_size * sizeof(T));
#else
diff --git a/COUNT/host/app.c b/COUNT/host/app.c
index 7708f6d..9ea6bea 100644
--- a/COUNT/host/app.c
+++ b/COUNT/host/app.c
@@ -76,17 +76,17 @@ int main(int argc, char **argv) {
// Allocate DPUs and load binary
#if !WITH_ALLOC_OVERHEAD
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- timer.time[0] = 0; // alloc
+ timer.time[TMR_ALLOC] = 0; // alloc
#endif
#if !WITH_LOAD_OVERHEAD
DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
assert(nr_of_dpus == NR_DPUS);
- timer.time[1] = 0; // load
+ timer.time[TMR_LOAD] = 0; // load
#endif
#if !WITH_FREE_OVERHEAD
- timer.time[6] = 0; // free
+ timer.time[TMR_FREE] = 0; // free
#endif
#if ENERGY
@@ -122,20 +122,20 @@ int main(int argc, char **argv) {
#if WITH_ALLOC_OVERHEAD
if(rep >= p.n_warmup) {
- start(&timer, 0, 0);
+ start(&timer, TMR_ALLOC, 0);
}
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
if(rep >= p.n_warmup) {
- stop(&timer, 0);
+ stop(&timer, TMR_ALLOC);
}
#endif
#if WITH_LOAD_OVERHEAD
if(rep >= p.n_warmup) {
- start(&timer, 1, 0);
+ start(&timer, TMR_LOAD, 0);
}
DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
if(rep >= p.n_warmup) {
- stop(&timer, 1);
+ stop(&timer, TMR_LOAD);
}
DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
@@ -161,14 +161,14 @@ int main(int argc, char **argv) {
// Compute output on CPU (performance comparison and verification purposes)
if(rep >= p.n_warmup)
- start(&timer, 2, 0);
+ start(&timer, TMR_CPU, 0);
total_count = count_host(A, input_size);
if(rep >= p.n_warmup)
- stop(&timer, 2);
+ stop(&timer, TMR_CPU);
printf("Load input data\n");
if(rep >= p.n_warmup)
- start(&timer, 3, 0);
+ start(&timer, TMR_WRITE, 0);
// Input arguments
const unsigned int input_size_dpu = input_size_dpu_round;
unsigned int kernel = 0;
@@ -184,19 +184,19 @@ int main(int argc, char **argv) {
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu * sizeof(T), DPU_XFER_DEFAULT));
if(rep >= p.n_warmup)
- stop(&timer, 3);
+ stop(&timer, TMR_WRITE);
printf("Run program on DPU(s) \n");
// Run DPU kernel
if(rep >= p.n_warmup) {
- start(&timer, 4, 0);
+ start(&timer, TMR_KERNEL, 0);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
#endif
}
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
if(rep >= p.n_warmup) {
- stop(&timer, 4);
+ stop(&timer, TMR_KERNEL);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
#endif
@@ -220,7 +220,7 @@ int main(int argc, char **argv) {
accum = 0;
if(rep >= p.n_warmup)
- start(&timer, 5, 0);
+ start(&timer, TMR_READ, 0);
// PARALLEL RETRIEVE TRANSFER
DPU_FOREACH(dpu_set, dpu, i) {
@@ -240,20 +240,20 @@ int main(int argc, char **argv) {
accum += results[i].t_count;
}
if(rep >= p.n_warmup)
- stop(&timer, 5);
+ stop(&timer, TMR_READ);
i = 0;
#if WITH_ALLOC_OVERHEAD
#if WITH_FREE_OVERHEAD
if(rep >= p.n_warmup) {
- start(&timer, 8, 0);
+ start(&timer, TMR_FREE, 0);
}
#endif
DPU_ASSERT(dpu_free(dpu_set));
#if WITH_FREE_OVERHEAD
if(rep >= p.n_warmup) {
- stop(&timer, 8);
+ stop(&timer, TMR_FREE);
}
#endif
#endif