summary refs log tree commit diff
path: root/BS/baselines/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'BS/baselines/cpu')
-rw-r--r-- BS/baselines/cpu/Makefile 3
-rw-r--r-- BS/baselines/cpu/bs_omp.c 98
2 files changed, 97 insertions, 4 deletions
diff --git a/BS/baselines/cpu/Makefile b/BS/baselines/cpu/Makefile
index bed2561..b67602f 100644
--- a/BS/baselines/cpu/Makefile
+++ b/BS/baselines/cpu/Makefile
@@ -1,4 +1,5 @@
NUMA ?= 0
+NUMA_MEMCPY ?= 0
FLAGS =
ifeq (${NUMA}, 1)
@@ -9,7 +10,7 @@ endif
all: bs_omp
bs_omp: bs_omp.c
- gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} bs_omp.c -o bs_omp -fopenmp ${FLAGS}
+ gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} bs_omp.c -o bs_omp -fopenmp ${FLAGS}
bs_omp_O0: bs_omp.c
gcc bs_omp.c -o bs_omp_O0 -fopenmp
diff --git a/BS/baselines/cpu/bs_omp.c b/BS/baselines/cpu/bs_omp.c
index 1b9947c..874299b 100644
--- a/BS/baselines/cpu/bs_omp.c
+++ b/BS/baselines/cpu/bs_omp.c
@@ -21,6 +21,15 @@ int numa_node_in = -1;
int numa_node_cpu = -1;
#endif
+
+#if NUMA_MEMCPY
+struct bitmask* bitmask_cpu;
+int numa_node_cpu_memcpy = -1;
+int numa_node_local = -1;
+int numa_node_in_is_local = 0;
+#endif
+
+
#define DTYPE uint64_t
/*
* @brief creates a "test file" by filling a buffer with values
@@ -90,6 +99,10 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne
bitmask_in = numa_parse_nodestring(argv[3]);
numa_node_cpu = atoi(argv[4]);
#endif
+#if NUMA_MEMCPY
+ bitmask_cpu = numa_parse_nodestring(argv[5]);
+ numa_node_cpu_memcpy = atoi(argv[6]);
+#endif
printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys);
@@ -106,10 +119,17 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne
#endif
#if NUMA
+#if NUMA_MEMCPY
+ if (bitmask_cpu) {
+ numa_set_membind(bitmask_cpu);
+ numa_free_nodemask(bitmask_cpu);
+ }
+#else
struct bitmask *bitmask_all = numa_allocate_nodemask();
numa_bitmask_setall(bitmask_all);
numa_set_membind(bitmask_all);
numa_free_nodemask(bitmask_all);
+#endif // NUMA_MEMCPY
#endif
DTYPE result_host = -1;
@@ -137,10 +157,71 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne
}
#endif
+#if NUMA_MEMCPY
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+#endif
+
+#if NUMA_MEMCPY
+ DTYPE *input_local = input;
+ DTYPE *querys_local = querys;
+ start(&timer, 1, 0);
+ if (!numa_node_in_is_local) {
+ input_local = numa_alloc((input_size) * sizeof(DTYPE));
+ querys_local = numa_alloc((n_querys) * sizeof(DTYPE));
+ }
+ stop(&timer, 1);
+ if (!numa_node_in_is_local) {
+ if (numa_node_cpu_memcpy != -1) {
+ if (numa_run_on_node(numa_node_cpu_memcpy) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
+ start(&timer, 2, 0);
+ if (!numa_node_in_is_local) {
+ memcpy(input_local, input, input_size * sizeof(DTYPE));
+ memcpy(querys_local, querys, n_querys * sizeof(DTYPE));
+ } else {
+ input_local = input;
+ querys_local = querys;
+ }
+ stop(&timer, 2);
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+ mp_pages[0] = input_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(input_local)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ }
+ else {
+ numa_node_local = mp_status[0];
+ }
+#endif
+
start(&timer, 0, 0);
+#if NUMA_MEMCPY
+ result_host = binarySearch(input_local, input_size - 1, querys_local, n_querys);
+#else
result_host = binarySearch(input, input_size - 1, querys, n_querys);
+#endif
stop(&timer, 0);
+#if NUMA_MEMCPY
+ start(&timer, 3, 0);
+ if (!numa_node_in_is_local) {
+ numa_free(input_local, input_size * sizeof(DTYPE));
+ numa_free(querys_local, n_querys * sizeof(DTYPE));
+ }
+ stop(&timer, 3);
+#endif
+
unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
@@ -148,6 +229,17 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne
int status = (result_host);
if (status) {
+#if NUMA_MEMCPY
+ printf("[::] BS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%lu"
+ " numa_node_in=%d numa_node_local=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d"
+ " | throughput_MBps=%f throughput_MOpps=%f"
+ " latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ nr_threads, "uint64_t", input_size,
+ numa_node_in, numa_node_local, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu),
+ n_querys * sizeof(DTYPE) / timer.time[0], n_querys / timer.time[0],
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+#else
printf("[::] BS-CPU | n_threads=%d e_type=%s n_elements=%lu"
#if NUMA
" numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d"
@@ -158,9 +250,9 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne
numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
#endif
n_querys * sizeof(DTYPE) / timer.time[0]);
- printf(" throughput_MOpps=%f",
- n_querys / timer.time[0]);
- printall(&timer, 0);
+ printf(" throughput_MOpps=%f latency_us=%f\n",
+ n_querys / timer.time[0], timer.time[0]);
+#endif
} else {
printf("[ERROR]\n");
}