summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-17 16:13:24 +0200
committer Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-17 16:13:24 +0200
commit 6e923c56e0881ab1faa7eaec078cd25782786fd6 (patch)
tree 0196190ad41e91fc56a1a43c290ab4052ce35102
parent db1cd145998ea88037f7dc427e06d3f82c0b4930 (diff)
TRNS baseline: Add NUMA_MEMCPY support
-rw-r--r-- TRNS/baselines/cpu/Makefile 3
-rw-r--r-- TRNS/baselines/cpu/main.cpp 174
2 files changed, 121 insertions, 56 deletions
diff --git a/TRNS/baselines/cpu/Makefile b/TRNS/baselines/cpu/Makefile
index a5a1635..236f7bb 100644
--- a/TRNS/baselines/cpu/Makefile
+++ b/TRNS/baselines/cpu/Makefile
@@ -33,6 +33,7 @@
#
NUMA ?= 0
+NUMA_MEMCPY ?= 0
FLAGS =
ifeq (${NUMA}, 1)
@@ -40,7 +41,7 @@ ifeq (${NUMA}, 1)
endif
CXX=g++
-CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${NUMA}
+CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY}
LIB=-L/usr/lib/ -lm -pthread
diff --git a/TRNS/baselines/cpu/main.cpp b/TRNS/baselines/cpu/main.cpp
index 09465e4..837c75d 100644
--- a/TRNS/baselines/cpu/main.cpp
+++ b/TRNS/baselines/cpu/main.cpp
@@ -55,10 +55,14 @@ void* mp_pages[1];
int mp_status[1];
int mp_nodes[1];
int numa_node_in = -1;
-int numa_node_out = -1;
int numa_node_cpu = -1;
#endif
+#if NUMA_MEMCPY
+int numa_node_local = -1;
+int numa_node_in_is_local = 0;
+#endif
+
// Params ---------------------------------------------------------------------
struct Params {
@@ -72,9 +76,11 @@ struct Params {
int n;
#if NUMA
struct bitmask* bitmask_in;
- struct bitmask* bitmask_out;
int numa_node_cpu;
#endif
+#if NUMA_MEMCPY
+ struct bitmask* bitmask_cpu;
+#endif
Params(int argc, char **argv) {
n_threads = 4;
@@ -86,11 +92,13 @@ struct Params {
n = 8;
#if NUMA
bitmask_in = NULL;
- bitmask_out = NULL;
numa_node_cpu = -1;
#endif
+#if NUMA_MEMCPY
+ bitmask_cpu = NULL;
+#endif
int opt;
- while((opt = getopt(argc, argv, "ht:w:r:m:n:o:p:a:b:c:")) >= 0) {
+ while((opt = getopt(argc, argv, "ht:w:r:m:n:o:p:a:c:C:")) >= 0) {
switch(opt) {
case 'h':
usage();
@@ -105,9 +113,11 @@ struct Params {
case 'p': N_ = atoi(optarg); break;
#if NUMA
case 'a': bitmask_in = numa_parse_nodestring(optarg); break;
- case 'b': bitmask_out = numa_parse_nodestring(optarg); break;
case 'c': numa_node_cpu = atoi(optarg); break;
-#endif
+#if NUMA_MEMCPY
+ case 'C': bitmask_cpu = numa_parse_nodestring(optarg); break;
+#endif // NUMA_MEMCPY
+#endif // NUMA
default:
fprintf(stderr, "\nUnrecognized option!\n");
usage();
@@ -154,7 +164,6 @@ int main(int argc, char **argv) {
Timer timer;
// Allocate
- timer.start("Allocation");
int M_ = p.M_;
int m = p.m;
int N_ = p.N_;
@@ -162,54 +171,59 @@ int main(int argc, char **argv) {
int in_size = M_ * m * N_ * n;
int finished_size = M_ * m * N_;
+#if !NUMA_MEMCPY
+ T *h_in_backup = (T *)malloc(in_size * sizeof(T));
+ ALLOC_ERR(h_in_backup);
+#endif
+
#if NUMA
if (p.bitmask_in) {
numa_set_membind(p.bitmask_in);
numa_free_nodemask(p.bitmask_in);
}
- T * h_in_out = (T *)numa_alloc(in_size * sizeof(T));
- std::atomic_int *h_finished =
- (std::atomic_int *)numa_alloc(sizeof(std::atomic_int) * finished_size);
+ T *h_in_out = (T *)numa_alloc(in_size * sizeof(T));
#else
- T * h_in_out = (T *)malloc(in_size * sizeof(T));
- std::atomic_int *h_finished =
- (std::atomic_int *)malloc(sizeof(std::atomic_int) * finished_size);
+ T *h_in_out = (T *)malloc(in_size * sizeof(T));
#endif
-#if NUMA
- if (p.bitmask_out) {
- numa_set_membind(p.bitmask_out);
- numa_free_nodemask(p.bitmask_out);
- }
- std::atomic_int *h_head = (std::atomic_int *)numa_alloc(N_ * sizeof(std::atomic_int));
-#else
- std::atomic_int *h_head = (std::atomic_int *)malloc(N_ * sizeof(std::atomic_int));
-#endif
- ALLOC_ERR(h_in_out, h_finished, h_head);
+ T *h_local = h_in_out;
#if NUMA
+#if NUMA_MEMCPY
+ if (p.bitmask_cpu) {
+ numa_set_membind(p.bitmask_cpu);
+ numa_free_nodemask(p.bitmask_cpu);
+ }
+#endif // NUMA_MEMCPY
+ std::atomic_int *h_finished =
+ (std::atomic_int *)numa_alloc(sizeof(std::atomic_int) * finished_size);
+ std::atomic_int *h_head = (std::atomic_int *)numa_alloc(N_ * sizeof(std::atomic_int));
+#if !NUMA_MEMCPY
struct bitmask *bitmask_all = numa_allocate_nodemask();
numa_bitmask_setall(bitmask_all);
numa_set_membind(bitmask_all);
numa_free_nodemask(bitmask_all);
-#endif
+#endif // !NUMA_MEMCPY
+#else // NUMA
+ std::atomic_int *h_finished =
+ (std::atomic_int *)malloc(sizeof(std::atomic_int) * finished_size);
+ std::atomic_int *h_head = (std::atomic_int *)malloc(N_ * sizeof(std::atomic_int));
- T *h_in_backup = (T *)malloc(in_size * sizeof(T));
- ALLOC_ERR(h_in_backup);
- timer.stop("Allocation");
- //timer.print("Allocation", 1);
+#endif // NUMA
+
+ ALLOC_ERR(h_in_out, h_finished, h_head);
// Initialize
- timer.start("Initialization");
read_input(h_in_out, p);
memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size);
for(int i = 0; i < N_; i++)
h_head[i].store(0);
- timer.stop("Initialization");
- //timer.print("Initialization", 1);
+
+#if ! NUMA_MEMCPY
memcpy(h_in_backup, h_in_out, in_size * sizeof(T)); // Backup for reuse across iterations
+#endif
#if NUMA
mp_pages[0] = h_in_out;
@@ -223,17 +237,6 @@ int main(int argc, char **argv) {
numa_node_in = mp_status[0];
}
- mp_pages[0] = h_finished;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(C)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_out = mp_status[0];
- }
-
numa_node_cpu = p.numa_node_cpu;
if (numa_node_cpu != -1) {
if (numa_run_on_node(numa_node_cpu) == -1) {
@@ -244,12 +247,47 @@ int main(int argc, char **argv) {
#endif
+#if NUMA_MEMCPY
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+#endif
+
// Loop over main kernel
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+#if NUMA_MEMCPY
+ if(rep >= p.n_warmup)
+ timer.start("local alloc");
+ if (!numa_node_in_is_local) {
+ h_local = (T *)numa_alloc(in_size * sizeof(T));
+ }
+ if(rep >= p.n_warmup)
+ timer.stop("local alloc");
+
+ if(rep >= p.n_warmup)
+ timer.start("memcpy");
+ if (!numa_node_in_is_local) {
+ memcpy(h_local, h_in_out, in_size * sizeof(T));
+ }
+ if(rep >= p.n_warmup)
+ timer.stop("memcpy");
+
+ mp_pages[0] = h_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A_local)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ }
+ else {
+ numa_node_local = mp_status[0];
+ }
+#else
+ h_local = h_in_out;
+ memcpy(h_local, h_in_backup, in_size * sizeof(T));
+#endif
+
// Reset
- memcpy(h_in_out, h_in_backup, in_size * sizeof(T));
memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size);
for(int i = 0; i < N_; i++)
h_head[i].store(0);
@@ -258,7 +296,7 @@ int main(int argc, char **argv) {
if(rep >= p.n_warmup)
timer.start("Step 1");
// Launch CPU threads
- std::thread main_thread_1(run_cpu_threads_100, h_in_out, h_finished, h_head, M_ * m, N_, n, p.n_threads); //M_ * m * N_);
+ std::thread main_thread_1(run_cpu_threads_100, h_local, h_finished, h_head, M_ * m, N_, n, p.n_threads); //M_ * m * N_);
main_thread_1.join();
// end timer
if(rep >= p.n_warmup)
@@ -271,7 +309,7 @@ int main(int argc, char **argv) {
if(rep >= p.n_warmup)
timer.start("Step 2");
// Launch CPU threads
- std::thread main_thread_2(run_cpu_threads_010, h_in_out, h_head, m, n, M_ * N_, p.n_threads);
+ std::thread main_thread_2(run_cpu_threads_010, h_local, h_head, m, n, M_ * N_, p.n_threads);
main_thread_2.join();
// end timer
if(rep >= p.n_warmup)
@@ -286,28 +324,56 @@ int main(int argc, char **argv) {
timer.start("Step 3");
// Launch CPU threads
for(int i = 0; i < N_; i++){
- std::thread main_thread_3(run_cpu_threads_100, h_in_out + i * M_ * n * m, h_finished + i * M_ * n, h_head + i, M_, n, m, p.n_threads); //M_ * n);
+ std::thread main_thread_3(run_cpu_threads_100, h_local + i * M_ * n * m, h_finished + i * M_ * n, h_head + i, M_, n, m, p.n_threads); //M_ * n);
main_thread_3.join();
}
// end timer
if(rep >= p.n_warmup)
timer.stop("Step 3");
+#if NUMA_MEMCPY
+ if(rep >= p.n_warmup)
+ timer.start("free");
+ if (!numa_node_in_is_local) {
+ numa_free(h_local, in_size * sizeof(T));
+ }
+ if(rep >= p.n_warmup)
+ timer.stop("free");
+#endif
+
if (rep >= p.n_warmup) {
+#if NUMA_MEMCPY
+ printf("[::] TRNS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d"
+ " numa_node_inout=%d numa_node_cpu=%d numa_distance_inout_cpu=%d"
+ " | throughput_MBps=%f",
+ p.n_threads, XSTR(T), in_size,
+ numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
+ in_size * sizeof(T) / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3")));
+ printf(" throughput_MOpps=%f",
+ in_size / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3")));
+ double latency_kernel = timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3");
+ printf(" latency_step1_us=%f latency_step2_us=%f latency_step3_us=%f",
+ timer.get("Step 1"), timer.get("Step 2"), timer.get("Step 3"));
+ printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ latency_kernel, timer.get("local alloc"), timer.get("memcpy"), timer.get("free"),
+ latency_kernel + timer.get("local alloc") + timer.get("memcpy") + timer.get("free"));
+#else
printf("[::] TRNS-CPU | n_threads=%d e_type=%s n_elements=%d"
#if NUMA
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " numa_node_inout=%d numa_node_cpu=%d numa_distance_inout_cpu=%d"
#endif
" | throughput_MBps=%f",
p.n_threads, XSTR(T), in_size,
#if NUMA
- numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
#endif
in_size * sizeof(T) / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3")));
printf(" throughput_MOpps=%f",
in_size / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3")));
- printf(" timer1_us=%f timer2_us=%f timer3_us=%f\n",
- timer.get("Step 1"), timer.get("Step 2"), timer.get("Step 3"));
+ printf(" latency_step1_us=%f latency_step2_us=%f latency_step3_us=%f latency_total_us=%f\n",
+ timer.get("Step 1"), timer.get("Step 2"), timer.get("Step 3"),
+ timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3"));
+#endif // NUMA_MEMCPY
}
}
//timer.print("Step 1", p.n_reps);
@@ -315,24 +381,22 @@ int main(int argc, char **argv) {
//timer.print("Step 3", p.n_reps);
// Verify answer
- //verify(h_in_out, h_in_backup, M_ * m, N_ * n, 1);
+ //verify(h_local, h_in_backup, M_ * m, N_ * n, 1);
// Free memory
- timer.start("Deallocation");
#if NUMA
numa_free(h_in_out, in_size * sizeof(T));
numa_free(h_finished, sizeof(std::atomic_int) * finished_size);
numa_free(h_head, N_ * sizeof(std::atomic_int));
+#if !NUMA_MEMCPY
numa_free(h_in_backup, in_size * sizeof(T));
+#endif
#else
free(h_in_out);
free(h_finished);
free(h_head);
free(h_in_backup);
#endif
- timer.stop("Deallocation");
- //timer.print("Deallocation", 1);
- printf("Test Passed\n");
return 0;
}