TRNS baseline: Add NUMA_MEMCPY support

author: Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-17 16:13:24 +0200
committer: Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-17 16:13:24 +0200
commit: 6e923c56e0881ab1faa7eaec078cd25782786fd6 (patch)
tree: 0196190ad41e91fc56a1a43c290ab4052ce35102
parent: db1cd145998ea88037f7dc427e06d3f82c0b4930 (diff)
2 files changed, 121 insertions, 56 deletions
diff --git a/TRNS/baselines/cpu/Makefile b/TRNS/baselines/cpu/Makefile
index a5a1635..236f7bb 100644
--- a/TRNS/baselines/cpu/Makefile
+++ b/TRNS/baselines/cpu/Makefile
@@ -33,6 +33,7 @@
 # 
 
 NUMA ?= 0
+NUMA_MEMCPY ?= 0
 FLAGS =
 
 ifeq (${NUMA}, 1)
@@ -40,7 +41,7 @@ ifeq (${NUMA}, 1)
 endif
 
 CXX=g++
-CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${NUMA}
+CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY}
 
 LIB=-L/usr/lib/ -lm -pthread
 
diff --git a/TRNS/baselines/cpu/main.cpp b/TRNS/baselines/cpu/main.cpp
index 09465e4..837c75d 100644
--- a/TRNS/baselines/cpu/main.cpp
+++ b/TRNS/baselines/cpu/main.cpp
@@ -55,10 +55,14 @@ void* mp_pages[1];
 int mp_status[1];
 int mp_nodes[1];
 int numa_node_in = -1;
-int numa_node_out = -1;
 int numa_node_cpu = -1;
 #endif
 
+#if NUMA_MEMCPY
+int numa_node_local = -1; // NUMA node reported by move_pages() for the page at the start of h_local; stays -1 until a query succeeds
+int numa_node_in_is_local = 0; // non-zero when the input buffer is treated as local to the CPU node, in which case no per-repetition copy is made
+#endif
+
 
 // Params ---------------------------------------------------------------------
 struct Params {
@@ -72,9 +76,11 @@ struct Params {
     int n;
 #if NUMA
     struct bitmask* bitmask_in;
-    struct bitmask* bitmask_out;
     int numa_node_cpu;
 #endif
+#if NUMA_MEMCPY
+    struct bitmask* bitmask_cpu;
+#endif
 
     Params(int argc, char **argv) {
         n_threads     = 4;
@@ -86,11 +92,13 @@ struct Params {
         n             = 8;
 #if NUMA
         bitmask_in    = NULL;
-        bitmask_out   = NULL;
         numa_node_cpu = -1;
 #endif
+#if NUMA_MEMCPY
+        bitmask_cpu    = NULL;
+#endif
         int opt;
-        while((opt = getopt(argc, argv, "ht:w:r:m:n:o:p:a:b:c:")) >= 0) {
+        while((opt = getopt(argc, argv, "ht:w:r:m:n:o:p:a:c:C:")) >= 0) {
             switch(opt) {
             case 'h':
                 usage();
@@ -105,9 +113,11 @@ struct Params {
             case 'p': N_            = atoi(optarg); break;
 #if NUMA
             case 'a': bitmask_in    = numa_parse_nodestring(optarg); break;
-            case 'b': bitmask_out   = numa_parse_nodestring(optarg); break;
             case 'c': numa_node_cpu = atoi(optarg); break;
-#endif
+#if NUMA_MEMCPY
+            case 'C': bitmask_cpu   = numa_parse_nodestring(optarg); break;
+#endif // NUMA_MEMCPY
+#endif // NUMA
             default:
                 fprintf(stderr, "\nUnrecognized option!\n");
                 usage();
@@ -154,7 +164,6 @@ int main(int argc, char **argv) {
     Timer        timer;
 
     // Allocate
-    timer.start("Allocation");
     int M_       = p.M_;
     int m       = p.m;
     int N_       = p.N_;
@@ -162,54 +171,59 @@ int main(int argc, char **argv) {
     int in_size       = M_ * m * N_ * n;
     int finished_size = M_ * m * N_;
 
+#if !NUMA_MEMCPY
+    T *h_in_backup = (T *)malloc(in_size * sizeof(T)); // pristine copy of the input, copied back into the working buffer before every repetition; NOTE(review): allocated with malloc() even when NUMA=1, yet the cleanup path for NUMA=1 releases it with numa_free() — allocator mismatch to resolve
+    ALLOC_ERR(h_in_backup);
+#endif
+
 #if NUMA
     if (p.bitmask_in) {
         numa_set_membind(p.bitmask_in);
         numa_free_nodemask(p.bitmask_in);
     }
-    T *              h_in_out = (T *)numa_alloc(in_size * sizeof(T));
-    std::atomic_int *h_finished =
-        (std::atomic_int *)numa_alloc(sizeof(std::atomic_int) * finished_size);
+    T *h_in_out = (T *)numa_alloc(in_size * sizeof(T));
 #else
-    T *              h_in_out = (T *)malloc(in_size * sizeof(T));
-    std::atomic_int *h_finished =
-        (std::atomic_int *)malloc(sizeof(std::atomic_int) * finished_size);
+    T *h_in_out = (T *)malloc(in_size * sizeof(T));
 #endif
 
-#if NUMA
-    if (p.bitmask_out) {
-        numa_set_membind(p.bitmask_out);
-        numa_free_nodemask(p.bitmask_out);
-    }
-    std::atomic_int *h_head = (std::atomic_int *)numa_alloc(N_ * sizeof(std::atomic_int));
-#else
-    std::atomic_int *h_head = (std::atomic_int *)malloc(N_ * sizeof(std::atomic_int));
-#endif
 
-    ALLOC_ERR(h_in_out, h_finished, h_head);
+    T *h_local = h_in_out;
 
 
 #if NUMA
+#if NUMA_MEMCPY
+    if (p.bitmask_cpu) { // node set given via -C: bind the allocations that follow (h_finished, h_head and the per-repetition h_local copy) to it
+        numa_set_membind(p.bitmask_cpu);
+        numa_free_nodemask(p.bitmask_cpu); // the mask is no longer needed once the policy is installed
+    }
+#endif // NUMA_MEMCPY
+    std::atomic_int *h_finished =
+        (std::atomic_int *)numa_alloc(sizeof(std::atomic_int) * finished_size);
+    std::atomic_int *h_head = (std::atomic_int *)numa_alloc(N_ * sizeof(std::atomic_int));
+#if !NUMA_MEMCPY
     struct bitmask *bitmask_all = numa_allocate_nodemask();
     numa_bitmask_setall(bitmask_all);
     numa_set_membind(bitmask_all);
     numa_free_nodemask(bitmask_all);
-#endif
+#endif // !NUMA_MEMCPY
+#else // NUMA
+    std::atomic_int *h_finished =
+        (std::atomic_int *)malloc(sizeof(std::atomic_int) * finished_size);
+    std::atomic_int *h_head = (std::atomic_int *)malloc(N_ * sizeof(std::atomic_int));
 
-    T *h_in_backup = (T *)malloc(in_size * sizeof(T));
-    ALLOC_ERR(h_in_backup);
-    timer.stop("Allocation");
-    //timer.print("Allocation", 1);
+#endif // NUMA
+
+    ALLOC_ERR(h_in_out, h_finished, h_head);
 
     // Initialize
-    timer.start("Initialization");
     read_input(h_in_out, p);
     memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size);
     for(int i = 0; i < N_; i++)
         h_head[i].store(0);
-    timer.stop("Initialization");
-    //timer.print("Initialization", 1);
+
+#if ! NUMA_MEMCPY
     memcpy(h_in_backup, h_in_out, in_size * sizeof(T)); // Backup for reuse across iterations
+#endif
 
 #if NUMA
     mp_pages[0] = h_in_out;
@@ -223,17 +237,6 @@ int main(int argc, char **argv) {
         numa_node_in = mp_status[0];
     }
 
-    mp_pages[0] = h_finished;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(C)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_out = mp_status[0];
-    }
-
     numa_node_cpu = p.numa_node_cpu;
     if (numa_node_cpu != -1) {
         if (numa_run_on_node(numa_node_cpu) == -1) {
@@ -244,12 +247,47 @@ int main(int argc, char **argv) {
 #endif
 
 
+#if NUMA_MEMCPY
+    numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1; // input counts as local when it sits on the CPU's node or on node (cpu + 8); NOTE(review): the +8 offset looks specific to one machine's node numbering (presumably a paired memory-only node) — confirm; also note both values default to -1, which compares equal
+#endif
+
 
     // Loop over main kernel
     for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 
+#if NUMA_MEMCPY // per-repetition staging: copy a remote input into a freshly allocated buffer and time each phase separately
+        if(rep >= p.n_warmup)
+            timer.start("local alloc");
+        if (!numa_node_in_is_local) { // remote input: allocate this repetition's working copy
+            h_local = (T *)numa_alloc(in_size * sizeof(T)); // NOTE(review): result is not checked for NULL before the memcpy below
+        }
+        if(rep >= p.n_warmup)
+            timer.stop("local alloc");
+
+        if(rep >= p.n_warmup)
+            timer.start("memcpy");
+        if (!numa_node_in_is_local) { // local input: h_local is left as is and nothing is copied; NOTE(review): no backup exists in this mode, so presumably later repetitions run on the previous repetition's output — confirm this is intended
+            memcpy(h_local, h_in_out, in_size * sizeof(T));
+        }
+        if(rep >= p.n_warmup)
+            timer.stop("memcpy");
+
+        mp_pages[0] = h_local; // with a NULL nodes argument move_pages() only reports where the page currently resides
+        if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+            perror("move_pages(A_local)");
+        }
+        else if (mp_status[0] < 0) {
+            printf("move_pages error: %d\n", mp_status[0]); // newline keeps the message from fusing with the "[::]" result line printed later
+        }
+        else {
+            numa_node_local = mp_status[0];
+        }
+#else
+        h_local = h_in_out; // no staging copy: the kernel works directly on the input buffer
+        memcpy(h_local, h_in_backup, in_size * sizeof(T)); // restore the pristine input before each repetition
+#endif
+
         // Reset
-        memcpy(h_in_out, h_in_backup, in_size * sizeof(T));
         memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size);
 	    for(int i = 0; i < N_; i++)
 	        h_head[i].store(0);
@@ -258,7 +296,7 @@ int main(int argc, char **argv) {
         if(rep >= p.n_warmup)
             timer.start("Step 1");
         // Launch CPU threads
-        std::thread main_thread_1(run_cpu_threads_100, h_in_out, h_finished, h_head, M_ * m, N_, n, p.n_threads); //M_ * m * N_);
+        std::thread main_thread_1(run_cpu_threads_100, h_local, h_finished, h_head, M_ * m, N_, n, p.n_threads); //M_ * m * N_);
         main_thread_1.join();
         // end timer
         if(rep >= p.n_warmup)
@@ -271,7 +309,7 @@ int main(int argc, char **argv) {
         if(rep >= p.n_warmup)
             timer.start("Step 2");
         // Launch CPU threads
-        std::thread main_thread_2(run_cpu_threads_010, h_in_out, h_head, m, n, M_ * N_, p.n_threads);
+        std::thread main_thread_2(run_cpu_threads_010, h_local, h_head, m, n, M_ * N_, p.n_threads);
         main_thread_2.join();
         // end timer
         if(rep >= p.n_warmup)
@@ -286,28 +324,56 @@ int main(int argc, char **argv) {
             timer.start("Step 3");
         // Launch CPU threads
         for(int i = 0; i < N_; i++){
-            std::thread main_thread_3(run_cpu_threads_100, h_in_out + i * M_ * n * m, h_finished + i * M_ * n, h_head + i, M_, n, m, p.n_threads); //M_ * n);
+            std::thread main_thread_3(run_cpu_threads_100, h_local + i * M_ * n * m, h_finished + i * M_ * n, h_head + i, M_, n, m, p.n_threads); //M_ * n);
             main_thread_3.join();
         }
         // end timer
         if(rep >= p.n_warmup)
             timer.stop("Step 3");
 
+#if NUMA_MEMCPY
+        if(rep >= p.n_warmup)
+            timer.start("free");
+        if (!numa_node_in_is_local) { // only the per-repetition copy is released here; a local input buffer is freed once after the loop
+            numa_free(h_local, in_size * sizeof(T)); // h_local dangles from here until it is reassigned at the top of the next repetition, so it must not be read after the final repetition
+        }
+        if(rep >= p.n_warmup)
+            timer.stop("free");
+#endif
+
         if (rep >= p.n_warmup) {
+#if NUMA_MEMCPY
+            // Kernel latency is the sum of the three transposition steps; every derived figure below reuses it.
+            double latency_kernel = timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3");
+            printf("[::] TRNS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d"
+                " numa_node_inout=%d numa_node_cpu=%d numa_distance_inout_cpu=%d"
+                " | throughput_MBps=%f",
+                p.n_threads, XSTR(T), in_size,
+                numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
+                in_size * sizeof(T) / latency_kernel);
+            printf(" throughput_MOpps=%f", in_size / latency_kernel);
+            printf(" latency_step1_us=%f latency_step2_us=%f latency_step3_us=%f",
+                timer.get("Step 1"), timer.get("Step 2"), timer.get("Step 3"));
+            printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+                latency_kernel, timer.get("local alloc"), timer.get("memcpy"), timer.get("free"),
+                latency_kernel + timer.get("local alloc") + timer.get("memcpy") + timer.get("free"));
+#else
             printf("[::] TRNS-CPU | n_threads=%d e_type=%s n_elements=%d"
 #if NUMA
-                " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+                " numa_node_inout=%d numa_node_cpu=%d numa_distance_inout_cpu=%d"
 #endif
                 " | throughput_MBps=%f",
                 p.n_threads, XSTR(T), in_size,
 #if NUMA
-                numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+                numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
 #endif
                 in_size * sizeof(T) / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3")));
             printf(" throughput_MOpps=%f",
                 in_size / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3")));
-            printf(" timer1_us=%f timer2_us=%f timer3_us=%f\n",
-                timer.get("Step 1"), timer.get("Step 2"), timer.get("Step 3"));
+            printf(" latency_step1_us=%f latency_step2_us=%f latency_step3_us=%f latency_total_us=%f\n",
+                timer.get("Step 1"), timer.get("Step 2"), timer.get("Step 3"),
+                timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3"));
+#endif // NUMA_MEMCPY
         }
     }
     //timer.print("Step 1", p.n_reps);
@@ -315,24 +381,22 @@ int main(int argc, char **argv) {
     //timer.print("Step 3", p.n_reps);
 
     // Verify answer
-    //verify(h_in_out, h_in_backup, M_ * m, N_ * n, 1);
+    //verify(h_local, h_in_backup, M_ * m, N_ * n, 1);
 
     // Free memory
-    timer.start("Deallocation");
 #if NUMA
     numa_free(h_in_out, in_size * sizeof(T));
     numa_free(h_finished, sizeof(std::atomic_int) * finished_size);
     numa_free(h_head, N_ * sizeof(std::atomic_int));
+#if !NUMA_MEMCPY
     numa_free(h_in_backup, in_size * sizeof(T));
+#endif
 #else
     free(h_in_out);
     free(h_finished);
     free(h_head);
     free(h_in_backup);
 #endif
-    timer.stop("Deallocation");
-    //timer.print("Deallocation", 1);
 
-    printf("Test Passed\n");
     return 0;
 }
author	Birte Kristina Friesel <birte.friesel@uos.de>	2024-07-17 16:13:24 +0200
committer	Birte Kristina Friesel <birte.friesel@uos.de>	2024-07-17 16:13:24 +0200
commit	6e923c56e0881ab1faa7eaec078cd25782786fd6 (patch)
tree	0196190ad41e91fc56a1a43c290ab4052ce35102
parent	db1cd145998ea88037f7dc427e06d3f82c0b4930 (diff)