From 6e923c56e0881ab1faa7eaec078cd25782786fd6 Mon Sep 17 00:00:00 2001 From: Birte Kristina Friesel Date: Wed, 17 Jul 2024 16:13:24 +0200 Subject: TRNS baseline: Add NUMA_MEMCPY support --- TRNS/baselines/cpu/Makefile | 3 +- TRNS/baselines/cpu/main.cpp | 174 ++++++++++++++++++++++++++++++-------------- 2 files changed, 121 insertions(+), 56 deletions(-) diff --git a/TRNS/baselines/cpu/Makefile b/TRNS/baselines/cpu/Makefile index a5a1635..236f7bb 100644 --- a/TRNS/baselines/cpu/Makefile +++ b/TRNS/baselines/cpu/Makefile @@ -33,6 +33,7 @@ # NUMA ?= 0 +NUMA_MEMCPY ?= 0 FLAGS = ifeq (${NUMA}, 1) @@ -40,7 +41,7 @@ ifeq (${NUMA}, 1) endif CXX=g++ -CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${NUMA} +CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} LIB=-L/usr/lib/ -lm -pthread diff --git a/TRNS/baselines/cpu/main.cpp b/TRNS/baselines/cpu/main.cpp index 09465e4..837c75d 100644 --- a/TRNS/baselines/cpu/main.cpp +++ b/TRNS/baselines/cpu/main.cpp @@ -55,10 +55,14 @@ void* mp_pages[1]; int mp_status[1]; int mp_nodes[1]; int numa_node_in = -1; -int numa_node_out = -1; int numa_node_cpu = -1; #endif +#if NUMA_MEMCPY +int numa_node_local = -1; +int numa_node_in_is_local = 0; +#endif + // Params --------------------------------------------------------------------- struct Params { @@ -72,9 +76,11 @@ struct Params { int n; #if NUMA struct bitmask* bitmask_in; - struct bitmask* bitmask_out; int numa_node_cpu; #endif +#if NUMA_MEMCPY + struct bitmask* bitmask_cpu; +#endif Params(int argc, char **argv) { n_threads = 4; @@ -86,11 +92,13 @@ struct Params { n = 8; #if NUMA bitmask_in = NULL; - bitmask_out = NULL; numa_node_cpu = -1; +#endif +#if NUMA_MEMCPY + bitmask_cpu = NULL; #endif int opt; - while((opt = getopt(argc, argv, "ht:w:r:m:n:o:p:a:b:c:")) >= 0) { + while((opt = getopt(argc, argv, "ht:w:r:m:n:o:p:a:c:C:")) >= 0) { switch(opt) { case 'h': usage(); @@ -105,9 +113,11 @@ struct Params { case 'p': N_ = atoi(optarg); break; #if NUMA case 'a': bitmask_in = numa_parse_nodestring(optarg); break; - case 'b': bitmask_out = numa_parse_nodestring(optarg); break; case 'c': numa_node_cpu = atoi(optarg); break; -#endif +#if NUMA_MEMCPY + case 'C': bitmask_cpu = numa_parse_nodestring(optarg); break; +#endif // NUMA_MEMCPY +#endif // NUMA default: fprintf(stderr, "\nUnrecognized option!\n"); usage(); @@ -154,7 +164,6 @@ int main(int argc, char **argv) { Timer timer; // Allocate - timer.start("Allocation"); int M_ = p.M_; int m = p.m; int N_ = p.N_; @@ -162,54 +171,59 @@ int main(int argc, char **argv) { int in_size = M_ * m * N_ * n; int finished_size = M_ * m * N_; +#if !NUMA_MEMCPY + T *h_in_backup = (T *)malloc(in_size * sizeof(T)); + ALLOC_ERR(h_in_backup); +#endif + #if NUMA if (p.bitmask_in) { numa_set_membind(p.bitmask_in); numa_free_nodemask(p.bitmask_in); } - T * h_in_out = (T *)numa_alloc(in_size * sizeof(T)); - std::atomic_int *h_finished = - (std::atomic_int *)numa_alloc(sizeof(std::atomic_int) * finished_size); + T *h_in_out = (T *)numa_alloc(in_size * sizeof(T)); #else - T * h_in_out = (T *)malloc(in_size * sizeof(T)); - std::atomic_int *h_finished = - (std::atomic_int *)malloc(sizeof(std::atomic_int) * finished_size); + T *h_in_out = (T *)malloc(in_size * sizeof(T)); #endif -#if NUMA - if (p.bitmask_out) { - numa_set_membind(p.bitmask_out); - numa_free_nodemask(p.bitmask_out); - } - std::atomic_int *h_head = (std::atomic_int *)numa_alloc(N_ * sizeof(std::atomic_int)); -#else - std::atomic_int *h_head = (std::atomic_int *)malloc(N_ * sizeof(std::atomic_int)); -#endif - ALLOC_ERR(h_in_out, h_finished, h_head); + T *h_local = h_in_out; #if NUMA +#if NUMA_MEMCPY + if (p.bitmask_cpu) { + numa_set_membind(p.bitmask_cpu); + numa_free_nodemask(p.bitmask_cpu); + } +#endif // NUMA_MEMCPY + std::atomic_int *h_finished = + (std::atomic_int *)numa_alloc(sizeof(std::atomic_int) * finished_size); + std::atomic_int *h_head = (std::atomic_int *)numa_alloc(N_ * sizeof(std::atomic_int)); +#if !NUMA_MEMCPY struct bitmask *bitmask_all = numa_allocate_nodemask(); numa_bitmask_setall(bitmask_all); numa_set_membind(bitmask_all); numa_free_nodemask(bitmask_all); -#endif +#endif // !NUMA_MEMCPY +#else // NUMA + std::atomic_int *h_finished = + (std::atomic_int *)malloc(sizeof(std::atomic_int) * finished_size); + std::atomic_int *h_head = (std::atomic_int *)malloc(N_ * sizeof(std::atomic_int)); - T *h_in_backup = (T *)malloc(in_size * sizeof(T)); - ALLOC_ERR(h_in_backup); - timer.stop("Allocation"); - //timer.print("Allocation", 1); +#endif // NUMA + + ALLOC_ERR(h_in_out, h_finished, h_head); // Initialize - timer.start("Initialization"); read_input(h_in_out, p); memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size); for(int i = 0; i < N_; i++) h_head[i].store(0); - timer.stop("Initialization"); - //timer.print("Initialization", 1); + +#if ! NUMA_MEMCPY memcpy(h_in_backup, h_in_out, in_size * sizeof(T)); // Backup for reuse across iterations +#endif #if NUMA mp_pages[0] = h_in_out; @@ -223,17 +237,6 @@ int main(int argc, char **argv) { numa_node_in = mp_status[0]; } - mp_pages[0] = h_finished; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(C)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_out = mp_status[0]; - } - numa_node_cpu = p.numa_node_cpu; if (numa_node_cpu != -1) { if (numa_run_on_node(numa_node_cpu) == -1) { @@ -244,12 +247,47 @@ int main(int argc, char **argv) { #endif +#if NUMA_MEMCPY + numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1; +#endif + // Loop over main kernel for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { +#if NUMA_MEMCPY + if(rep >= p.n_warmup) + timer.start("local alloc"); + if (!numa_node_in_is_local) { + h_local = (T *)numa_alloc(in_size * sizeof(T)); + } + if(rep >= p.n_warmup) + timer.stop("local alloc"); + + if(rep >= p.n_warmup) + timer.start("memcpy"); + if (!numa_node_in_is_local) { + memcpy(h_local, h_in_out, in_size * sizeof(T)); + } + if(rep >= p.n_warmup) + timer.stop("memcpy"); + + mp_pages[0] = h_local; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A_local)"); + } + else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } + else { + numa_node_local = mp_status[0]; + } +#else + h_local = h_in_out; + memcpy(h_local, h_in_backup, in_size * sizeof(T)); +#endif + // Reset - memcpy(h_in_out, h_in_backup, in_size * sizeof(T)); memset((void *)h_finished, 0, sizeof(std::atomic_int) * finished_size); for(int i = 0; i < N_; i++) h_head[i].store(0); @@ -258,7 +296,7 @@ int main(int argc, char **argv) { if(rep >= p.n_warmup) timer.start("Step 1"); // Launch CPU threads - std::thread main_thread_1(run_cpu_threads_100, h_in_out, h_finished, h_head, M_ * m, N_, n, p.n_threads); //M_ * m * N_); + std::thread main_thread_1(run_cpu_threads_100, h_local, h_finished, h_head, M_ * m, N_, n, p.n_threads); //M_ * m * N_); main_thread_1.join(); // end timer if(rep >= p.n_warmup) @@ -271,7 +309,7 @@ int main(int argc, char **argv) { if(rep >= p.n_warmup) timer.start("Step 2"); // Launch CPU threads - std::thread main_thread_2(run_cpu_threads_010, h_in_out, h_head, m, n, M_ * N_, p.n_threads); + std::thread main_thread_2(run_cpu_threads_010, h_local, h_head, m, n, M_ * N_, p.n_threads); main_thread_2.join(); // end timer if(rep >= p.n_warmup) @@ -286,28 +324,56 @@ int main(int argc, char **argv) { timer.start("Step 3"); // Launch CPU threads for(int i = 0; i < N_; i++){ - std::thread main_thread_3(run_cpu_threads_100, h_in_out + i * M_ * n * m, h_finished + i * M_ * n, h_head + i, M_, n, m, p.n_threads); //M_ * n); + std::thread main_thread_3(run_cpu_threads_100, h_local + i * M_ * n * m, h_finished + i * M_ * n, h_head + i, M_, n, m, p.n_threads); //M_ * n); main_thread_3.join(); } // end timer if(rep >= p.n_warmup) timer.stop("Step 3"); +#if NUMA_MEMCPY + if(rep >= p.n_warmup) + timer.start("free"); + if (!numa_node_in_is_local) { + numa_free(h_local, in_size * sizeof(T)); + } + if(rep >= p.n_warmup) + timer.stop("free"); +#endif + if (rep >= p.n_warmup) { +#if NUMA_MEMCPY + printf("[::] TRNS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d" + " numa_node_inout=%d numa_node_cpu=%d numa_distance_inout_cpu=%d" + " | throughput_MBps=%f", + p.n_threads, XSTR(T), in_size, + numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), + in_size * sizeof(T) / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3"))); + printf(" throughput_MOpps=%f", + in_size / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3"))); + double latency_kernel = timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3"); + printf(" latency_step1_us=%f latency_step2_us=%f latency_step3_us=%f", + timer.get("Step 1"), timer.get("Step 2"), timer.get("Step 3")); + printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", + latency_kernel, timer.get("local alloc"), timer.get("memcpy"), timer.get("free"), + latency_kernel + timer.get("local alloc") + timer.get("memcpy") + timer.get("free")); +#else printf("[::] TRNS-CPU | n_threads=%d e_type=%s n_elements=%d" #if NUMA - " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" + " numa_node_inout=%d numa_node_cpu=%d numa_distance_inout_cpu=%d" #endif " | throughput_MBps=%f", p.n_threads, XSTR(T), in_size, #if NUMA - numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), + numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), #endif in_size * sizeof(T) / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3"))); printf(" throughput_MOpps=%f", in_size / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3"))); - printf(" timer1_us=%f timer2_us=%f timer3_us=%f\n", - timer.get("Step 1"), timer.get("Step 2"), timer.get("Step 3")); + printf(" latency_step1_us=%f latency_step2_us=%f latency_step3_us=%f latency_total_us=%f\n", + timer.get("Step 1"), timer.get("Step 2"), timer.get("Step 3"), + timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3")); +#endif // NUMA_MEMCPY } } //timer.print("Step 1", p.n_reps); @@ -315,24 +381,22 @@ int main(int argc, char **argv) { //timer.print("Step 3", p.n_reps); // Verify answer - //verify(h_in_out, h_in_backup, M_ * m, N_ * n, 1); + //verify(h_local, h_in_backup, M_ * m, N_ * n, 1); // Free memory - timer.start("Deallocation"); #if NUMA numa_free(h_in_out, in_size * sizeof(T)); numa_free(h_finished, sizeof(std::atomic_int) * finished_size); numa_free(h_head, N_ * sizeof(std::atomic_int)); +#if !NUMA_MEMCPY numa_free(h_in_backup, in_size * sizeof(T)); +#endif #else free(h_in_out); free(h_finished); free(h_head); free(h_in_backup); #endif - timer.stop("Deallocation"); - //timer.print("Deallocation", 1); - printf("Test Passed\n"); return 0; } -- cgit v1.2.3