From 79daeb2c93ca46318c23a5c28a8b27c935544d67 Mon Sep 17 00:00:00 2001 From: Birte Kristina Friesel Date: Thu, 18 Jul 2024 09:45:03 +0200 Subject: GEMV: add MEMCPY variant --- GEMV/baselines/cpu/Makefile | 3 +- GEMV/baselines/cpu/gemv_openmp.c | 93 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/GEMV/baselines/cpu/Makefile b/GEMV/baselines/cpu/Makefile index 382fa96..016d561 100644 --- a/GEMV/baselines/cpu/Makefile +++ b/GEMV/baselines/cpu/Makefile @@ -1,4 +1,5 @@ NUMA ?= 0 +NUMA_MEMCPY ?= 0 FLAGS = TYPE ?= double @@ -10,7 +11,7 @@ endif all: gemv gemv: gemv_openmp.c - gcc -Wall -Wextra -pedantic -march=native -O2 -o gemv -fopenmp -DNUMA=${NUMA} -DT=${TYPE} -DTYPE_${TYPE} gemv_openmp.c ${FLAGS} + gcc -ggdb -Wall -Wextra -pedantic -march=native -O2 -o gemv -fopenmp -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} -DT=${TYPE} -DTYPE_${TYPE} gemv_openmp.c ${FLAGS} gemv_O0: gemv_openmp.c gcc -o gemv_O0 -fopenmp gemv_openmp.c diff --git a/GEMV/baselines/cpu/gemv_openmp.c b/GEMV/baselines/cpu/gemv_openmp.c index 798f7de..3da1504 100644 --- a/GEMV/baselines/cpu/gemv_openmp.c +++ b/GEMV/baselines/cpu/gemv_openmp.c @@ -21,6 +21,12 @@ int numa_node_out = -1; int numa_node_cpu = -1; #endif +#if NUMA_MEMCPY +struct bitmask* bitmask_cpu; +int numa_node_local = -1; +int numa_node_in_is_local = 0; +#endif + #define XSTR(x) STR(x) #define STR(x) #x @@ -40,13 +46,18 @@ int main(int argc, char *argv[]) T **A, *b, *x; + T **A_local, *x_local; + #if NUMA bitmask_in = numa_parse_nodestring(argv[1]); bitmask_out = numa_parse_nodestring(argv[2]); numa_node_cpu = atoi(argv[3]); +#if NUMA_MEMCPY + bitmask_cpu = numa_parse_nodestring(argv[4]); +#endif // NUMA_MEMCPY #else (void) argv; -#endif +#endif // NUMA #if NUMA if (bitmask_out) { @@ -79,11 +90,21 @@ int main(int argc, char *argv[]) make_hilbert_mat(rows,cols, &A); #if NUMA +#if NUMA_MEMCPY + if (bitmask_cpu) { + numa_set_membind(bitmask_cpu); + numa_free_nodemask(bitmask_cpu); + } 
+#else struct bitmask *bitmask_all = numa_allocate_nodemask(); numa_bitmask_setall(bitmask_all); numa_set_membind(bitmask_all); numa_free_nodemask(bitmask_all); -#endif +#endif // NUMA_MEMCPY +#endif // NUMA + + A_local = A; + x_local = x; #if NUMA mp_pages[0] = A; @@ -116,6 +137,10 @@ int main(int argc, char *argv[]) } #endif +#if NUMA_MEMCPY + numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1; +#endif + Timer timer; for (int i = 0; i < 100; i++) { @@ -132,14 +157,70 @@ int main(int argc, char *argv[]) } } +#if NUMA_MEMCPY + start(&timer, 1, 0); + if (!numa_node_in_is_local) { + x_local = (T*) numa_alloc(sizeof(T)*cols); + allocate_dense(rows, cols, &A_local); + } + stop(&timer, 1); + + start(&timer, 2, 0); + if (!numa_node_in_is_local) { + //for (size_t i=0; i < rows; i++ ) { + // memcpy(A_local[i], A[i], cols * sizeof(T)); + //} + memcpy(*A_local, *A, rows * cols * sizeof(T)); + memcpy(x_local, x, cols * sizeof(T)); + } else { + A_local = A; + x_local = x; + } + stop(&timer, 2); + + mp_pages[0] = A_local; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A_local)"); + } + else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } + else { + numa_node_local = mp_status[0]; + } +#endif + unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic nr_threads++; start(&timer, 0, 0); - gemv(A, x, rows, cols, &b); + gemv(A_local, x_local, rows, cols, &b); stop(&timer, 0); + +#if NUMA_MEMCPY + start(&timer, 3, 0); + if (!numa_node_in_is_local) { + numa_free(x_local, sizeof(T) * cols); /* size must match the numa_alloc(sizeof(T)*cols) that created x_local */ + numa_free(*A_local, sizeof(T) * rows * cols); + numa_free(A_local, sizeof(void*) * rows); + } + stop(&timer, 3); +#endif + +#if NUMA_MEMCPY + printf("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld" + " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" + " | throughput_MBps=%f 
throughput_MOpps=%f", + nr_threads, XSTR(T), rows * cols, + numa_node_in, numa_node_out, numa_node_cpu, numa_node_local, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), + rows * cols * sizeof(T) / timer.time[0], + rows * cols / timer.time[0]); + printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", + timer.time[0], timer.time[1], timer.time[2], timer.time[3], + timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]); +#else printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld" #if NUMA " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" @@ -150,9 +231,9 @@ int main(int argc, char *argv[]) numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), #endif rows * cols * sizeof(T) / timer.time[0]); - printf(" throughput_MOpps=%f", - rows * cols / timer.time[0]); - printall(&timer, 0); + printf(" throughput_MOpps=%f latency_us=%f\n", + rows * cols / timer.time[0], timer.time[0]); +#endif } -- cgit v1.2.3