summary | refs | log | tree | commit | diff
path: root/GEMV/baselines/cpu/gemv_openmp.c
diff options
context:
space:
mode:
Diffstat (limited to 'GEMV/baselines/cpu/gemv_openmp.c')
-rw-r--r-- GEMV/baselines/cpu/gemv_openmp.c 93
1 files changed, 87 insertions, 6 deletions
diff --git a/GEMV/baselines/cpu/gemv_openmp.c b/GEMV/baselines/cpu/gemv_openmp.c
index 798f7de..3da1504 100644
--- a/GEMV/baselines/cpu/gemv_openmp.c
+++ b/GEMV/baselines/cpu/gemv_openmp.c
@@ -21,6 +21,12 @@ int numa_node_out = -1;
int numa_node_cpu = -1;
#endif
+#if NUMA_MEMCPY
+struct bitmask* bitmask_cpu;
+int numa_node_local = -1;
+int numa_node_in_is_local = 0;
+#endif
+
#define XSTR(x) STR(x)
#define STR(x) #x
@@ -40,13 +46,18 @@ int main(int argc, char *argv[])
T **A, *b, *x;
+ T **A_local, *x_local;
+
#if NUMA
bitmask_in = numa_parse_nodestring(argv[1]);
bitmask_out = numa_parse_nodestring(argv[2]);
numa_node_cpu = atoi(argv[3]);
+#if NUMA_MEMCPY
+ bitmask_cpu = numa_parse_nodestring(argv[4]);
+#endif // NUMA_MEMCPY
#else
(void) argv;
-#endif
+#endif // NUMA
#if NUMA
if (bitmask_out) {
@@ -79,11 +90,21 @@ int main(int argc, char *argv[])
make_hilbert_mat(rows,cols, &A);
#if NUMA
+#if NUMA_MEMCPY
+ if (bitmask_cpu) {
+ numa_set_membind(bitmask_cpu);
+ numa_free_nodemask(bitmask_cpu);
+ }
+#else
struct bitmask *bitmask_all = numa_allocate_nodemask();
numa_bitmask_setall(bitmask_all);
numa_set_membind(bitmask_all);
numa_free_nodemask(bitmask_all);
-#endif
+#endif // NUMA_MEMCPY
+#endif // NUMA
+
+ A_local = A;
+ x_local = x;
#if NUMA
mp_pages[0] = A;
@@ -116,6 +137,10 @@ int main(int argc, char *argv[])
}
#endif
+#if NUMA_MEMCPY
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+#endif
+
Timer timer;
for (int i = 0; i < 100; i++) {
@@ -132,14 +157,70 @@ int main(int argc, char *argv[])
}
}
+#if NUMA_MEMCPY
+ start(&timer, 1, 0);
+ if (!numa_node_in_is_local) {
+ x_local = (T*) numa_alloc(sizeof(T)*cols);
+ allocate_dense(rows, cols, &A_local);
+ }
+ stop(&timer, 1);
+
+ start(&timer, 2, 0);
+ if (!numa_node_in_is_local) {
+ //for (size_t i=0; i < rows; i++ ) {
+ // memcpy(A_local[i], A[i], cols * sizeof(T));
+ //}
+ memcpy(*A_local, *A, rows * cols * sizeof(T));
+ memcpy(x_local, x, cols * sizeof(T));
+ } else {
+ A_local = A;
+ x_local = x;
+ }
+ stop(&timer, 2);
+
+ mp_pages[0] = A_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A_local)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ }
+ else {
+ numa_node_local = mp_status[0];
+ }
+#endif
+
unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
nr_threads++;
start(&timer, 0, 0);
- gemv(A, x, rows, cols, &b);
+ gemv(A_local, x_local, rows, cols, &b);
stop(&timer, 0);
+
+#if NUMA_MEMCPY
+ start(&timer, 3, 0);
+ if (!numa_node_in_is_local) {
+ numa_free(x_local, sizeof(T) * cols);
+ numa_free(*A_local, sizeof(T) * rows * cols);
+ numa_free(A_local, sizeof(void*) * rows);
+ }
+ stop(&timer, 3);
+#endif
+
+#if NUMA_MEMCPY
+ printf("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " | throughput_MBps=%f throughput_MOpps=%f",
+ nr_threads, XSTR(T), rows * cols,
+ numa_node_in, numa_node_out, numa_node_cpu, numa_node_local, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ rows * cols * sizeof(T) / timer.time[0],
+ rows * cols / timer.time[0]);
+ printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+#else
printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld"
#if NUMA
" numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
@@ -150,9 +231,9 @@ int main(int argc, char *argv[])
numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
#endif
rows * cols * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f",
- rows * cols / timer.time[0]);
- printall(&timer, 0);
+ printf(" throughput_MOpps=%f latency_us=%f\n",
+ rows * cols / timer.time[0], timer.time[0]);
+#endif
}