summary | refs | log | tree | commit | diff
path: root/GEMV/baselines
diff options
context:
space:
mode:
author: Birte Kristina Friesel <birte.friesel@uos.de>, 2025-01-16 08:18:46 +0100
committer: Birte Kristina Friesel <birte.friesel@uos.de>, 2025-01-16 08:18:46 +0100
commit: d83ae32594f5407c71a67f1b913fe9893f30015e (patch)
tree: d7b5a37778811ce6847a55967c467506905ae639 /GEMV/baselines
parent: 8e71c657efe8c10648849a7dc31c1689bbc7882d (diff)
GEMV: indent -linux
Diffstat (limited to 'GEMV/baselines')
-rw-r--r-- GEMV/baselines/cpu/gemv_openmp.c | 410
1 files changed, 209 insertions, 201 deletions
diff --git a/GEMV/baselines/cpu/gemv_openmp.c b/GEMV/baselines/cpu/gemv_openmp.c
index 21e24cb..99bba55 100644
--- a/GEMV/baselines/cpu/gemv_openmp.c
+++ b/GEMV/baselines/cpu/gemv_openmp.c
@@ -10,10 +10,10 @@
#include <numaif.h>
#include <numa.h>
-struct bitmask* bitmask_in;
-struct bitmask* bitmask_out;
+struct bitmask *bitmask_in;
+struct bitmask *bitmask_out;
-void* mp_pages[1];
+void *mp_pages[1];
int mp_status[1];
int mp_nodes[1];
int numa_node_in = -1;
@@ -22,7 +22,7 @@ int numa_node_cpu = -1;
#endif
#if NUMA_MEMCPY
-struct bitmask* bitmask_cpu;
+struct bitmask *bitmask_cpu;
int numa_node_cpu_memcpy = -1;
int numa_node_local = -1;
int numa_node_in_is_local = 0;
@@ -35,284 +35,292 @@ int numa_node_in_is_local = 0;
int main(int argc, char *argv[])
{
- (void) argc;
+ (void)argc;
/* // upstream config:
const size_t rows = 20480;
const size_t cols = 8192;
*/
- // DPU config: 163840 -n 4096
- const size_t rows = 163840;
- const size_t cols = 4096;
+ // DPU config: 163840 -n 4096
+ const size_t rows = 163840;
+ const size_t cols = 4096;
- T **A, *b, *x;
+ T **A, *b, *x;
- T **A_local, *x_local;
+ T **A_local, *x_local;
#if NUMA
- bitmask_in = numa_parse_nodestring(argv[1]);
- bitmask_out = numa_parse_nodestring(argv[2]);
- numa_node_cpu = atoi(argv[3]);
+ bitmask_in = numa_parse_nodestring(argv[1]);
+ bitmask_out = numa_parse_nodestring(argv[2]);
+ numa_node_cpu = atoi(argv[3]);
#if NUMA_MEMCPY
- bitmask_cpu = numa_parse_nodestring(argv[4]);
- numa_node_cpu_memcpy = atoi(argv[5]);
-#endif // NUMA_MEMCPY
+ bitmask_cpu = numa_parse_nodestring(argv[4]);
+ numa_node_cpu_memcpy = atoi(argv[5]);
+#endif // NUMA_MEMCPY
#else
- (void) argv;
-#endif // NUMA
+ (void)argv;
+#endif // NUMA
#if NUMA
- if (bitmask_out) {
- numa_set_membind(bitmask_out);
- numa_free_nodemask(bitmask_out);
- }
- b = (T*) numa_alloc(sizeof(T)*rows);
+ if (bitmask_out) {
+ numa_set_membind(bitmask_out);
+ numa_free_nodemask(bitmask_out);
+ }
+ b = (T *) numa_alloc(sizeof(T) * rows);
#else
- b = (T*) malloc(sizeof(T)*rows);
+ b = (T *) malloc(sizeof(T) * rows);
#endif
#if NUMA
- if (bitmask_in) {
- numa_set_membind(bitmask_in);
- // no free yet, re-used in allocate_dense
- }
- x = (T*) numa_alloc(sizeof(T)*cols);
+ if (bitmask_in) {
+ numa_set_membind(bitmask_in);
+ // no free yet, re-used in allocate_dense
+ }
+ x = (T *) numa_alloc(sizeof(T) * cols);
#else
- x = (T*) malloc(sizeof(T)*cols);
+ x = (T *) malloc(sizeof(T) * cols);
#endif
- allocate_dense(rows, cols, &A);
+ allocate_dense(rows, cols, &A);
#if NUMA
- if (bitmask_in) {
- numa_free_nodemask(bitmask_in);
- }
+ if (bitmask_in) {
+ numa_free_nodemask(bitmask_in);
+ }
#endif
- make_hilbert_mat(rows,cols, &A);
+ make_hilbert_mat(rows, cols, &A);
#if NUMA
#if NUMA_MEMCPY
- if (bitmask_cpu) {
- numa_set_membind(bitmask_cpu);
- numa_free_nodemask(bitmask_cpu);
- }
+ if (bitmask_cpu) {
+ numa_set_membind(bitmask_cpu);
+ numa_free_nodemask(bitmask_cpu);
+ }
#else
- struct bitmask *bitmask_all = numa_allocate_nodemask();
- numa_bitmask_setall(bitmask_all);
- numa_set_membind(bitmask_all);
- numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
-#endif // NUMA
+ struct bitmask *bitmask_all = numa_allocate_nodemask();
+ numa_bitmask_setall(bitmask_all);
+ numa_set_membind(bitmask_all);
+ numa_free_nodemask(bitmask_all);
+#endif // NUMA_MEMCPY
+#endif // NUMA
- A_local = A;
- x_local = x;
+ A_local = A;
+ x_local = x;
#if NUMA
- mp_pages[0] = A;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages(A) error: %d", mp_status[0]);
- }
- else {
- numa_node_in = mp_status[0];
- }
-
- mp_pages[0] = b;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(b)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages(b) error: %d", mp_status[0]);
- }
- else {
- numa_node_out = mp_status[0];
- }
-
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages(A) error: %d", mp_status[0]);
+ } else {
+ numa_node_in = mp_status[0];
+ }
+
+ mp_pages[0] = b;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(b)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages(b) error: %d", mp_status[0]);
+ } else {
+ numa_node_out = mp_status[0];
+ }
+
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
#endif
#if NUMA_MEMCPY
- numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+ || (numa_node_cpu + 8 == numa_node_in)) * 1;
#endif
- Timer timer;
- for (int i = 0; i < 20; i++) {
+ Timer timer;
+ for (int i = 0; i < 20; i++) {
#pragma omp parallel
- {
+ {
#pragma omp for
- for (size_t i = 0; i < cols; i++) {
- x[i] = (T) i+1 ;
- }
+ for (size_t i = 0; i < cols; i++) {
+ x[i] = (T) i + 1;
+ }
#pragma omp for
- for (size_t i = 0; i < rows; i++) {
- b[i] = (T) 0;
- }
- }
+ for (size_t i = 0; i < rows; i++) {
+ b[i] = (T) 0;
+ }
+ }
#if NUMA_MEMCPY
- start(&timer, 1, 0);
- if (!numa_node_in_is_local) {
- x_local = (T*) numa_alloc(sizeof(T)*cols);
- allocate_dense(rows, cols, &A_local);
- }
- stop(&timer, 1);
-
- if (x_local == NULL) {
- return 1;
- }
- if (A_local == NULL) {
- return 1;
- }
-
- if (!numa_node_in_is_local) {
- if (numa_node_cpu_memcpy != -1) {
- if (numa_run_on_node(numa_node_cpu_memcpy) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu_memcpy = -1;
- }
- }
- }
-
- start(&timer, 2, 0);
- if (!numa_node_in_is_local) {
- //for (size_t i=0; i < rows; i++ ) {
- // memcpy(A_local[i], A[i], cols * sizeof(T));
- //}
- memcpy(*A_local, *A, rows * cols * sizeof(T));
- memcpy(x_local, x, cols * sizeof(T));
- } else {
- A_local = A;
- x_local = x;
- }
- stop(&timer, 2);
-
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
-
- mp_pages[0] = A_local;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A_local)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_local = mp_status[0];
- }
+ start(&timer, 1, 0);
+ if (!numa_node_in_is_local) {
+ x_local = (T *) numa_alloc(sizeof(T) * cols);
+ allocate_dense(rows, cols, &A_local);
+ }
+ stop(&timer, 1);
+
+ if (x_local == NULL) {
+ return 1;
+ }
+ if (A_local == NULL) {
+ return 1;
+ }
+
+ if (!numa_node_in_is_local) {
+ if (numa_node_cpu_memcpy != -1) {
+ if (numa_run_on_node(numa_node_cpu_memcpy) ==
+ -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
+
+ start(&timer, 2, 0);
+ if (!numa_node_in_is_local) {
+ //for (size_t i=0; i < rows; i++ ) {
+ // memcpy(A_local[i], A[i], cols * sizeof(T));
+ //}
+ memcpy(*A_local, *A, rows * cols * sizeof(T));
+ memcpy(x_local, x, cols * sizeof(T));
+ } else {
+ A_local = A;
+ x_local = x;
+ }
+ stop(&timer, 2);
+
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+
+ mp_pages[0] = A_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A_local)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_local = mp_status[0];
+ }
#endif
- unsigned int nr_threads = 0;
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
+ nr_threads++;
- start(&timer, 0, 0);
- gemv(A_local, x_local, rows, cols, &b);
- stop(&timer, 0);
+ start(&timer, 0, 0);
+ gemv(A_local, x_local, rows, cols, &b);
+ stop(&timer, 0);
#if NUMA_MEMCPY
- start(&timer, 3, 0);
- if (!numa_node_in_is_local) {
- numa_free(x_local, sizeof(T) * cols);
- numa_free(*A_local, sizeof(T) * rows * cols);
- numa_free(A_local, sizeof(void*) * rows);
- }
- stop(&timer, 3);
+ start(&timer, 3, 0);
+ if (!numa_node_in_is_local) {
+ numa_free(x_local, sizeof(T) * cols);
+ numa_free(*A_local, sizeof(T) * rows * cols);
+ numa_free(A_local, sizeof(void *) * rows);
+ }
+ stop(&timer, 3);
#endif
#if NUMA_MEMCPY
- printf("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
- " | throughput_MBps=%f throughput_MOpps=%f",
- nr_threads, XSTR(T), rows * cols,
- numa_node_in, numa_node_out, numa_node_cpu, numa_node_local, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
- rows * cols * sizeof(T) / timer.time[0],
- rows * cols / timer.time[0]);
- printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
- timer.time[0], timer.time[1], timer.time[2], timer.time[3],
- timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+ printf
+ ("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " | throughput_MBps=%f throughput_MOpps=%f", nr_threads,
+ XSTR(T), rows * cols, numa_node_in, numa_node_out,
+ numa_node_cpu, numa_node_local, numa_node_cpu_memcpy,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
+ rows * cols * sizeof(T) / timer.time[0],
+ rows * cols / timer.time[0]);
+ printf
+ (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[0] + timer.time[1] + timer.time[2] +
+ timer.time[3]);
#else
- printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld"
+ printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld"
#if NUMA
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
#endif
- " | throughput_MBps=%f",
- nr_threads, XSTR(T), rows * cols,
+ " | throughput_MBps=%f",
+ nr_threads, XSTR(T), rows * cols,
#if NUMA
- numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ numa_node_in, numa_node_out, numa_node_cpu,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
#endif
- rows * cols * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f latency_us=%f\n",
- rows * cols / timer.time[0], timer.time[0]);
+ rows * cols * sizeof(T) / timer.time[0]);
+ printf(" throughput_MOpps=%f latency_us=%f\n",
+ rows * cols / timer.time[0], timer.time[0]);
#endif
- }
-
+ }
#if 0
- print_vec(x, rows);
- print_mat(A, rows, cols);
- print_vec(b, rows);
+ print_vec(x, rows);
+ print_mat(A, rows, cols);
+ print_vec(b, rows);
#endif
#if TYPE_double || TYPE_float
- printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x,cols), sum_vec(b,rows));
+ printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x, cols),
+ sum_vec(b, rows));
#else
- printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x,cols), sum_vec(b,rows));
+ printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x, cols),
+ sum_vec(b, rows));
#endif
#if NUMA
- numa_free(b, sizeof(T)*rows);
- numa_free(x, sizeof(T)*cols);
- numa_free(*A, sizeof(T)*rows*cols);
- numa_free(A, sizeof(void*)*rows);
+ numa_free(b, sizeof(T) * rows);
+ numa_free(x, sizeof(T) * cols);
+ numa_free(*A, sizeof(T) * rows * cols);
+ numa_free(A, sizeof(void *) * rows);
#else
- free(b);
- free(x);
- free(*A);
- free(A);
+ free(b);
+ free(x);
+ free(*A);
+ free(A);
#endif
- return 0;
+ return 0;
}
-void gemv(T** A, T* x, size_t rows, size_t cols, T** b) {
+void gemv(T **A, T *x, size_t rows, size_t cols, T **b)
+{
#pragma omp parallel for
- for (size_t i = 0; i < rows; i ++ )
- for (size_t j = 0; j < cols; j ++ ) {
- (*b)[i] = (*b)[i] + A[i][j]*x[j];
- }
+ for (size_t i = 0; i < rows; i++)
+ for (size_t j = 0; j < cols; j++) {
+ (*b)[i] = (*b)[i] + A[i][j] * x[j];
+ }
}
-void make_hilbert_mat(size_t rows, size_t cols, T*** A) {
+void make_hilbert_mat(size_t rows, size_t cols, T ***A)
+{
#pragma omp parallel for
- for (size_t i = 0; i < rows; i++) {
- for (size_t j = 0; j < cols; j++) {
+ for (size_t i = 0; i < rows; i++) {
+ for (size_t j = 0; j < cols; j++) {
#if TYPE_double || TYPE_float
- (*A)[i][j] = 1.0/( (T) i + (T) j + 1.0);
+ (*A)[i][j] = 1.0 / ((T) i + (T) j + 1.0);
#else
- (*A)[i][j] = (T)(((i+j)%10));
+ (*A)[i][j] = (T) (((i + j) % 10));
#endif
- }
- }
+ }
+ }
}
-T sum_vec(T* vec, size_t rows) {
- T sum = 0;
+T sum_vec(T *vec, size_t rows)
+{
+ T sum = 0;
#pragma omp parallel for reduction(+:sum)
- for (int i = 0; i < rows; i++) sum = sum + vec[i];
- return sum;
+ for (int i = 0; i < rows; i++)
+ sum = sum + vec[i];
+ return sum;
}