summaryrefslogtreecommitdiff
path: root/GEMV
diff options
context:
space:
mode:
Diffstat (limited to 'GEMV')
-rw-r--r--GEMV/baselines/cpu/gemv_openmp.c410
-rw-r--r--GEMV/dpu/task.c152
-rw-r--r--GEMV/host/app.c296
-rwxr-xr-xGEMV/support/common.h14
-rw-r--r--GEMV/support/params.h95
-rwxr-xr-xGEMV/support/timer.h143
6 files changed, 614 insertions, 496 deletions
diff --git a/GEMV/baselines/cpu/gemv_openmp.c b/GEMV/baselines/cpu/gemv_openmp.c
index 21e24cb..99bba55 100644
--- a/GEMV/baselines/cpu/gemv_openmp.c
+++ b/GEMV/baselines/cpu/gemv_openmp.c
@@ -10,10 +10,10 @@
#include <numaif.h>
#include <numa.h>
-struct bitmask* bitmask_in;
-struct bitmask* bitmask_out;
+struct bitmask *bitmask_in;
+struct bitmask *bitmask_out;
-void* mp_pages[1];
+void *mp_pages[1];
int mp_status[1];
int mp_nodes[1];
int numa_node_in = -1;
@@ -22,7 +22,7 @@ int numa_node_cpu = -1;
#endif
#if NUMA_MEMCPY
-struct bitmask* bitmask_cpu;
+struct bitmask *bitmask_cpu;
int numa_node_cpu_memcpy = -1;
int numa_node_local = -1;
int numa_node_in_is_local = 0;
@@ -35,284 +35,292 @@ int numa_node_in_is_local = 0;
int main(int argc, char *argv[])
{
- (void) argc;
+ (void)argc;
/* // upstream config:
const size_t rows = 20480;
const size_t cols = 8192;
*/
- // DPU config: 163840 -n 4096
- const size_t rows = 163840;
- const size_t cols = 4096;
+ // DPU config: 163840 -n 4096
+ const size_t rows = 163840;
+ const size_t cols = 4096;
- T **A, *b, *x;
+ T **A, *b, *x;
- T **A_local, *x_local;
+ T **A_local, *x_local;
#if NUMA
- bitmask_in = numa_parse_nodestring(argv[1]);
- bitmask_out = numa_parse_nodestring(argv[2]);
- numa_node_cpu = atoi(argv[3]);
+ bitmask_in = numa_parse_nodestring(argv[1]);
+ bitmask_out = numa_parse_nodestring(argv[2]);
+ numa_node_cpu = atoi(argv[3]);
#if NUMA_MEMCPY
- bitmask_cpu = numa_parse_nodestring(argv[4]);
- numa_node_cpu_memcpy = atoi(argv[5]);
-#endif // NUMA_MEMCPY
+ bitmask_cpu = numa_parse_nodestring(argv[4]);
+ numa_node_cpu_memcpy = atoi(argv[5]);
+#endif // NUMA_MEMCPY
#else
- (void) argv;
-#endif // NUMA
+ (void)argv;
+#endif // NUMA
#if NUMA
- if (bitmask_out) {
- numa_set_membind(bitmask_out);
- numa_free_nodemask(bitmask_out);
- }
- b = (T*) numa_alloc(sizeof(T)*rows);
+ if (bitmask_out) {
+ numa_set_membind(bitmask_out);
+ numa_free_nodemask(bitmask_out);
+ }
+ b = (T *) numa_alloc(sizeof(T) * rows);
#else
- b = (T*) malloc(sizeof(T)*rows);
+ b = (T *) malloc(sizeof(T) * rows);
#endif
#if NUMA
- if (bitmask_in) {
- numa_set_membind(bitmask_in);
- // no free yet, re-used in allocate_dense
- }
- x = (T*) numa_alloc(sizeof(T)*cols);
+ if (bitmask_in) {
+ numa_set_membind(bitmask_in);
+ // no free yet, re-used in allocate_dense
+ }
+ x = (T *) numa_alloc(sizeof(T) * cols);
#else
- x = (T*) malloc(sizeof(T)*cols);
+ x = (T *) malloc(sizeof(T) * cols);
#endif
- allocate_dense(rows, cols, &A);
+ allocate_dense(rows, cols, &A);
#if NUMA
- if (bitmask_in) {
- numa_free_nodemask(bitmask_in);
- }
+ if (bitmask_in) {
+ numa_free_nodemask(bitmask_in);
+ }
#endif
- make_hilbert_mat(rows,cols, &A);
+ make_hilbert_mat(rows, cols, &A);
#if NUMA
#if NUMA_MEMCPY
- if (bitmask_cpu) {
- numa_set_membind(bitmask_cpu);
- numa_free_nodemask(bitmask_cpu);
- }
+ if (bitmask_cpu) {
+ numa_set_membind(bitmask_cpu);
+ numa_free_nodemask(bitmask_cpu);
+ }
#else
- struct bitmask *bitmask_all = numa_allocate_nodemask();
- numa_bitmask_setall(bitmask_all);
- numa_set_membind(bitmask_all);
- numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
-#endif // NUMA
+ struct bitmask *bitmask_all = numa_allocate_nodemask();
+ numa_bitmask_setall(bitmask_all);
+ numa_set_membind(bitmask_all);
+ numa_free_nodemask(bitmask_all);
+#endif // NUMA_MEMCPY
+#endif // NUMA
- A_local = A;
- x_local = x;
+ A_local = A;
+ x_local = x;
#if NUMA
- mp_pages[0] = A;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages(A) error: %d", mp_status[0]);
- }
- else {
- numa_node_in = mp_status[0];
- }
-
- mp_pages[0] = b;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(b)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages(b) error: %d", mp_status[0]);
- }
- else {
- numa_node_out = mp_status[0];
- }
-
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages(A) error: %d", mp_status[0]);
+ } else {
+ numa_node_in = mp_status[0];
+ }
+
+ mp_pages[0] = b;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(b)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages(b) error: %d", mp_status[0]);
+ } else {
+ numa_node_out = mp_status[0];
+ }
+
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
#endif
#if NUMA_MEMCPY
- numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+ || (numa_node_cpu + 8 == numa_node_in)) * 1;
#endif
- Timer timer;
- for (int i = 0; i < 20; i++) {
+ Timer timer;
+ for (int i = 0; i < 20; i++) {
#pragma omp parallel
- {
+ {
#pragma omp for
- for (size_t i = 0; i < cols; i++) {
- x[i] = (T) i+1 ;
- }
+ for (size_t i = 0; i < cols; i++) {
+ x[i] = (T) i + 1;
+ }
#pragma omp for
- for (size_t i = 0; i < rows; i++) {
- b[i] = (T) 0;
- }
- }
+ for (size_t i = 0; i < rows; i++) {
+ b[i] = (T) 0;
+ }
+ }
#if NUMA_MEMCPY
- start(&timer, 1, 0);
- if (!numa_node_in_is_local) {
- x_local = (T*) numa_alloc(sizeof(T)*cols);
- allocate_dense(rows, cols, &A_local);
- }
- stop(&timer, 1);
-
- if (x_local == NULL) {
- return 1;
- }
- if (A_local == NULL) {
- return 1;
- }
-
- if (!numa_node_in_is_local) {
- if (numa_node_cpu_memcpy != -1) {
- if (numa_run_on_node(numa_node_cpu_memcpy) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu_memcpy = -1;
- }
- }
- }
-
- start(&timer, 2, 0);
- if (!numa_node_in_is_local) {
- //for (size_t i=0; i < rows; i++ ) {
- // memcpy(A_local[i], A[i], cols * sizeof(T));
- //}
- memcpy(*A_local, *A, rows * cols * sizeof(T));
- memcpy(x_local, x, cols * sizeof(T));
- } else {
- A_local = A;
- x_local = x;
- }
- stop(&timer, 2);
-
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
-
- mp_pages[0] = A_local;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A_local)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_local = mp_status[0];
- }
+ start(&timer, 1, 0);
+ if (!numa_node_in_is_local) {
+ x_local = (T *) numa_alloc(sizeof(T) * cols);
+ allocate_dense(rows, cols, &A_local);
+ }
+ stop(&timer, 1);
+
+ if (x_local == NULL) {
+ return 1;
+ }
+ if (A_local == NULL) {
+ return 1;
+ }
+
+ if (!numa_node_in_is_local) {
+ if (numa_node_cpu_memcpy != -1) {
+ if (numa_run_on_node(numa_node_cpu_memcpy) ==
+ -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
+
+ start(&timer, 2, 0);
+ if (!numa_node_in_is_local) {
+ //for (size_t i=0; i < rows; i++ ) {
+ // memcpy(A_local[i], A[i], cols * sizeof(T));
+ //}
+ memcpy(*A_local, *A, rows * cols * sizeof(T));
+ memcpy(x_local, x, cols * sizeof(T));
+ } else {
+ A_local = A;
+ x_local = x;
+ }
+ stop(&timer, 2);
+
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+
+ mp_pages[0] = A_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A_local)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_local = mp_status[0];
+ }
#endif
- unsigned int nr_threads = 0;
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
+ nr_threads++;
- start(&timer, 0, 0);
- gemv(A_local, x_local, rows, cols, &b);
- stop(&timer, 0);
+ start(&timer, 0, 0);
+ gemv(A_local, x_local, rows, cols, &b);
+ stop(&timer, 0);
#if NUMA_MEMCPY
- start(&timer, 3, 0);
- if (!numa_node_in_is_local) {
- numa_free(x_local, sizeof(T) * cols);
- numa_free(*A_local, sizeof(T) * rows * cols);
- numa_free(A_local, sizeof(void*) * rows);
- }
- stop(&timer, 3);
+ start(&timer, 3, 0);
+ if (!numa_node_in_is_local) {
+ numa_free(x_local, sizeof(T) * cols);
+ numa_free(*A_local, sizeof(T) * rows * cols);
+ numa_free(A_local, sizeof(void *) * rows);
+ }
+ stop(&timer, 3);
#endif
#if NUMA_MEMCPY
- printf("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
- " | throughput_MBps=%f throughput_MOpps=%f",
- nr_threads, XSTR(T), rows * cols,
- numa_node_in, numa_node_out, numa_node_cpu, numa_node_local, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
- rows * cols * sizeof(T) / timer.time[0],
- rows * cols / timer.time[0]);
- printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
- timer.time[0], timer.time[1], timer.time[2], timer.time[3],
- timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+ printf
+ ("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " | throughput_MBps=%f throughput_MOpps=%f", nr_threads,
+ XSTR(T), rows * cols, numa_node_in, numa_node_out,
+ numa_node_cpu, numa_node_local, numa_node_cpu_memcpy,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
+ rows * cols * sizeof(T) / timer.time[0],
+ rows * cols / timer.time[0]);
+ printf
+ (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[0] + timer.time[1] + timer.time[2] +
+ timer.time[3]);
#else
- printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld"
+ printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld"
#if NUMA
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
#endif
- " | throughput_MBps=%f",
- nr_threads, XSTR(T), rows * cols,
+ " | throughput_MBps=%f",
+ nr_threads, XSTR(T), rows * cols,
#if NUMA
- numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ numa_node_in, numa_node_out, numa_node_cpu,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
#endif
- rows * cols * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f latency_us=%f\n",
- rows * cols / timer.time[0], timer.time[0]);
+ rows * cols * sizeof(T) / timer.time[0]);
+ printf(" throughput_MOpps=%f latency_us=%f\n",
+ rows * cols / timer.time[0], timer.time[0]);
#endif
- }
-
+ }
#if 0
- print_vec(x, rows);
- print_mat(A, rows, cols);
- print_vec(b, rows);
+ print_vec(x, rows);
+ print_mat(A, rows, cols);
+ print_vec(b, rows);
#endif
#if TYPE_double || TYPE_float
- printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x,cols), sum_vec(b,rows));
+ printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x, cols),
+ sum_vec(b, rows));
#else
- printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x,cols), sum_vec(b,rows));
+ printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x, cols),
+ sum_vec(b, rows));
#endif
#if NUMA
- numa_free(b, sizeof(T)*rows);
- numa_free(x, sizeof(T)*cols);
- numa_free(*A, sizeof(T)*rows*cols);
- numa_free(A, sizeof(void*)*rows);
+ numa_free(b, sizeof(T) * rows);
+ numa_free(x, sizeof(T) * cols);
+ numa_free(*A, sizeof(T) * rows * cols);
+ numa_free(A, sizeof(void *) * rows);
#else
- free(b);
- free(x);
- free(*A);
- free(A);
+ free(b);
+ free(x);
+ free(*A);
+ free(A);
#endif
- return 0;
+ return 0;
}
-void gemv(T** A, T* x, size_t rows, size_t cols, T** b) {
+void gemv(T **A, T *x, size_t rows, size_t cols, T **b)
+{
#pragma omp parallel for
- for (size_t i = 0; i < rows; i ++ )
- for (size_t j = 0; j < cols; j ++ ) {
- (*b)[i] = (*b)[i] + A[i][j]*x[j];
- }
+ for (size_t i = 0; i < rows; i++)
+ for (size_t j = 0; j < cols; j++) {
+ (*b)[i] = (*b)[i] + A[i][j] * x[j];
+ }
}
-void make_hilbert_mat(size_t rows, size_t cols, T*** A) {
+void make_hilbert_mat(size_t rows, size_t cols, T ***A)
+{
#pragma omp parallel for
- for (size_t i = 0; i < rows; i++) {
- for (size_t j = 0; j < cols; j++) {
+ for (size_t i = 0; i < rows; i++) {
+ for (size_t j = 0; j < cols; j++) {
#if TYPE_double || TYPE_float
- (*A)[i][j] = 1.0/( (T) i + (T) j + 1.0);
+ (*A)[i][j] = 1.0 / ((T) i + (T) j + 1.0);
#else
- (*A)[i][j] = (T)(((i+j)%10));
+ (*A)[i][j] = (T) (((i + j) % 10));
#endif
- }
- }
+ }
+ }
}
-T sum_vec(T* vec, size_t rows) {
- T sum = 0;
+T sum_vec(T *vec, size_t rows)
+{
+ T sum = 0;
#pragma omp parallel for reduction(+:sum)
- for (int i = 0; i < rows; i++) sum = sum + vec[i];
- return sum;
+ for (int i = 0; i < rows; i++)
+ sum = sum + vec[i];
+ return sum;
}
diff --git a/GEMV/dpu/task.c b/GEMV/dpu/task.c
index 0226437..3bf52e8 100644
--- a/GEMV/dpu/task.c
+++ b/GEMV/dpu/task.c
@@ -17,7 +17,8 @@
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// GEMV
-static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
+static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos)
+{
for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) {
bufferC[pos] += bufferA[i] * bufferB[i];
}
@@ -28,13 +29,14 @@ static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
BARRIER_INIT(my_barrier, NR_TASKLETS);
// main
-int main() {
+int main()
+{
unsigned int tasklet_id = me();
#if PRINT
// printf("tasklet_id = %u\n", tasklet_id);
#endif
- if (tasklet_id == 0){ // Initialize once the cycle counter
- mem_reset(); // Reset the heap
+ if (tasklet_id == 0) { // Initialize once the cycle counter
+ mem_reset(); // Reset the heap
}
// Barrier
barrier_wait(&my_barrier);
@@ -44,15 +46,15 @@ int main() {
uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows;
uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows;
- unsigned int element_per_cacheC = 8/sizeof(T);
+ unsigned int element_per_cacheC = 8 / sizeof(T);
unsigned int nrows = nr_rows;
- unsigned int rows_per_tasklet;
+ unsigned int rows_per_tasklet;
unsigned int start_row;
unsigned int chunks = nrows / (NR_TASKLETS * element_per_cacheC);
- unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks;
+ unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks;
rows_per_tasklet = dbl_chunks;
- unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS);
+ unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS);
if ((tasklet_id * element_per_cacheC) < rest_rows)
rows_per_tasklet += element_per_cacheC;
@@ -60,22 +62,32 @@ int main() {
if ((tasklet_id * element_per_cacheC) >= rest_rows) {
// unsigned int hlf_rest_rows = rest_rows >> 1;
if ((rest_rows % element_per_cacheC) != 0)
- start_row = roundup(rest_rows, element_per_cacheC) + tasklet_id * dbl_chunks;
- // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
+ start_row =
+ roundup(rest_rows,
+ element_per_cacheC) +
+ tasklet_id * dbl_chunks;
+ // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
else
- start_row = rest_rows + tasklet_id * dbl_chunks;
- // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
- } else
- start_row = tasklet_id * (dbl_chunks + element_per_cacheC);
- // start_row = tasklet_id * (dbl_chunks + 2);
+ start_row = rest_rows + tasklet_id * dbl_chunks;
+ // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
+ } else
+ start_row =
+ tasklet_id * (dbl_chunks + element_per_cacheC);
+ // start_row = tasklet_id * (dbl_chunks + 2);
} else {
start_row = tasklet_id * (dbl_chunks);
}
// Address of the current row in MRAM
- uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
- uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T));
- uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T));
+ uint32_t mram_base_addr_A =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
+ uint32_t mram_base_addr_B =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER +
+ max_rows * n_size_pad * sizeof(T));
+ uint32_t mram_base_addr_C =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER +
+ max_rows * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T) + start_row * sizeof(T));
uint32_t mram_temp_addr_A = mram_base_addr_A;
uint32_t mram_temp_addr_B = mram_base_addr_B;
@@ -87,55 +99,65 @@ int main() {
int offset = 0;
- #if PRINT
- printf("id: %d, rows_per_tasklet = %d\n",tasklet_id, rows_per_tasklet);
- printf("id: %d, start_row = %d\n",tasklet_id, start_row);
- #endif
+#if PRINT
+ printf("id: %d, rows_per_tasklet = %d\n", tasklet_id, rows_per_tasklet);
+ printf("id: %d, start_row = %d\n", tasklet_id, start_row);
+#endif
// Iterate over nr_rows
// for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) {
- for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += element_per_cacheC) {
+ for (unsigned int i = start_row; i < start_row + rows_per_tasklet;
+ i += element_per_cacheC) {
- mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
+ mram_temp_addr_A =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
// cache_C[0] = 0;
// cache_C[1] = 0;
// clear the cache
- for(unsigned int c = 0; c < element_per_cacheC; c++){
- cache_C[c] = 0;
+ for (unsigned int c = 0; c < element_per_cacheC; c++) {
+ cache_C[c] = 0;
}
// for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){
// for(unsigned int pos = 0; (pos < element_per_cacheC) && ((i + pos) < (start_row + rows_per_tasklet)); pos++){
// for(unsigned int pos = 0; pos < element_per_cacheC && i + pos < nr_rows; pos++){
- for(unsigned int pos = 0; pos < element_per_cacheC; pos++){
- if(i + pos >= nr_rows){
+ for (unsigned int pos = 0; pos < element_per_cacheC; pos++) {
+ if (i + pos >= nr_rows) {
// printf("id: %d, nrows: %d, error\n", tasklet_id, nrows);
break;
- }
+ }
int n = 0, j;
- for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T)))
- {
-
- mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
- mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
- if(offset)
- {
-
- for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++)
- {
+ for (n = 0;
+ n < (int32_t) (n_size - (BLOCK_SIZE / sizeof(T)));
+ n += (BLOCK_SIZE / sizeof(T))) {
+
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A), cache_A,
+ BLOCK_SIZE);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_B), cache_B,
+ BLOCK_SIZE);
+
+ if (offset) {
+
+ for (unsigned int off = 0;
+ off < (BLOCK_SIZE / sizeof(T)) - 1;
+ off++) {
cache_A[off] = cache_A[off + 1];
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A +
+ BLOCK_SIZE), cache_A_aux,
+ 8);
- cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+ cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+ cache_A_aux[0];
}
-
// Compute GEMV
gemv(cache_C, cache_A, cache_B, pos);
@@ -144,53 +166,55 @@ int main() {
mram_temp_addr_B += BLOCK_SIZE;
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
-
+ mram_read((__mram_ptr void const *)(mram_temp_addr_A),
+ cache_A, BLOCK_SIZE);
- if(offset)
- {
- for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++)
- {
+ if (offset) {
+ for (unsigned int off = 0;
+ off < (BLOCK_SIZE / sizeof(T)) - 1;
+ off++) {
cache_A[off] = cache_A[off + 1];
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A + BLOCK_SIZE),
+ cache_A_aux, 8);
- cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+ cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+ cache_A_aux[0];
}
+ mram_read((__mram_ptr void const *)(mram_temp_addr_B),
+ cache_B, BLOCK_SIZE);
- mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
- for (j = 0; j < (int) (n_size - n); j++) {
+ for (j = 0; j < (int)(n_size - n); j++) {
// Compute GEMV
- if(j >= (int)(BLOCK_SIZE / sizeof(T))){
+ if (j >= (int)(BLOCK_SIZE / sizeof(T))) {
printf("error\n");
break;
}
cache_C[pos] += cache_A[j] * cache_B[j];
}
-
- mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T));
+ mram_temp_addr_A +=
+ (BLOCK_SIZE -
+ ((BLOCK_SIZE / sizeof(T)) -
+ (n_size - n)) * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
- if(mram_temp_addr_A % 8 != 0)
- {
+ if (mram_temp_addr_A % 8 != 0) {
offset = 1;
- }
- else
- {
+ } else {
offset = 0;
}
}
// Write cache to current MRAM block
- mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8);
+ mram_write(cache_C, (__mram_ptr void *)(mram_base_addr_C), 8);
// Update memory address
// mram_base_addr_C += 2 * sizeof(T);
- mram_base_addr_C += 8;
+ mram_base_addr_C += 8;
}
diff --git a/GEMV/host/app.c b/GEMV/host/app.c
index ebd0336..6553774 100644
--- a/GEMV/host/app.c
+++ b/GEMV/host/app.c
@@ -33,69 +33,69 @@
#define DPU_BINARY "./bin/gemv_dpu"
#endif
-static T* A;
-static T* B;
-static T* C;
-static T* C_dpu;
+static T *A;
+static T *B;
+static T *C;
+static T *C_dpu;
// Create input arrays
-static void init_data(T* A, T* B, unsigned int m_size, unsigned int n_size) {
+static void init_data(T *A, T *B, unsigned int m_size, unsigned int n_size)
+{
srand(0);
- for (unsigned int i = 0; i < m_size * n_size; i++)
- {
- A[i] = (unsigned int) (rand()%50);
+ for (unsigned int i = 0; i < m_size * n_size; i++) {
+ A[i] = (unsigned int)(rand() % 50);
}
- for (unsigned int i = 0; i < n_size; i++)
- {
- B[i] = (unsigned int) (rand()%50);
+ for (unsigned int i = 0; i < n_size; i++) {
+ B[i] = (unsigned int)(rand() % 50);
}
}
// Compute output in the host
-static void gemv_host(T* C, T* A, T* B, unsigned int m_size, unsigned int n_size) {
- for (unsigned int i = 0; i < m_size; i++)
- {
+static void gemv_host(T *C, T *A, T *B, unsigned int m_size,
+ unsigned int n_size)
+{
+ for (unsigned int i = 0; i < m_size; i++) {
C[i] = 0;
}
for (unsigned int m = 0; m < m_size; m++) {
- for (unsigned int n = 0; n < n_size; n++)
- {
+ for (unsigned int n = 0; n < n_size; n++) {
C[m] += A[m * n_size + n] * B[n];
}
}
}
// Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
struct Params p = input_params(argc, argv);
struct dpu_set_t dpu_set, dpu;
uint32_t nr_of_dpus;
- uint32_t nr_of_ranks;
+ uint32_t nr_of_ranks;
// Timer
Timer timer;
- int numa_node_rank = -2;
+ int numa_node_rank = -2;
- // Allocate DPUs and load binary
+ // Allocate DPUs and load binary
#if !WITH_ALLOC_OVERHEAD
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- timer.time[0] = 0; // alloc
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ timer.time[0] = 0; // alloc
#endif
#if !WITH_LOAD_OVERHEAD
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
- timer.time[1] = 0; // load
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
+ timer.time[1] = 0; // load
#endif
#if !WITH_FREE_OVERHEAD
- timer.time[8] = 0; // free
+ timer.time[8] = 0; // free
#endif
#if ENERGY
@@ -108,12 +108,13 @@ int main(int argc, char **argv) {
unsigned int n_size = p.n_size;
// Initialize help data
- dpu_info = (struct dpu_info_t *) malloc(NR_DPUS * sizeof(struct dpu_info_t));
- dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(NR_DPUS * sizeof(dpu_arguments_t));
+ dpu_info =
+ (struct dpu_info_t *)malloc(NR_DPUS * sizeof(struct dpu_info_t));
+ dpu_arguments_t *input_args =
+ (dpu_arguments_t *) malloc(NR_DPUS * sizeof(dpu_arguments_t));
uint32_t max_rows_per_dpu = 0;
uint32_t n_size_pad = n_size;
- if(n_size % 2 == 1)
- {
+ if (n_size % 2 == 1) {
n_size_pad++;
}
@@ -127,7 +128,10 @@ int main(int argc, char **argv) {
rows_per_dpu++;
if (rest_rows > 0) {
if (i >= rest_rows)
- prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks;
+ prev_rows_dpu =
+ rest_rows * (chunks + 1) + (i -
+ rest_rows) *
+ chunks;
else
prev_rows_dpu = i * (chunks + 1);
} else {
@@ -136,7 +140,7 @@ int main(int argc, char **argv) {
// Keep max rows for parallel transfers
uint32_t rows_per_dpu_pad = rows_per_dpu;
- if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
+ if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
rows_per_dpu_pad++;
if (rows_per_dpu_pad > max_rows_per_dpu)
max_rows_per_dpu = rows_per_dpu_pad;
@@ -163,20 +167,20 @@ int main(int argc, char **argv) {
for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
#if WITH_ALLOC_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 0, 0);
}
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 0);
}
#endif
#if WITH_LOAD_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 1, 0);
}
DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 1);
}
DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
@@ -186,26 +190,33 @@ int main(int argc, char **argv) {
// int prev_rank_id = -1;
int rank_id = -1;
- DPU_FOREACH (dpu_set, dpu) {
- rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
- if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
+ DPU_FOREACH(dpu_set, dpu) {
+ rank_id =
+ dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+ DPU_TARGET_MASK;
+ if ((numa_node_rank != -2)
+ && numa_node_rank !=
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)))) {
numa_node_rank = -1;
} else {
- numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
+ numa_node_rank =
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)));
}
/*
- if (rank_id != prev_rank_id) {
- printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
- prev_rank_id = rank_id;
- }
- */
+ if (rank_id != prev_rank_id) {
+ printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+ prev_rank_id = rank_id;
+ }
+ */
}
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 2, 0);
}
gemv_host(C, A, B, m_size, n_size);
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 2);
}
if (rep >= p.n_warmup) {
@@ -220,23 +231,30 @@ int main(int argc, char **argv) {
DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+ sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 3);
}
if (rep >= p.n_warmup) {
start(&timer, 6, 0);
}
-
// Copy input array and vector
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, A + dpu_info[i].prev_rows_dpu * n_size));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu,
+ A + dpu_info[i].prev_rows_dpu * n_size));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 6);
}
if (rep >= p.n_warmup) {
@@ -246,12 +264,15 @@ int main(int argc, char **argv) {
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, B));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
if (rep >= p.n_warmup) {
stop(&timer, 7);
}
-
// Run kernel on DPUs
if (rep >= p.n_warmup) {
start(&timer, 4, 0);
@@ -280,89 +301,140 @@ int main(int argc, char **argv) {
start(&timer, 5, 0);
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, C_dpu + i * max_rows_per_dpu));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup) {
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T),
+ max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
+ if (rep >= p.n_warmup) {
stop(&timer, 5);
}
-
#if WITH_ALLOC_OVERHEAD
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 8, 0);
}
#endif
DPU_ASSERT(dpu_free(dpu_set));
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 8);
}
#endif
#endif
-
// Check output
bool status = true;
- unsigned int n,j;
+ unsigned int n, j;
i = 0;
for (n = 0; n < NR_DPUS; n++) {
for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
- if(C[i] != C_dpu[n * max_rows_per_dpu + j]) {
+ if (C[i] != C_dpu[n * max_rows_per_dpu + j]) {
status = false;
#if PRINT
- // printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
+ // printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
#endif
}
i++;
}
}
if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+ "] Outputs are equal\n");
if (rep >= p.n_warmup) {
- printf("[::] GEMV-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d",
- NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, n_size * m_size);
- printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
- WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
- printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
- timer.time[0],
- timer.time[1],
- timer.time[2],
- timer.time[3] + timer.time[6] + timer.time[7],
- timer.time[4],
- timer.time[5],
- timer.time[8]);
- printf(" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f",
- timer.time[3],
- timer.time[6],
- timer.time[7]
- );
- printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
- n_size * m_size * sizeof(T) / timer.time[2],
- n_size * m_size * sizeof(T) / (timer.time[4]),
- n_size * m_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5] + timer.time[8]));
- printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
- n_size * m_size * sizeof(T) / (timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
- n_size * m_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
- n_size * m_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]));
- printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
- n_size * m_size / timer.time[2],
- n_size * m_size / (timer.time[4]),
- n_size * m_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5] + timer.time[8]));
- printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
- n_size * m_size / (timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
- n_size * m_size / (timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
- n_size * m_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]));
+ printf
+ ("[::] GEMV-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d",
+ NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T),
+ BLOCK_SIZE, n_size * m_size);
+ printf
+ (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+ WITH_FREE_OVERHEAD, numa_node_rank);
+ printf
+ ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+ timer.time[0], timer.time[1],
+ timer.time[2],
+ timer.time[3] + timer.time[6] +
+ timer.time[7], timer.time[4],
+ timer.time[5], timer.time[8]);
+ printf
+ (" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f",
+ timer.time[3], timer.time[6], timer.time[7]
+ );
+ printf
+ (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+ n_size * m_size * sizeof(T) /
+ timer.time[2],
+ n_size * m_size * sizeof(T) /
+ (timer.time[4]),
+ n_size * m_size * sizeof(T) /
+ (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[6] +
+ timer.time[7] + timer.time[4] +
+ timer.time[5] + timer.time[8]));
+ printf
+ (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+ n_size * m_size * sizeof(T) /
+ (timer.time[3] + timer.time[6] +
+ timer.time[7] + timer.time[4] +
+ timer.time[5]),
+ n_size * m_size * sizeof(T) /
+ (timer.time[1] + timer.time[3] +
+ timer.time[6] + timer.time[7] +
+ timer.time[4] + timer.time[5]),
+ n_size * m_size * sizeof(T) /
+ (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[6] +
+ timer.time[7] + timer.time[4] +
+ timer.time[5]));
+ printf
+ (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+ n_size * m_size / timer.time[2],
+ n_size * m_size / (timer.time[4]),
+ n_size * m_size / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[6] +
+ timer.time[7] +
+ timer.time[4] +
+ timer.time[5] +
+ timer.time[8]));
+ printf
+ (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+ n_size * m_size / (timer.time[3] +
+ timer.time[6] +
+ timer.time[7] +
+ timer.time[4] +
+ timer.time[5]),
+ n_size * m_size / (timer.time[1] +
+ timer.time[3] +
+ timer.time[6] +
+ timer.time[7] +
+ timer.time[4] +
+ timer.time[5]),
+ n_size * m_size / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[6] +
+ timer.time[7] +
+ timer.time[4] +
+ timer.time[5]));
}
} else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+ "] Outputs differ!\n");
}
}
#if ENERGY
double acc_energy, avg_energy, acc_time, avg_time;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+ DPU_ASSERT(dpu_probe_get
+ (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
@@ -370,15 +442,15 @@ int main(int argc, char **argv) {
// Print timing results
/*
- printf("CPU Version Time (ms): ");
- print(&timer, 0, 1);
- printf("CPU-DPU Time (ms): ");
- print(&timer, 1, p.n_reps);
- printf("DPU Kernel Time (ms): ");
- print(&timer, 2, p.n_reps);
- printf("DPU-CPU Time (ms): ");
- print(&timer, 3, p.n_reps);
- */
+ printf("CPU Version Time (ms): ");
+ print(&timer, 0, 1);
+ printf("CPU-DPU Time (ms): ");
+ print(&timer, 1, p.n_reps);
+ printf("DPU Kernel Time (ms): ");
+ print(&timer, 2, p.n_reps);
+ printf("DPU-CPU Time (ms): ");
+ print(&timer, 3, p.n_reps);
+ */
#if ENERGY
printf("Energy (J): %f J\t", avg_energy);
diff --git a/GEMV/support/common.h b/GEMV/support/common.h
index 0deebcb..47a9628 100755
--- a/GEMV/support/common.h
+++ b/GEMV/support/common.h
@@ -3,17 +3,17 @@
// Structures used by both the host and the dpu to communicate information
typedef struct {
- uint32_t n_size;
- uint32_t n_size_pad;
- uint32_t nr_rows;
- uint32_t max_rows;
+ uint32_t n_size;
+ uint32_t n_size_pad;
+ uint32_t nr_rows;
+ uint32_t max_rows;
} dpu_arguments_t;
// Specific information for each DPU
struct dpu_info_t {
- uint32_t rows_per_dpu;
- uint32_t rows_per_dpu_pad;
- uint32_t prev_rows_dpu;
+ uint32_t rows_per_dpu;
+ uint32_t rows_per_dpu_pad;
+ uint32_t prev_rows_dpu;
};
struct dpu_info_t *dpu_info;
diff --git a/GEMV/support/params.h b/GEMV/support/params.h
index 526c71c..c72b0c1 100644
--- a/GEMV/support/params.h
+++ b/GEMV/support/params.h
@@ -4,53 +4,62 @@
#include "common.h"
typedef struct Params {
- unsigned int m_size;
- unsigned int n_size;
- unsigned int n_warmup;
- unsigned int n_reps;
-}Params;
+ unsigned int m_size;
+ unsigned int n_size;
+ unsigned int n_warmup;
+ unsigned int n_reps;
+} Params;
-static void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -m <I> m_size (default=8192 elements)"
- "\n -n <I> n_size (default=8192 elements)"
- "\n");
+static void usage()
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -w <W> # of untimed warmup iterations (default=1)"
+ "\n -e <E> # of timed repetition iterations (default=3)"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -m <I> m_size (default=8192 elements)"
+ "\n -n <I> n_size (default=8192 elements)" "\n");
}
-struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.m_size = 8192;
- p.n_size = 8192;
- p.n_warmup = 1;
- p.n_reps = 3;
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.m_size = 8192;
+ p.n_size = 8192;
+ p.n_warmup = 1;
+ p.n_reps = 3;
- int opt;
- while((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'm': p.m_size = atoi(optarg); break;
- case 'n': p.n_size = atoi(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(NR_DPUS > 0 && "Invalid # of dpus!");
+ int opt;
+ while ((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'm':
+ p.m_size = atoi(optarg);
+ break;
+ case 'n':
+ p.n_size = atoi(optarg);
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+ assert(NR_DPUS > 0 && "Invalid # of dpus!");
- return p;
+ return p;
}
#endif
diff --git a/GEMV/support/timer.h b/GEMV/support/timer.h
index 99d79f4..b2b9148 100755
--- a/GEMV/support/timer.h
+++ b/GEMV/support/timer.h
@@ -1,69 +1,74 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[9];
- struct timeval stopTime[9];
- double time[9];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
- //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
-
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+typedef struct Timer {
+ struct timeval startTime[9];
+ struct timeval stopTime[9];
+ double time[9];
+} Timer;
+
+void start(Timer *timer, int i, int rep)
+{
+ if (rep == 0) {
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i)
+{
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] +=
+ (timer->stopTime[i].tv_sec -
+ timer->startTime[i].tv_sec) * 1000000.0 +
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+
+ //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
+ // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
+}
+
+void print(Timer *timer, int i, int REP)
+{
+ printf("%f\t", timer->time[i] / (1000 * REP));
+}
+
+void printall(Timer *timer, int maxt)
+{
+ for (int i = 0; i <= maxt; i++) {
+ printf(" timer%d_us=%f", i, timer->time[i]);
+ }
+ printf("\n");
+}