summary | refs | log | tree | commit | diff
path: root/GEMV/dpu/task.c
diff options
context:
space:
mode:
Diffstat (limited to 'GEMV/dpu/task.c')
-rw-r--r-- GEMV/dpu/task.c 152
1 files changed, 88 insertions, 64 deletions
diff --git a/GEMV/dpu/task.c b/GEMV/dpu/task.c
index 0226437..3bf52e8 100644
--- a/GEMV/dpu/task.c
+++ b/GEMV/dpu/task.c
@@ -17,7 +17,8 @@
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// GEMV
-static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
+static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos)
+{
for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) {
bufferC[pos] += bufferA[i] * bufferB[i];
}
@@ -28,13 +29,14 @@ static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
BARRIER_INIT(my_barrier, NR_TASKLETS);
// main
-int main() {
+int main()
+{
unsigned int tasklet_id = me();
#if PRINT
// printf("tasklet_id = %u\n", tasklet_id);
#endif
- if (tasklet_id == 0){ // Initialize once the cycle counter
- mem_reset(); // Reset the heap
+ if (tasklet_id == 0) { // Initialize once the cycle counter
+ mem_reset(); // Reset the heap
}
// Barrier
barrier_wait(&my_barrier);
@@ -44,15 +46,15 @@ int main() {
uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows;
uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows;
- unsigned int element_per_cacheC = 8/sizeof(T);
+ unsigned int element_per_cacheC = 8 / sizeof(T);
unsigned int nrows = nr_rows;
- unsigned int rows_per_tasklet;
+ unsigned int rows_per_tasklet;
unsigned int start_row;
unsigned int chunks = nrows / (NR_TASKLETS * element_per_cacheC);
- unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks;
+ unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks;
rows_per_tasklet = dbl_chunks;
- unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS);
+ unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS);
if ((tasklet_id * element_per_cacheC) < rest_rows)
rows_per_tasklet += element_per_cacheC;
@@ -60,22 +62,32 @@ int main() {
if ((tasklet_id * element_per_cacheC) >= rest_rows) {
// unsigned int hlf_rest_rows = rest_rows >> 1;
if ((rest_rows % element_per_cacheC) != 0)
- start_row = roundup(rest_rows, element_per_cacheC) + tasklet_id * dbl_chunks;
- // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
+ start_row =
+ roundup(rest_rows,
+ element_per_cacheC) +
+ tasklet_id * dbl_chunks;
+ // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
else
- start_row = rest_rows + tasklet_id * dbl_chunks;
- // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
- } else
- start_row = tasklet_id * (dbl_chunks + element_per_cacheC);
- // start_row = tasklet_id * (dbl_chunks + 2);
+ start_row = rest_rows + tasklet_id * dbl_chunks;
+ // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
+ } else
+ start_row =
+ tasklet_id * (dbl_chunks + element_per_cacheC);
+ // start_row = tasklet_id * (dbl_chunks + 2);
} else {
start_row = tasklet_id * (dbl_chunks);
}
// Address of the current row in MRAM
- uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
- uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T));
- uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T));
+ uint32_t mram_base_addr_A =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
+ uint32_t mram_base_addr_B =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER +
+ max_rows * n_size_pad * sizeof(T));
+ uint32_t mram_base_addr_C =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER +
+ max_rows * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T) + start_row * sizeof(T));
uint32_t mram_temp_addr_A = mram_base_addr_A;
uint32_t mram_temp_addr_B = mram_base_addr_B;
@@ -87,55 +99,65 @@ int main() {
int offset = 0;
- #if PRINT
- printf("id: %d, rows_per_tasklet = %d\n",tasklet_id, rows_per_tasklet);
- printf("id: %d, start_row = %d\n",tasklet_id, start_row);
- #endif
+#if PRINT
+ printf("id: %d, rows_per_tasklet = %d\n", tasklet_id, rows_per_tasklet);
+ printf("id: %d, start_row = %d\n", tasklet_id, start_row);
+#endif
// Iterate over nr_rows
// for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) {
- for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += element_per_cacheC) {
+ for (unsigned int i = start_row; i < start_row + rows_per_tasklet;
+ i += element_per_cacheC) {
- mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
+ mram_temp_addr_A =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
// cache_C[0] = 0;
// cache_C[1] = 0;
// clear the cache
- for(unsigned int c = 0; c < element_per_cacheC; c++){
- cache_C[c] = 0;
+ for (unsigned int c = 0; c < element_per_cacheC; c++) {
+ cache_C[c] = 0;
}
// for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){
// for(unsigned int pos = 0; (pos < element_per_cacheC) && ((i + pos) < (start_row + rows_per_tasklet)); pos++){
// for(unsigned int pos = 0; pos < element_per_cacheC && i + pos < nr_rows; pos++){
- for(unsigned int pos = 0; pos < element_per_cacheC; pos++){
- if(i + pos >= nr_rows){
+ for (unsigned int pos = 0; pos < element_per_cacheC; pos++) {
+ if (i + pos >= nr_rows) {
// printf("id: %d, nrows: %d, error\n", tasklet_id, nrows);
break;
- }
+ }
int n = 0, j;
- for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T)))
- {
-
- mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
- mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
- if(offset)
- {
-
- for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++)
- {
+ for (n = 0;
+ n < (int32_t) (n_size - (BLOCK_SIZE / sizeof(T)));
+ n += (BLOCK_SIZE / sizeof(T))) {
+
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A), cache_A,
+ BLOCK_SIZE);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_B), cache_B,
+ BLOCK_SIZE);
+
+ if (offset) {
+
+ for (unsigned int off = 0;
+ off < (BLOCK_SIZE / sizeof(T)) - 1;
+ off++) {
cache_A[off] = cache_A[off + 1];
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A +
+ BLOCK_SIZE), cache_A_aux,
+ 8);
- cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+ cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+ cache_A_aux[0];
}
-
// Compute GEMV
gemv(cache_C, cache_A, cache_B, pos);
@@ -144,53 +166,55 @@ int main() {
mram_temp_addr_B += BLOCK_SIZE;
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
-
+ mram_read((__mram_ptr void const *)(mram_temp_addr_A),
+ cache_A, BLOCK_SIZE);
- if(offset)
- {
- for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++)
- {
+ if (offset) {
+ for (unsigned int off = 0;
+ off < (BLOCK_SIZE / sizeof(T)) - 1;
+ off++) {
cache_A[off] = cache_A[off + 1];
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A + BLOCK_SIZE),
+ cache_A_aux, 8);
- cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+ cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+ cache_A_aux[0];
}
+ mram_read((__mram_ptr void const *)(mram_temp_addr_B),
+ cache_B, BLOCK_SIZE);
- mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
- for (j = 0; j < (int) (n_size - n); j++) {
+ for (j = 0; j < (int)(n_size - n); j++) {
// Compute GEMV
- if(j >= (int)(BLOCK_SIZE / sizeof(T))){
+ if (j >= (int)(BLOCK_SIZE / sizeof(T))) {
printf("error\n");
break;
}
cache_C[pos] += cache_A[j] * cache_B[j];
}
-
- mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T));
+ mram_temp_addr_A +=
+ (BLOCK_SIZE -
+ ((BLOCK_SIZE / sizeof(T)) -
+ (n_size - n)) * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
- if(mram_temp_addr_A % 8 != 0)
- {
+ if (mram_temp_addr_A % 8 != 0) {
offset = 1;
- }
- else
- {
+ } else {
offset = 0;
}
}
// Write cache to current MRAM block
- mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8);
+ mram_write(cache_C, (__mram_ptr void *)(mram_base_addr_C), 8);
// Update memory address
// mram_base_addr_C += 2 * sizeof(T);
- mram_base_addr_C += 8;
+ mram_base_addr_C += 8;
}