diff options
Diffstat (limited to 'GEMV/dpu/task.c')
-rw-r--r-- | GEMV/dpu/task.c | 152 |
1 files changed, 88 insertions, 64 deletions
diff --git a/GEMV/dpu/task.c b/GEMV/dpu/task.c index 0226437..3bf52e8 100644 --- a/GEMV/dpu/task.c +++ b/GEMV/dpu/task.c @@ -17,7 +17,8 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS; // GEMV -static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) { +static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) +{ for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) { bufferC[pos] += bufferA[i] * bufferB[i]; } @@ -28,13 +29,14 @@ static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) { BARRIER_INIT(my_barrier, NR_TASKLETS); // main -int main() { +int main() +{ unsigned int tasklet_id = me(); #if PRINT // printf("tasklet_id = %u\n", tasklet_id); #endif - if (tasklet_id == 0){ // Initialize once the cycle counter - mem_reset(); // Reset the heap + if (tasklet_id == 0) { // Initialize once the cycle counter + mem_reset(); // Reset the heap } // Barrier barrier_wait(&my_barrier); @@ -44,15 +46,15 @@ int main() { uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows; uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows; - unsigned int element_per_cacheC = 8/sizeof(T); + unsigned int element_per_cacheC = 8 / sizeof(T); unsigned int nrows = nr_rows; - unsigned int rows_per_tasklet; + unsigned int rows_per_tasklet; unsigned int start_row; unsigned int chunks = nrows / (NR_TASKLETS * element_per_cacheC); - unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks; + unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks; rows_per_tasklet = dbl_chunks; - unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS); + unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS); if ((tasklet_id * element_per_cacheC) < rest_rows) rows_per_tasklet += element_per_cacheC; @@ -60,22 +62,32 @@ int main() { if ((tasklet_id * element_per_cacheC) >= rest_rows) { // unsigned int hlf_rest_rows = rest_rows >> 1; if ((rest_rows % element_per_cacheC) != 0) - start_row = roundup(rest_rows, element_per_cacheC) + tasklet_id * dbl_chunks; - // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks; + start_row = + roundup(rest_rows, + element_per_cacheC) + + tasklet_id * dbl_chunks; + // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks; else - start_row = rest_rows + tasklet_id * dbl_chunks; - // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks; - } else - start_row = tasklet_id * (dbl_chunks + element_per_cacheC); - // start_row = tasklet_id * (dbl_chunks + 2); + start_row = rest_rows + tasklet_id * dbl_chunks; + // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks; + } else + start_row = + tasklet_id * (dbl_chunks + element_per_cacheC); + // start_row = tasklet_id * (dbl_chunks + 2); } else { start_row = tasklet_id * (dbl_chunks); } // Address of the current row in MRAM - uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T)); - uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T)); - uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T)); + uint32_t mram_base_addr_A = + (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T)); + uint32_t mram_base_addr_B = + (uint32_t) (DPU_MRAM_HEAP_POINTER + + max_rows * n_size_pad * sizeof(T)); + uint32_t mram_base_addr_C = + (uint32_t) (DPU_MRAM_HEAP_POINTER + + max_rows * n_size_pad * sizeof(T) + + n_size_pad * sizeof(T) + start_row * sizeof(T)); uint32_t mram_temp_addr_A = mram_base_addr_A; uint32_t mram_temp_addr_B = mram_base_addr_B; @@ -87,55 +99,65 @@ int main() { int offset = 0; - #if PRINT - printf("id: %d, rows_per_tasklet = %d\n",tasklet_id, rows_per_tasklet); - printf("id: %d, start_row = %d\n",tasklet_id, start_row); - #endif +#if PRINT + printf("id: %d, rows_per_tasklet = %d\n", tasklet_id, rows_per_tasklet); + printf("id: %d, start_row = %d\n", tasklet_id, start_row); +#endif // Iterate over nr_rows // for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) { - for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += element_per_cacheC) { + for (unsigned int i = start_row; i < start_row + rows_per_tasklet; + i += element_per_cacheC) { - mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T)); + mram_temp_addr_A = + (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T)); mram_temp_addr_B = mram_base_addr_B; // cache_C[0] = 0; // cache_C[1] = 0; // clear the cache - for(unsigned int c = 0; c < element_per_cacheC; c++){ - cache_C[c] = 0; + for (unsigned int c = 0; c < element_per_cacheC; c++) { + cache_C[c] = 0; } // for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){ // for(unsigned int pos = 0; (pos < element_per_cacheC) && ((i + pos) < (start_row + rows_per_tasklet)); pos++){ // for(unsigned int pos = 0; pos < element_per_cacheC && i + pos < nr_rows; pos++){ - for(unsigned int pos = 0; pos < element_per_cacheC; pos++){ - if(i + pos >= nr_rows){ + for (unsigned int pos = 0; pos < element_per_cacheC; pos++) { + if (i + pos >= nr_rows) { // printf("id: %d, nrows: %d, error\n", tasklet_id, nrows); break; - } + } int n = 0, j; - for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T))) - { - - mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE); - mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE); - - if(offset) - { - - for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++) - { + for (n = 0; + n < (int32_t) (n_size - (BLOCK_SIZE / sizeof(T))); + n += (BLOCK_SIZE / sizeof(T))) { + + mram_read((__mram_ptr void const + *)(mram_temp_addr_A), cache_A, + BLOCK_SIZE); + mram_read((__mram_ptr void const + *)(mram_temp_addr_B), cache_B, + BLOCK_SIZE); + + if (offset) { + + for (unsigned int off = 0; + off < (BLOCK_SIZE / sizeof(T)) - 1; + off++) { cache_A[off] = cache_A[off + 1]; } - mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8); + mram_read((__mram_ptr void const + *)(mram_temp_addr_A + + BLOCK_SIZE), cache_A_aux, + 8); - cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0]; + cache_A[BLOCK_SIZE / sizeof(T) - 1] = + cache_A_aux[0]; } - // Compute GEMV gemv(cache_C, cache_A, cache_B, pos); @@ -144,53 +166,55 @@ int main() { mram_temp_addr_B += BLOCK_SIZE; } - mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE); - + mram_read((__mram_ptr void const *)(mram_temp_addr_A), + cache_A, BLOCK_SIZE); - if(offset) - { - for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++) - { + if (offset) { + for (unsigned int off = 0; + off < (BLOCK_SIZE / sizeof(T)) - 1; + off++) { cache_A[off] = cache_A[off + 1]; } - mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8); + mram_read((__mram_ptr void const + *)(mram_temp_addr_A + BLOCK_SIZE), + cache_A_aux, 8); - cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0]; + cache_A[BLOCK_SIZE / sizeof(T) - 1] = + cache_A_aux[0]; } + mram_read((__mram_ptr void const *)(mram_temp_addr_B), + cache_B, BLOCK_SIZE); - mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE); - - for (j = 0; j < (int) (n_size - n); j++) { + for (j = 0; j < (int)(n_size - n); j++) { // Compute GEMV - if(j >= (int)(BLOCK_SIZE / sizeof(T))){ + if (j >= (int)(BLOCK_SIZE / sizeof(T))) { printf("error\n"); break; } cache_C[pos] += cache_A[j] * cache_B[j]; } - - mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T)); + mram_temp_addr_A += + (BLOCK_SIZE - + ((BLOCK_SIZE / sizeof(T)) - + (n_size - n)) * sizeof(T)); mram_temp_addr_B = mram_base_addr_B; - if(mram_temp_addr_A % 8 != 0) - { + if (mram_temp_addr_A % 8 != 0) { offset = 1; - } - else - { + } else { offset = 0; } } // Write cache to current MRAM block - mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8); + mram_write(cache_C, (__mram_ptr void *)(mram_base_addr_C), 8); // Update memory address // mram_base_addr_C += 2 * sizeof(T); - mram_base_addr_C += 8; + mram_base_addr_C += 8; } |