From 3de4b495fb176eba9a0eb517a4ce05903cb67acb Mon Sep 17 00:00:00 2001 From: Juan Gomez Luna Date: Wed, 16 Jun 2021 19:46:05 +0200 Subject: PrIM -- first commit --- NW/dpu/task.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 NW/dpu/task.c (limited to 'NW/dpu') diff --git a/NW/dpu/task.c b/NW/dpu/task.c new file mode 100644 index 0000000..c022f70 --- /dev/null +++ b/NW/dpu/task.c @@ -0,0 +1,185 @@ +/** +* Needleman-Wunsch with multiple tasklets +* +*/ +#include +#include +#include +#include +#include +#include +#include + +#include "../support/common.h" + +__host dpu_arguments_t DPU_INPUT_ARGUMENTS; + +// Barrier +BARRIER_INIT(my_barrier, NR_TASKLETS); + +// main +int main() { + unsigned int tasklet_id = me(); + if (tasklet_id == 0){ // Initialize once the cycle counter + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + uint32_t nblocks = DPU_INPUT_ARGUMENTS.nblocks; + uint32_t active_blocks = DPU_INPUT_ARGUMENTS.active_blocks; + uint32_t penalty = DPU_INPUT_ARGUMENTS.penalty; +#if PRINT + printf("tasklet_id = %d, nblocks = %d \n", tasklet_id, nblocks); +#endif + + uint32_t mram_base_addr_input_itemsets = (uint32_t) (DPU_MRAM_HEAP_POINTER); + uint32_t mram_base_addr_ref = (uint32_t) (DPU_MRAM_HEAP_POINTER + nblocks * (BL+1) * (BL+2) * sizeof(int32_t)); + if (nblocks != active_blocks) + mram_base_addr_ref = (uint32_t) (DPU_MRAM_HEAP_POINTER + active_blocks * (BL+1) * (BL+2) * sizeof(int32_t)); + + int32_t *cache_input = mem_alloc((BL_IN+1) * (BL_IN+2) * sizeof(int32_t)); + int32_t *cache_ref = mem_alloc(BL_IN * BL_IN * sizeof(int32_t)); + uint32_t REP = BL/BL_IN; + uint32_t chunks; + uint32_t mod; + uint32_t start; + uint32_t addr_input; + uint32_t addr_ref; + uint32_t cache_input_offset; + + for (uint32_t bl = 0; bl < nblocks; bl++) { + + // Top-left computation + for(uint32_t blk = 0; blk <= REP; blk++) { + + // Partition chunks/subblocks of the diagonal to tasklets + chunks = blk / NR_TASKLETS; + mod = blk % NR_TASKLETS; + if (tasklet_id < mod) + chunks++; + if (mod > 0) { + if(tasklet_id < mod) + start = tasklet_id * chunks; + else + start = mod * (chunks + 1) + (tasklet_id - mod) * chunks; + } else + start = tasklet_id * chunks; + + // Compute all assigned chunks + for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) { + int t_index_x = start + bl_indx; + int t_index_y = blk - 1 - t_index_x; + + // Move input from MRAM to WRAM + addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = (BL_IN+2); + mram_read((__mram_ptr void const *) addr_input, (void *) cache_input, (BL_IN+2) * sizeof(int32_t)); + addr_input += ((BL+2) * sizeof(int32_t)); + for (int i = 1; i < BL_IN + 1; i++) { + mram_read((__mram_ptr void const *) addr_input, (void *) (cache_input + cache_input_offset), (2) * sizeof(int32_t)); + cache_input_offset += (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + } + + addr_ref = mram_base_addr_ref + (t_index_x * BL * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = 0; + for (int i = 0; i < BL_IN; i++) { + mram_read((__mram_ptr void const *) addr_ref, (void *) (cache_ref + cache_input_offset), (BL_IN) * sizeof(int32_t)); + cache_input_offset += BL_IN; + addr_ref += (BL * sizeof(int32_t)); + } + + // Computation + for (uint32_t i = 1; i < BL_IN + 1; i++) { + for (uint32_t j = 1; j < BL_IN + 1; j++) { + cache_input[i*(BL_IN+2) + j] = maximum(cache_input[(i-1)*(BL_IN+2) + j - 1] + cache_ref[(i-1)*BL_IN + j-1], + cache_input[i*(BL_IN+2) + j - 1] - penalty, + cache_input[(i-1)*(BL_IN+2) + j] - penalty); + } + } + + // Move output from WRAM to MRAM + addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + for (int i = 1; i < BL_IN + 1; i++) { + mram_write((cache_input + cache_input_offset), (__mram_ptr void *) addr_input, (BL_IN+2) * sizeof(int32_t)); + cache_input_offset += (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + } + + } + + barrier_wait(&my_barrier); + } + + // Bottom-right computation + for(uint32_t blk = 2; blk <= REP; blk++) { + // Partition chunks/subblocks of the diagonal to tasklets + chunks = (REP - blk + 1) / NR_TASKLETS; + mod = (REP - blk + 1) % NR_TASKLETS; + if (tasklet_id < mod) + chunks++; + if (mod > 0){ + if(tasklet_id < mod) + start = tasklet_id * chunks; + else + start = mod * (chunks + 1) + (tasklet_id - mod) * chunks; + } else + start = tasklet_id * chunks; + + // Compute all assigned chunks + for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) { + int t_index_x = blk - 1 + start + bl_indx; + int t_index_y = REP + blk - 2 - t_index_x; + + // Move input from MRAM to WRAM + addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = (BL_IN+2); + mram_read((__mram_ptr void const *) addr_input, (void *) cache_input, (BL_IN+2) * sizeof(int32_t)); + addr_input += ((BL+2) * sizeof(int32_t)); + for (int i = 1; i < BL_IN + 1; i++) { + mram_read((__mram_ptr void const *) addr_input, (void *) (cache_input + cache_input_offset), (2) * sizeof(int32_t)); + cache_input_offset += (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + } + + addr_ref = mram_base_addr_ref + (t_index_x * BL * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = 0; + for (int i = 0; i < BL_IN; i++) { + mram_read((__mram_ptr void const *) addr_ref, (void *) (cache_ref + cache_input_offset), (BL_IN) * sizeof(int32_t)); + cache_input_offset += BL_IN; + addr_ref += (BL * sizeof(int32_t)); + } + + + // Computation + for (int i = 1; i < BL_IN + 1; i++) { + for (int j = 1; j < BL_IN + 1; j++) { + cache_input[i*(BL_IN+2) + j] = maximum(cache_input[(i-1)*(BL_IN+2) + j - 1] + cache_ref[(i-1)*BL_IN + j-1], + cache_input[i*(BL_IN+2) + j - 1] - penalty, + cache_input[(i-1)*(BL_IN+2) + j] - penalty); + } + } + + // Move output from WRAM to MRAM + addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + for (int i = 1; i < BL_IN + 1; i++) { + mram_write(cache_input + cache_input_offset, (__mram_ptr void *) addr_input, (BL_IN+2) * sizeof(int32_t)); + cache_input_offset += (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + } + + } + + barrier_wait(&my_barrier); + + } + + mram_base_addr_input_itemsets += ((BL+1) * (BL+2) * sizeof(int32_t)); + mram_base_addr_ref += (BL * BL * sizeof(int32_t)); + } + return 0; +} -- cgit v1.2.3