summaryrefslogtreecommitdiff
path: root/NW/dpu
diff options
context:
space:
mode:
author Juan Gomez Luna <juan.gomez@safari.ethz.ch> 2021-06-16 19:46:05 +0200
committer Juan Gomez Luna <juan.gomez@safari.ethz.ch> 2021-06-16 19:46:05 +0200
commit 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
tree fc6776a94549d2d4039898f183dbbeb2ce013ba9 /NW/dpu
parent ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)
PrIM -- first commit
Diffstat (limited to 'NW/dpu')
-rw-r--r-- NW/dpu/task.c 185
1 files changed, 185 insertions, 0 deletions
diff --git a/NW/dpu/task.c b/NW/dpu/task.c
new file mode 100644
index 0000000..c022f70
--- /dev/null
+++ b/NW/dpu/task.c
@@ -0,0 +1,185 @@
+/**
+* Needleman-Wunsch with multiple tasklets
+*
+*/
+#include <stdint.h>
+#include <stdio.h>
+#include <defs.h>
+#include <mram.h>
+#include <alloc.h>
+#include <perfcounter.h>
+#include <barrier.h>
+
+#include "../support/common.h"
+
+__host dpu_arguments_t DPU_INPUT_ARGUMENTS; // main() reads nblocks, active_blocks and penalty from this struct
+// NOTE(review): __host presumably exposes the variable above to the host program - confirm against the DPU SDK
+// Barrier that main() waits on with barrier_wait(); initialised with NR_TASKLETS
+BARRIER_INIT(my_barrier, NR_TASKLETS);
+
+// main: every tasklet runs this; it fills its share of BL_IN x BL_IN sub-blocks one anti-diagonal at a time and returns 0
+int main() {
+ unsigned int tasklet_id = me();
+ if (tasklet_id == 0){ // Only tasklet 0 performs the heap reset
+ mem_reset(); // Reset the heap
+ }
+ // Barrier: placed between the heap reset above and the mem_alloc() calls below
+ barrier_wait(&my_barrier);
+ uint32_t nblocks = DPU_INPUT_ARGUMENTS.nblocks; // trip count of the outer block loop
+ uint32_t active_blocks = DPU_INPUT_ARGUMENTS.active_blocks; // only used to locate the reference data in MRAM
+ uint32_t penalty = DPU_INPUT_ARGUMENTS.penalty; // NOTE(review): unsigned, so "x - penalty" below is likely unsigned arithmetic; maximum() is defined elsewhere - confirm it takes signed ints
+#if PRINT
+ printf("tasklet_id = %d, nblocks = %d \n", tasklet_id, nblocks);
+#endif
+ // MRAM layout: input_itemsets blocks start at the heap base, each (BL+1)*(BL+2) int32_t; the two assignments below always place the reference data after active_blocks such blocks
+ uint32_t mram_base_addr_input_itemsets = (uint32_t) (DPU_MRAM_HEAP_POINTER);
+ uint32_t mram_base_addr_ref = (uint32_t) (DPU_MRAM_HEAP_POINTER + nblocks * (BL+1) * (BL+2) * sizeof(int32_t));
+ if (nblocks != active_blocks)
+ mram_base_addr_ref = (uint32_t) (DPU_MRAM_HEAP_POINTER + active_blocks * (BL+1) * (BL+2) * sizeof(int32_t));
+ // Buffers allocated by every tasklet: a (BL_IN+1) x (BL_IN+2) input tile and a BL_IN x BL_IN reference tile
+ int32_t *cache_input = mem_alloc((BL_IN+1) * (BL_IN+2) * sizeof(int32_t));
+ int32_t *cache_ref = mem_alloc(BL_IN * BL_IN * sizeof(int32_t));
+ uint32_t REP = BL/BL_IN; // sub-blocks per block side (integer division)
+ uint32_t chunks; // number of sub-blocks this tasklet handles on the current diagonal
+ uint32_t mod; // remainder of the diagonal length divided by NR_TASKLETS
+ uint32_t start; // position along the diagonal of this tasklet's first sub-block
+ uint32_t addr_input; // MRAM address inside the current input_itemsets block
+ uint32_t addr_ref; // MRAM address inside the current reference block
+ uint32_t cache_input_offset; // element offset into cache_input (reused for cache_ref as well)
+ // One iteration per block; both MRAM base addresses are advanced at the end of the body
+ for (uint32_t bl = 0; bl < nblocks; bl++) {
+
+ // Top-left computation: diagonal blk holds blk sub-blocks (blk = 0 assigns no work, only the barrier is reached)
+ for(uint32_t blk = 0; blk <= REP; blk++) {
+
+ // Partition chunks/subblocks of the diagonal to tasklets: the first mod tasklets take one extra
+ chunks = blk / NR_TASKLETS;
+ mod = blk % NR_TASKLETS;
+ if (tasklet_id < mod)
+ chunks++;
+ if (mod > 0) {
+ if(tasklet_id < mod)
+ start = tasklet_id * chunks;
+ else
+ start = mod * (chunks + 1) + (tasklet_id - mod) * chunks;
+ } else
+ start = tasklet_id * chunks;
+
+ // Compute all assigned chunks
+ for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) {
+ int t_index_x = start + bl_indx; // sub-block row index (scaled by the row stride below)
+ int t_index_y = blk - 1 - t_index_x; // sub-block column index; x + y == blk - 1
+
+ // Move input from MRAM to WRAM: the full top row, then only the first 2 values of each of the next BL_IN rows
+ addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+ cache_input_offset = (BL_IN+2);
+ mram_read((__mram_ptr void const *) addr_input, (void *) cache_input, (BL_IN+2) * sizeof(int32_t));
+ addr_input += ((BL+2) * sizeof(int32_t));
+ for (int i = 1; i < BL_IN + 1; i++) {
+ mram_read((__mram_ptr void const *) addr_input, (void *) (cache_input + cache_input_offset), (2) * sizeof(int32_t));
+ cache_input_offset += (BL_IN+2);
+ addr_input += ((BL+2) * sizeof(int32_t));
+ }
+ // Load the BL_IN x BL_IN reference tile row by row; consecutive rows are BL elements apart in MRAM
+ addr_ref = mram_base_addr_ref + (t_index_x * BL * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+ cache_input_offset = 0;
+ for (int i = 0; i < BL_IN; i++) {
+ mram_read((__mram_ptr void const *) addr_ref, (void *) (cache_ref + cache_input_offset), (BL_IN) * sizeof(int32_t));
+ cache_input_offset += BL_IN;
+ addr_ref += (BL * sizeof(int32_t));
+ }
+
+ // Computation: cell = maximum(upper-left + ref, left - penalty, up - penalty)
+ for (uint32_t i = 1; i < BL_IN + 1; i++) {
+ for (uint32_t j = 1; j < BL_IN + 1; j++) {
+ cache_input[i*(BL_IN+2) + j] = maximum(cache_input[(i-1)*(BL_IN+2) + j - 1] + cache_ref[(i-1)*BL_IN + j-1],
+ cache_input[i*(BL_IN+2) + j - 1] - penalty,
+ cache_input[(i-1)*(BL_IN+2) + j] - penalty);
+ }
+ }
+
+ // Move output from WRAM to MRAM: rows 1..BL_IN, BL_IN+2 values each; row 0 is not written back
+ addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+ cache_input_offset = (BL_IN+2);
+ addr_input += ((BL+2) * sizeof(int32_t));
+ for (int i = 1; i < BL_IN + 1; i++) {
+ mram_write((cache_input + cache_input_offset), (__mram_ptr void *) addr_input, (BL_IN+2) * sizeof(int32_t));
+ cache_input_offset += (BL_IN+2);
+ addr_input += ((BL+2) * sizeof(int32_t));
+ }
+
+ }
+ // Barrier wait once per diagonal, after this tasklet's sub-blocks have been written back
+ barrier_wait(&my_barrier);
+ }
+
+ // Bottom-right computation: diagonal blk holds REP - blk + 1 sub-blocks
+ for(uint32_t blk = 2; blk <= REP; blk++) {
+ // Partition chunks/subblocks of the diagonal to tasklets: the first mod tasklets take one extra
+ chunks = (REP - blk + 1) / NR_TASKLETS;
+ mod = (REP - blk + 1) % NR_TASKLETS;
+ if (tasklet_id < mod)
+ chunks++;
+ if (mod > 0){
+ if(tasklet_id < mod)
+ start = tasklet_id * chunks;
+ else
+ start = mod * (chunks + 1) + (tasklet_id - mod) * chunks;
+ } else
+ start = tasklet_id * chunks;
+
+ // Compute all assigned chunks
+ for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) {
+ int t_index_x = blk - 1 + start + bl_indx; // sub-block row index, starting at blk - 1 on this diagonal
+ int t_index_y = REP + blk - 2 - t_index_x; // sub-block column index; x + y == REP + blk - 2
+
+ // Move input from MRAM to WRAM: the full top row, then only the first 2 values of each of the next BL_IN rows
+ addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+ cache_input_offset = (BL_IN+2);
+ mram_read((__mram_ptr void const *) addr_input, (void *) cache_input, (BL_IN+2) * sizeof(int32_t));
+ addr_input += ((BL+2) * sizeof(int32_t));
+ for (int i = 1; i < BL_IN + 1; i++) {
+ mram_read((__mram_ptr void const *) addr_input, (void *) (cache_input + cache_input_offset), (2) * sizeof(int32_t));
+ cache_input_offset += (BL_IN+2);
+ addr_input += ((BL+2) * sizeof(int32_t));
+ }
+ // Load the BL_IN x BL_IN reference tile row by row; consecutive rows are BL elements apart in MRAM
+ addr_ref = mram_base_addr_ref + (t_index_x * BL * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+ cache_input_offset = 0;
+ for (int i = 0; i < BL_IN; i++) {
+ mram_read((__mram_ptr void const *) addr_ref, (void *) (cache_ref + cache_input_offset), (BL_IN) * sizeof(int32_t));
+ cache_input_offset += BL_IN;
+ addr_ref += (BL * sizeof(int32_t));
+ }
+
+
+ // Computation: cell = maximum(upper-left + ref, left - penalty, up - penalty)
+ for (int i = 1; i < BL_IN + 1; i++) {
+ for (int j = 1; j < BL_IN + 1; j++) {
+ cache_input[i*(BL_IN+2) + j] = maximum(cache_input[(i-1)*(BL_IN+2) + j - 1] + cache_ref[(i-1)*BL_IN + j-1],
+ cache_input[i*(BL_IN+2) + j - 1] - penalty,
+ cache_input[(i-1)*(BL_IN+2) + j] - penalty);
+ }
+ }
+
+ // Move output from WRAM to MRAM: rows 1..BL_IN, BL_IN+2 values each; row 0 is not written back
+ addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+ cache_input_offset = (BL_IN+2);
+ addr_input += ((BL+2) * sizeof(int32_t));
+ for (int i = 1; i < BL_IN + 1; i++) {
+ mram_write(cache_input + cache_input_offset, (__mram_ptr void *) addr_input, (BL_IN+2) * sizeof(int32_t));
+ cache_input_offset += (BL_IN+2);
+ addr_input += ((BL+2) * sizeof(int32_t));
+ }
+
+ }
+ // Barrier wait once per diagonal, after this tasklet's sub-blocks have been written back
+ barrier_wait(&my_barrier);
+
+ }
+ // Advance both MRAM bases to the next block: (BL+1)*(BL+2) input values and BL*BL reference values
+ mram_base_addr_input_itemsets += ((BL+1) * (BL+2) * sizeof(int32_t));
+ mram_base_addr_ref += (BL * BL * sizeof(int32_t));
+ }
+ return 0;
+}