1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
|
/**
* Needleman-Wunsch with multiple tasklets
*
*/
#include <stdint.h>
#include <stdio.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <perfcounter.h>
#include <barrier.h>
#include "../support/common.h"
// Kernel arguments: main() reads the nblocks, active_blocks and penalty fields.
// NOTE(review): presumably filled in by the host before launch (__host) - confirm.
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// Barrier set up for NR_TASKLETS participants; main() waits on it after the
// heap reset and after every diagonal of sub-blocks.
BARRIER_INIT(my_barrier, NR_TASKLETS);
/**
 * Splits the sub-blocks of one diagonal among the tasklets.
 *
 * The first (n_subblocks % NR_TASKLETS) tasklets each receive one extra
 * sub-block; every tasklet gets a contiguous range.
 *
 * @param n_subblocks number of sub-blocks on the diagonal
 * @param tasklet_id  id of the calling tasklet
 * @param start       out: position within the diagonal of the first sub-block
 *                    assigned to tasklet_id
 * @return number of consecutive sub-blocks assigned to tasklet_id
 */
static uint32_t partition_diagonal(uint32_t n_subblocks, uint32_t tasklet_id, uint32_t *start) {
    uint32_t chunks = n_subblocks / NR_TASKLETS;
    uint32_t mod = n_subblocks % NR_TASKLETS;

    if (tasklet_id < mod) {
        chunks++;
        *start = tasklet_id * chunks;
    } else {
        // Skip the mod tasklets that own (chunks + 1) sub-blocks each.
        // With mod == 0 this reduces to tasklet_id * chunks.
        *start = mod * (chunks + 1) + (tasklet_id - mod) * chunks;
    }
    return chunks;
}

/**
 * Processes one BL_IN x BL_IN sub-block of the current block: loads its
 * boundary cells and reference values from MRAM into the WRAM buffers, fills
 * the interior cells, and writes the computed rows back to MRAM.
 *
 * @param mram_input_base MRAM address of the current block's score matrix
 *                        (rows of BL+2 int32_t cells)
 * @param mram_ref_base   MRAM address of the current block's reference matrix
 *                        (rows of BL int32_t cells)
 * @param t_index_x       sub-block row index
 * @param t_index_y       sub-block column index
 * @param penalty         value subtracted from the left and upper neighbours
 * @param cache_input     WRAM buffer of (BL_IN+1) x (BL_IN+2) int32_t cells
 * @param cache_ref       WRAM buffer of BL_IN x BL_IN int32_t cells
 */
static void process_subblock(uint32_t mram_input_base, uint32_t mram_ref_base,
                             int t_index_x, int t_index_y, uint32_t penalty,
                             int32_t *cache_input, int32_t *cache_ref) {
    // MRAM address of the first cell loaded for this sub-block (goes to cache_input[0])
    const uint32_t subblock_addr = mram_input_base + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
    uint32_t addr_input;
    uint32_t addr_ref;
    uint32_t cache_offset;

    // Move input from MRAM to WRAM: the complete first row ...
    addr_input = subblock_addr;
    mram_read((__mram_ptr void const *) addr_input, (void *) cache_input, (BL_IN+2) * sizeof(int32_t));
    // ... then two cells at the start of each following row. Column 0 is only
    // read by the computation below; column 1 is overwritten by it.
    addr_input += ((BL+2) * sizeof(int32_t));
    cache_offset = (BL_IN+2);
    for (int i = 1; i < BL_IN + 1; i++) {
        mram_read((__mram_ptr void const *) addr_input, (void *) (cache_input + cache_offset), (2) * sizeof(int32_t));
        cache_offset += (BL_IN+2);
        addr_input += ((BL+2) * sizeof(int32_t));
    }

    // Move the BL_IN x BL_IN reference values of this sub-block from MRAM to WRAM
    addr_ref = mram_ref_base + (t_index_x * BL * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
    cache_offset = 0;
    for (int i = 0; i < BL_IN; i++) {
        mram_read((__mram_ptr void const *) addr_ref, (void *) (cache_ref + cache_offset), (BL_IN) * sizeof(int32_t));
        cache_offset += BL_IN;
        addr_ref += (BL * sizeof(int32_t));
    }

    // Computation: each cell is the maximum of (upper-left + reference),
    // (left - penalty) and (upper - penalty).
    // NOTE(review): penalty is unsigned, so both subtractions are evaluated in
    // unsigned arithmetic before being passed to maximum() - confirm that
    // maximum() takes signed arguments.
    for (int i = 1; i < BL_IN + 1; i++) {
        for (int j = 1; j < BL_IN + 1; j++) {
            cache_input[i*(BL_IN+2) + j] = maximum(cache_input[(i-1)*(BL_IN+2) + j - 1] + cache_ref[(i-1)*BL_IN + j-1],
                                                    cache_input[i*(BL_IN+2) + j - 1] - penalty,
                                                    cache_input[(i-1)*(BL_IN+2) + j] - penalty);
        }
    }

    // Move output from WRAM to MRAM: rows 1..BL_IN, BL_IN+2 cells each (row 0 is not written).
    // NOTE(review): column BL_IN+1 of these rows is neither loaded nor computed
    // here, yet it is written back; this appears to rely on that MRAM cell being
    // recomputed later or being padding - confirm.
    addr_input = subblock_addr + ((BL+2) * sizeof(int32_t));
    cache_offset = (BL_IN+2);
    for (int i = 1; i < BL_IN + 1; i++) {
        mram_write((cache_input + cache_offset), (__mram_ptr void *) addr_input, (BL_IN+2) * sizeof(int32_t));
        cache_offset += (BL_IN+2);
        addr_input += ((BL+2) * sizeof(int32_t));
    }
}

/**
 * Tasklet entry point.
 *
 * For each of the nblocks blocks, the BL_IN-sized sub-blocks are processed one
 * anti-diagonal at a time: first the diagonals holding 1..REP sub-blocks
 * (top-left part), then those holding REP-1..1 sub-blocks (bottom-right part).
 * The sub-blocks of a diagonal are split among the tasklets, and all tasklets
 * wait on the barrier before moving to the next diagonal.
 *
 * @return always 0
 */
int main() {
    unsigned int tasklet_id = me();
    if (tasklet_id == 0) {
        mem_reset(); // Reset the heap (done once, by tasklet 0)
    }
    // Every tasklet waits until the heap has been reset before allocating
    barrier_wait(&my_barrier);

    uint32_t nblocks = DPU_INPUT_ARGUMENTS.nblocks;
    uint32_t active_blocks = DPU_INPUT_ARGUMENTS.active_blocks;
    uint32_t penalty = DPU_INPUT_ARGUMENTS.penalty;

#if PRINT
    printf("tasklet_id = %u, nblocks = %u \n", tasklet_id, (unsigned int) nblocks);
#endif

    // Score matrices start at the MRAM heap base; the reference matrices follow
    // the space of active_blocks score matrices of (BL+1) x (BL+2) cells each.
    uint32_t mram_base_addr_input_itemsets = (uint32_t) (DPU_MRAM_HEAP_POINTER);
    uint32_t mram_base_addr_ref = (uint32_t) (DPU_MRAM_HEAP_POINTER + active_blocks * (BL+1) * (BL+2) * sizeof(int32_t));

    // Per-tasklet working buffers
    int32_t *cache_input = mem_alloc((BL_IN+1) * (BL_IN+2) * sizeof(int32_t));
    int32_t *cache_ref = mem_alloc(BL_IN * BL_IN * sizeof(int32_t));

    // Number of sub-blocks along one side of a block
    const uint32_t REP = BL/BL_IN;

    for (uint32_t bl = 0; bl < nblocks; bl++) {
        // Top-left computation: diagonal blk holds blk sub-blocks
        for (uint32_t blk = 1; blk <= REP; blk++) {
            uint32_t start;
            uint32_t chunks = partition_diagonal(blk, tasklet_id, &start);

            // Compute all assigned chunks
            for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) {
                int t_index_x = start + bl_indx;
                int t_index_y = blk - 1 - t_index_x;
                process_subblock(mram_base_addr_input_itemsets, mram_base_addr_ref,
                                 t_index_x, t_index_y, penalty, cache_input, cache_ref);
            }
            barrier_wait(&my_barrier);
        }

        // Bottom-right computation: diagonal blk holds (REP - blk + 1) sub-blocks
        for (uint32_t blk = 2; blk <= REP; blk++) {
            uint32_t start;
            uint32_t chunks = partition_diagonal(REP - blk + 1, tasklet_id, &start);

            // Compute all assigned chunks
            for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) {
                int t_index_x = blk - 1 + start + bl_indx;
                int t_index_y = REP + blk - 2 - t_index_x;
                process_subblock(mram_base_addr_input_itemsets, mram_base_addr_ref,
                                 t_index_x, t_index_y, penalty, cache_input, cache_ref);
            }
            barrier_wait(&my_barrier);
        }

        // Advance to the next block's score and reference matrices
        mram_base_addr_input_itemsets += ((BL+1) * (BL+2) * sizeof(int32_t));
        mram_base_addr_ref += (BL * BL * sizeof(int32_t));
    }

    return 0;
}
|