1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
/*
* STREAMP implementation of Matrix Profile with multiple tasklets
*
*/
#include <stdint.h>
#include <stdio.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <mram.h>
#include <barrier.h>
#include "common.h"
// Number of DTYPE elements that fit in one BLOCK_SIZE-byte block.
// Parenthesized so the expansion stays a single operand in any expression
// (e.g. `x / DOTPIP` or `DOTPIP * y`).
#define DOTPIP (BLOCK_SIZE / sizeof(DTYPE))
// Kernel arguments. This file reads its kernel, query_length, query_mean,
// query_std and slice_per_dpu fields.
// NOTE(review): presumably filled in by the host before launch (declared
// __host) -- confirm against the host-side code.
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// One result slot per tasklet: main_kernel1 stores each tasklet's minValue
// and minIndex in the slot indexed by its tasklet id.
__host dpu_result_t DPU_RESULTS[NR_TASKLETS];
/*
 * Sliding dot product over one block.
 *
 * Let N = BLOCK_SIZE / sizeof(DTYPE). For every offset j in [0, N) this adds
 * to result[j] the sum over i in [0, N) of W[j + i] * vectorB[i], where W is
 * vectorA (indices 0..N-1) followed by vectorA_aux (indices N..2N-1).
 *
 * result is accumulated into, not overwritten; the caller is responsible for
 * zeroing it before the first call.
 */
static void dot_product(DTYPE *vectorA, DTYPE *vectorA_aux, DTYPE *vectorB, DTYPE * result) {
    const uint32_t elems = BLOCK_SIZE / sizeof(DTYPE);

    for (uint32_t i = 0; i < elems; i++) {
        // Offsets j < elems - i still land inside vectorA; the remaining
        // ones spill over into vectorA_aux.
        const uint32_t in_primary = elems - i;
        uint32_t j = 0;

        for (; j < in_primary; j++) {
            result[j] += vectorA[j + i] * vectorB[i];
        }
        for (; j < elems; j++) {
            result[j] += vectorA_aux[(j + i) - elems] * vectorB[i];
        }
    }
}
// Barrier sized for all NR_TASKLETS tasklets; main_kernel1 waits on it right
// after tasklet 0 resets the heap.
BARRIER_INIT(my_barrier, NR_TASKLETS);
// Forward declaration so the kernel can be listed in the table below.
extern int main_kernel1(void);
// Kernel dispatch table, indexed by DPU_INPUT_ARGUMENTS.kernel in main().
int(*kernels[nr_kernels])(void) = {main_kernel1};
/*
 * Entry point: looks up the kernel selected by DPU_INPUT_ARGUMENTS.kernel in
 * the kernels table, runs it, and returns its result.
 */
int main(void){
    int (*selected_kernel)(void) = kernels[DPU_INPUT_ARGUMENTS.kernel];
    return selected_kernel();
}
/*
 * main_kernel1 - per-tasklet kernel.
 *
 * Each tasklet walks its share of the time-series slice one block at a time,
 * accumulates sliding dot products against the query, turns them into a
 * distance using the per-position mean/sigma values, and stores the smallest
 * distance and its index in DPU_RESULTS[tasklet id].
 *
 * MRAM layout implied by the address arithmetic below, starting at
 * DPU_MRAM_HEAP_POINTER (sizes in DTYPE elements):
 *   query        : query_length
 *   time series  : slice_per_dpu + query_length
 *   means        : slice_per_dpu + query_length
 *   sigmas       : slice_per_dpu + query_length
 *
 * Only whole blocks of the query are processed: a trailing part of the query
 * shorter than one block is not included in the dot product.
 *
 * Returns 0.
 */
int main_kernel1(void) {
    unsigned int tasklet_id = me();
#if PRINT
    printf("tasklet_id = %u\n", tasklet_id);
#endif
    if (tasklet_id == 0) {
        mem_reset(); // Reset the heap
    }
    // All tasklets wait here, so the allocations below happen after the reset
    barrier_wait(&my_barrier);

    // Input arguments
    const uint32_t query_length = DPU_INPUT_ARGUMENTS.query_length;
    const DTYPE query_mean = DPU_INPUT_ARGUMENTS.query_mean;
    const DTYPE query_std = DPU_INPUT_ARGUMENTS.query_std;
    const uint32_t slice_per_dpu = DPU_INPUT_ARGUMENTS.slice_per_dpu;

    // Block geometry, hoisted out of the loops
    const uint32_t elems_per_block = BLOCK_SIZE / sizeof(DTYPE);
    const uint32_t query_blocks = query_length / elems_per_block;

    // Boundaries for current tasklet
    const uint32_t elems_per_tasklet = slice_per_dpu / (NR_TASKLETS);
    const uint32_t myStartElem = tasklet_id * elems_per_tasklet;
    uint32_t myEndElem = myStartElem + elems_per_tasklet - 1;

    // Check time series limit.
    // NOTE(review): the subtraction is unsigned, so it wraps if query_length
    // exceeds slice_per_dpu and the clamp then never triggers.
    if (myEndElem > slice_per_dpu - query_length) {
        myEndElem = slice_per_dpu - query_length;
    }

    // Starting address of the query subsequence in MRAM
    const uint32_t query_base = (uint32_t) DPU_MRAM_HEAP_POINTER;
    uint32_t mem_offset = query_base;
    mem_offset += query_length * sizeof(DTYPE);

    // Starting address of this tasklet's part of the time series slice
    mem_offset += myStartElem * sizeof(DTYPE);
    const uint32_t starting_offset_ts = mem_offset;

    // Starting address of the time series means (same per-tasklet shift)
    mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE);
    uint32_t current_mram_block_addr_TSMean = mem_offset;

    // Starting address of the time series standard deviations
    mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE);
    uint32_t current_mram_block_addr_TSSigma = mem_offset;

    // Local caches that receive the MRAM blocks
    DTYPE *cache_TS = (DTYPE *) mem_alloc(BLOCK_SIZE);
    DTYPE *cache_TS_aux = (DTYPE *) mem_alloc(BLOCK_SIZE);
    DTYPE *cache_query = (DTYPE *) mem_alloc(BLOCK_SIZE);
    DTYPE *cache_TSMean = (DTYPE *) mem_alloc(BLOCK_SIZE);
    DTYPE *cache_TSSigma = (DTYPE *) mem_alloc(BLOCK_SIZE);
    DTYPE *cache_dotprods = (DTYPE *) mem_alloc(BLOCK_SIZE);

    // Result slot of this tasklet
    dpu_result_t *result = &DPU_RESULTS[tasklet_id];

    // Running minimum
    DTYPE min_distance = DTYPE_MAX;
    uint32_t min_index = 0;

    for (uint32_t i = myStartElem; i < myEndElem; i += elems_per_block) {
        // dot_product() accumulates, so start every block from zero
        for (uint32_t d = 0; d < elems_per_block; d++) {
            cache_dotprods[d] = 0;
        }

        uint32_t current_mram_block_addr_TS = starting_offset_ts + (i - myStartElem) * sizeof(DTYPE);
        uint32_t current_mram_block_addr_query = query_base;

        for (uint32_t j = 0; j < query_blocks; j++) {
            mram_read((__mram_ptr void const *) current_mram_block_addr_TS, cache_TS, BLOCK_SIZE);
            // The following block is needed too, because windows starting in
            // cache_TS run over into it. The sum is formed on the integer
            // address before the cast to avoid arithmetic on a void pointer.
            mram_read((__mram_ptr void const *) (current_mram_block_addr_TS + BLOCK_SIZE), cache_TS_aux, BLOCK_SIZE);
            mram_read((__mram_ptr void const *) current_mram_block_addr_query, cache_query, BLOCK_SIZE);
            current_mram_block_addr_TS += BLOCK_SIZE;
            current_mram_block_addr_query += BLOCK_SIZE;
            dot_product(cache_TS, cache_TS_aux, cache_query, cache_dotprods);
        }

        mram_read((__mram_ptr void const *) current_mram_block_addr_TSMean, cache_TSMean, BLOCK_SIZE);
        mram_read((__mram_ptr void const *) current_mram_block_addr_TSSigma, cache_TSSigma, BLOCK_SIZE);
        current_mram_block_addr_TSMean += BLOCK_SIZE;
        current_mram_block_addr_TSSigma += BLOCK_SIZE;

        // NOTE(review): this scans the whole block, so positions i + k at or
        // beyond myEndElem are also considered -- confirm this is intended.
        for (uint32_t k = 0; k < elems_per_block; k++) {
            const DTYPE distance = 2 * ((DTYPE) query_length - (cache_dotprods[k] - (DTYPE) query_length * cache_TSMean[k]
                * query_mean) / (cache_TSSigma[k] * query_std));
            if (distance < min_distance) {
                min_distance = distance;
                min_index = i + k;
            }
        }
    }

    // Save the result
    result->minValue = min_distance;
    result->minIndex = min_index;
    return 0;
}
|