summaryrefslogtreecommitdiff
path: root/TS/dpu/task.c
blob: d7041607ebfab3d7781dc213b5e6838c5585ad3c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/*
 * STREAMP implementation of Matrix Profile with multiple tasklets
 *
 */

#include <stdint.h>
#include <stdio.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <mram.h>
#include <barrier.h>
#include "common.h"

#define DOTPIP BLOCK_SIZE / sizeof(DTYPE)

__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
__host dpu_result_t DPU_RESULTS[NR_TASKLETS];

// Dot product
static void dot_product(DTYPE *vectorA, DTYPE *vectorA_aux, DTYPE *vectorB, DTYPE * result) {

	for(uint32_t i = 0; i <  BLOCK_SIZE / sizeof(DTYPE); i++)
	{
		for(uint32_t j = 0; j < DOTPIP; j++)
		{
			if((j + i) > BLOCK_SIZE / sizeof(DTYPE) - 1)
			{
				result[j] += vectorA_aux[(j + i) - BLOCK_SIZE / sizeof(DTYPE)]  * vectorB[i];
			}
			else
			{
				result[j] += vectorA[j + i] * vectorB[i];
			}
		}
	}
}

BARRIER_INIT(my_barrier, NR_TASKLETS);

extern int main_kernel1(void);

int(*kernels[nr_kernels])(void) = {main_kernel1};

int main(void){
	// Kernel
	return kernels[DPU_INPUT_ARGUMENTS.kernel]();
}

// main_kernel1
int main_kernel1() {
	unsigned int tasklet_id = me();
#if PRINT
	printf("tasklet_id = %u\n", tasklet_id);
#endif
	if(tasklet_id == 0){
		mem_reset(); // Reset the heap
	}
	// Barrier
	barrier_wait(&my_barrier);

	// Input arguments
	uint32_t query_length  = DPU_INPUT_ARGUMENTS.query_length;
	DTYPE query_mean       = DPU_INPUT_ARGUMENTS.query_mean;
	DTYPE query_std        = DPU_INPUT_ARGUMENTS.query_std;
	uint32_t slice_per_dpu = DPU_INPUT_ARGUMENTS.slice_per_dpu;

	// Boundaries for current tasklet
	uint32_t myStartElem = tasklet_id  * (slice_per_dpu / (NR_TASKLETS));
	uint32_t myEndElem   = myStartElem + (slice_per_dpu / (NR_TASKLETS)) - 1;

	// Check time series limit
	if(myEndElem > slice_per_dpu - query_length) myEndElem = slice_per_dpu - query_length;

	// Starting address of the current processing block in MRAM
	uint32_t mem_offset = (uint32_t) DPU_MRAM_HEAP_POINTER;

	// Starting address of the query subsequence
	uint32_t current_mram_block_addr_query = (uint32_t)(mem_offset);
	mem_offset += query_length * sizeof(DTYPE);

	// Starting address of the time series slice
	mem_offset += myStartElem * sizeof(DTYPE);
	uint32_t starting_offset_ts = mem_offset;
	uint32_t current_mram_block_addr_TS = (uint32_t) mem_offset;

	// Starting address of the time series means
	mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE);
	uint32_t current_mram_block_addr_TSMean = (uint32_t)(mem_offset);

	// Starting address of the time series standard deviations
	mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE);
	uint32_t current_mram_block_addr_TSSigma = (uint32_t)(mem_offset);

	// Initialize local caches to store the MRAM blocks
	DTYPE *cache_TS       = (DTYPE *) mem_alloc(BLOCK_SIZE);
	DTYPE *cache_TS_aux   = (DTYPE *) mem_alloc(BLOCK_SIZE);
	DTYPE *cache_query    = (DTYPE *) mem_alloc(BLOCK_SIZE);
	DTYPE *cache_TSMean   = (DTYPE *) mem_alloc(BLOCK_SIZE);
	DTYPE *cache_TSSigma  = (DTYPE *) mem_alloc(BLOCK_SIZE);
	DTYPE *cache_dotprods = (DTYPE *) mem_alloc(BLOCK_SIZE);

	// Create result structure pointer
	dpu_result_t *result = &DPU_RESULTS[tasklet_id];

	// Auxiliary variables
	DTYPE distance;
	DTYPE min_distance = DTYPE_MAX;
	uint32_t min_index = 0;


	for(uint32_t i = myStartElem; i < myEndElem; i+= (BLOCK_SIZE / sizeof(DTYPE)))
	{
		for(uint32_t d = 0; d < DOTPIP; d++)
			cache_dotprods[d] = 0;

		current_mram_block_addr_TS    = (uint32_t) starting_offset_ts + (i - myStartElem) * sizeof(DTYPE);
		current_mram_block_addr_query = (uint32_t) DPU_MRAM_HEAP_POINTER;

		for(uint32_t j = 0; j < (query_length) / (BLOCK_SIZE / sizeof(DTYPE)); j++)
		{
			mram_read((__mram_ptr void const *) current_mram_block_addr_TS, cache_TS, BLOCK_SIZE);
			mram_read((__mram_ptr void const *) current_mram_block_addr_TS + BLOCK_SIZE, cache_TS_aux, BLOCK_SIZE);
			mram_read((__mram_ptr void const *) current_mram_block_addr_query, cache_query, BLOCK_SIZE);

			current_mram_block_addr_TS    += BLOCK_SIZE;
			current_mram_block_addr_query += BLOCK_SIZE;
			dot_product(cache_TS, cache_TS_aux, cache_query, cache_dotprods);
		}


		mram_read((__mram_ptr void const *) current_mram_block_addr_TSMean, cache_TSMean, BLOCK_SIZE);
		mram_read((__mram_ptr void const *) current_mram_block_addr_TSSigma, cache_TSSigma, BLOCK_SIZE);
		current_mram_block_addr_TSMean  += BLOCK_SIZE;
		current_mram_block_addr_TSSigma += BLOCK_SIZE;

		for (uint32_t k = 0; k < (BLOCK_SIZE / sizeof(DTYPE)); k++)
		{
			distance = 2 * ((DTYPE) query_length - (cache_dotprods[k] - (DTYPE) query_length * cache_TSMean[k]
						* query_mean) / (cache_TSSigma[k] * query_std));

			if(distance < min_distance)
			{
				min_distance =  distance;
				min_index    =  i + k;
			}
		}
	}

	// Save the result
	result->minValue = min_distance;
	result->minIndex = min_index;

	return 0;
}