diff options
author | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
---|---|---|
committer | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
commit | 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch) | |
tree | fc6776a94549d2d4039898f183dbbeb2ce013ba9 /TS/dpu | |
parent | ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff) |
PrIM -- first commit
Diffstat (limited to 'TS/dpu')
-rw-r--r-- | TS/dpu/task.c | 155 |
1 files changed, 155 insertions, 0 deletions
diff --git a/TS/dpu/task.c b/TS/dpu/task.c new file mode 100644 index 0000000..d704160 --- /dev/null +++ b/TS/dpu/task.c @@ -0,0 +1,155 @@ +/* + * STREAMP implementation of Matrix Profile with multiple tasklets + * + */ + +#include <stdint.h> +#include <stdio.h> +#include <defs.h> +#include <mram.h> +#include <alloc.h> +#include <mram.h> +#include <barrier.h> +#include "common.h" + +#define DOTPIP BLOCK_SIZE / sizeof(DTYPE) + +__host dpu_arguments_t DPU_INPUT_ARGUMENTS; +__host dpu_result_t DPU_RESULTS[NR_TASKLETS]; + +// Dot product +static void dot_product(DTYPE *vectorA, DTYPE *vectorA_aux, DTYPE *vectorB, DTYPE * result) { + + for(uint32_t i = 0; i < BLOCK_SIZE / sizeof(DTYPE); i++) + { + for(uint32_t j = 0; j < DOTPIP; j++) + { + if((j + i) > BLOCK_SIZE / sizeof(DTYPE) - 1) + { + result[j] += vectorA_aux[(j + i) - BLOCK_SIZE / sizeof(DTYPE)] * vectorB[i]; + } + else + { + result[j] += vectorA[j + i] * vectorB[i]; + } + } + } +} + +BARRIER_INIT(my_barrier, NR_TASKLETS); + +extern int main_kernel1(void); + +int(*kernels[nr_kernels])(void) = {main_kernel1}; + +int main(void){ + // Kernel + return kernels[DPU_INPUT_ARGUMENTS.kernel](); +} + +// main_kernel1 +int main_kernel1() { + unsigned int tasklet_id = me(); +#if PRINT + printf("tasklet_id = %u\n", tasklet_id); +#endif + if(tasklet_id == 0){ + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + + // Input arguments + uint32_t query_length = DPU_INPUT_ARGUMENTS.query_length; + DTYPE query_mean = DPU_INPUT_ARGUMENTS.query_mean; + DTYPE query_std = DPU_INPUT_ARGUMENTS.query_std; + uint32_t slice_per_dpu = DPU_INPUT_ARGUMENTS.slice_per_dpu; + + // Boundaries for current tasklet + uint32_t myStartElem = tasklet_id * (slice_per_dpu / (NR_TASKLETS)); + uint32_t myEndElem = myStartElem + (slice_per_dpu / (NR_TASKLETS)) - 1; + + // Check time series limit + if(myEndElem > slice_per_dpu - query_length) myEndElem = slice_per_dpu - query_length; + + // Starting address of the current processing block in MRAM + uint32_t mem_offset = (uint32_t) DPU_MRAM_HEAP_POINTER; + + // Starting address of the query subsequence + uint32_t current_mram_block_addr_query = (uint32_t)(mem_offset); + mem_offset += query_length * sizeof(DTYPE); + + // Starting address of the time series slice + mem_offset += myStartElem * sizeof(DTYPE); + uint32_t starting_offset_ts = mem_offset; + uint32_t current_mram_block_addr_TS = (uint32_t) mem_offset; + + // Starting address of the time series means + mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE); + uint32_t current_mram_block_addr_TSMean = (uint32_t)(mem_offset); + + // Starting address of the time series standard deviations + mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE); + uint32_t current_mram_block_addr_TSSigma = (uint32_t)(mem_offset); + + // Initialize local caches to store the MRAM blocks + DTYPE *cache_TS = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_TS_aux = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_query = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_TSMean = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_TSSigma = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_dotprods = (DTYPE *) mem_alloc(BLOCK_SIZE); + + // Create result structure pointer + dpu_result_t *result = &DPU_RESULTS[tasklet_id]; + + // Auxiliary variables + DTYPE distance; + DTYPE min_distance = DTYPE_MAX; + uint32_t min_index = 0; + + + for(uint32_t i = myStartElem; i < myEndElem; i+= (BLOCK_SIZE / sizeof(DTYPE))) + { + for(uint32_t d = 0; d < DOTPIP; d++) + cache_dotprods[d] = 0; + + current_mram_block_addr_TS = (uint32_t) starting_offset_ts + (i - myStartElem) * sizeof(DTYPE); + current_mram_block_addr_query = (uint32_t) DPU_MRAM_HEAP_POINTER; + + for(uint32_t j = 0; j < (query_length) / (BLOCK_SIZE / sizeof(DTYPE)); j++) + { + mram_read((__mram_ptr void const *) current_mram_block_addr_TS, cache_TS, BLOCK_SIZE); + mram_read((__mram_ptr void const *) current_mram_block_addr_TS + BLOCK_SIZE, cache_TS_aux, BLOCK_SIZE); + mram_read((__mram_ptr void const *) current_mram_block_addr_query, cache_query, BLOCK_SIZE); + + current_mram_block_addr_TS += BLOCK_SIZE; + current_mram_block_addr_query += BLOCK_SIZE; + dot_product(cache_TS, cache_TS_aux, cache_query, cache_dotprods); + } + + + mram_read((__mram_ptr void const *) current_mram_block_addr_TSMean, cache_TSMean, BLOCK_SIZE); + mram_read((__mram_ptr void const *) current_mram_block_addr_TSSigma, cache_TSSigma, BLOCK_SIZE); + current_mram_block_addr_TSMean += BLOCK_SIZE; + current_mram_block_addr_TSSigma += BLOCK_SIZE; + + for (uint32_t k = 0; k < (BLOCK_SIZE / sizeof(DTYPE)); k++) + { + distance = 2 * ((DTYPE) query_length - (cache_dotprods[k] - (DTYPE) query_length * cache_TSMean[k] + * query_mean) / (cache_TSSigma[k] * query_std)); + + if(distance < min_distance) + { + min_distance = distance; + min_index = i + k; + } + } + } + + // Save the result + result->minValue = min_distance; + result->minIndex = min_index; + + return 0; +} |