/* Source: root/HST-L/dpu/task.c (blob 356b2f9a706c8f0308649a0a9bcc288f2c752ede) */
/*
* Histogram (HST-L) with multiple tasklets
*
*/
#include <stdint.h>
#include <stdio.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <perfcounter.h>
#include <barrier.h>
#include <atomic_bit.h>
#include <mutex.h>

#include "../support/common.h"

// Run parameters. main() reads .kernel; main_kernel1() reads .size,
// .transfer_size and .bins. Declared __host (presumably written by the host
// program before launch — confirm against the host code).
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;

// Per-tasklet pointer slots used to share histogram buffers: in main_kernel1
// every tasklet with id < NR_HISTO stores the histogram it allocated in its
// own slot, and the other tasklets pick the pointer up from there.
uint32_t* message[NR_TASKLETS];
// DPU histogram
// NOTE(review): no code visible in this file assigns or reads this global.
uint32_t* histo_dpu;

// Barrier that every tasklet waits on (main_kernel1 calls barrier_wait on it
// from all tasklets)
BARRIER_INIT(my_barrier, NR_TASKLETS);
// One atomic bit and one barrier per histogram. The barrier fields are filled
// in by tasklet 0 at the start of main_kernel1, with the participant count set
// to NR_TASKLETS / NR_HISTO.
ATOMIC_BIT_INIT(barriers_mutexes)[NR_HISTO];
barrier_t barriers[NR_HISTO];

// One mutex id per histogram; histogram() locks my_mutex[histo_id] around each
// bin increment.
// NOTE(review): no initialisation of these ids is visible here — confirm they
// are meant to keep their default (zero) value.
mutex_id_t my_mutex[NR_HISTO];

/*
 * Accumulate l_size input elements into the histogram `histo`.
 *
 * Every element is mapped to a bin with (value * bins) >> DEPTH. The bin
 * counter is then incremented while my_mutex[histo_id] is held, which
 * serialises the update among tasklets that lock the same mutex.
 *
 *   histo    - histogram whose counters are incremented
 *   bins     - multiplier used in the bin computation
 *   input    - elements to classify
 *   histo_id - index of the mutex in my_mutex[] that guards `histo`
 *   l_size   - number of elements read from `input`
 */
static void histogram(uint32_t* histo, uint32_t bins, T *input, uint32_t histo_id, unsigned int l_size){
    T *cursor = input;
    unsigned int remaining = l_size;

    while (remaining != 0) {
        // Scale the value into a bin index
        T bin = (*cursor * bins) >> DEPTH;

        mutex_lock(my_mutex[histo_id]);
        histo[bin] += 1;
        mutex_unlock(my_mutex[histo_id]);

        ++cursor;
        --remaining;
    }
}

// Prototype of the kernel defined later in this file
extern int main_kernel1(void);

// Table of kernel entry points; main() indexes it with
// DPU_INPUT_ARGUMENTS.kernel. It has nr_kernels slots and only main_kernel1
// is listed in the initializer.
int (*kernels[nr_kernels])(void) = {main_kernel1};

int main(void) { 
    // Kernel
    return kernels[DPU_INPUT_ARGUMENTS.kernel](); 
}

/*
 * main_kernel1: histogram of this DPU's input, computed by all tasklets.
 *
 * Tasklets are split into NR_HISTO groups and each group shares one
 * histogram:
 *   1. Tasklet 0 resets the heap and sets up the per-group barriers.
 *   2. Each tasklet with id < NR_HISTO allocates one histogram and publishes
 *      its pointer in message[].
 *   3. Each group zeroes its histogram.
 *   4. Every tasklet reads its blocks of the input from MRAM and accumulates
 *      them into its group's histogram (see histogram()).
 *   5. All tasklets sum the NR_HISTO histograms bin by bin into message[0].
 *   6. Tasklet 0 writes the merged histogram to MRAM, starting
 *      DPU_INPUT_ARGUMENTS.transfer_size bytes after DPU_MRAM_HEAP_POINTER.
 *
 * Always returns 0.
 */
int main_kernel1(void) {
    unsigned int tasklet_id = me();
#if PRINT
    printf("tasklet_id = %u\n", tasklet_id);
#endif
    unsigned int l_tasklet_id = tasklet_id / NR_HISTO;      // rank used to stride over bins within a group
    unsigned int nr_l_tasklet = NR_TASKLETS / NR_HISTO;     // number of tasklets per histogram
    // NOTE(review): the mask equals tasklet_id % NR_HISTO only when NR_HISTO
    // is a power of two — confirm the build enforces that.
    unsigned int my_histo_id = tasklet_id & (NR_HISTO - 1);

    if (tasklet_id == 0){ // One-time setup done by a single tasklet
        mem_reset(); // Reset the heap
        // Initialize the per-histogram barriers so that each one expects
        // nr_l_tasklet participants
        for (unsigned int each_barrier = 0; each_barrier < NR_HISTO; each_barrier++) {
            barriers[each_barrier].wait_queue = 0xff;
            barriers[each_barrier].count = nr_l_tasklet;
            barriers[each_barrier].initial_count = nr_l_tasklet;
            barriers[each_barrier].lock = (uint8_t) &ATOMIC_BIT_GET(barriers_mutexes)[each_barrier];
        }
    }
    // Nobody proceeds until the heap and the barriers are ready
    barrier_wait(&my_barrier);

    uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
    uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
    uint32_t bins = DPU_INPUT_ARGUMENTS.bins;

    // Address of the current processing block in MRAM
    uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
    uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
    uint32_t mram_base_addr_histo = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);

    // Initialize a local cache to store the MRAM block
    T *cache_A = (T *) mem_alloc(BLOCK_SIZE);

    // The first NR_HISTO tasklets each allocate one histogram and publish it
    if (tasklet_id < NR_HISTO){
        uint32_t *histo = (uint32_t *) mem_alloc(bins * sizeof(uint32_t));
        message[tasklet_id] = histo;
    }
    // Wait until this group's histogram pointer has been published
    barrier_wait(&barriers[my_histo_id]);

    uint32_t *my_histo = message[my_histo_id];

    // Zero the group's histogram; the bins are strided across the group
    for(unsigned int i = l_tasklet_id; i < bins; i += nr_l_tasklet){
        my_histo[i] = 0;
    }
    // Wait until the whole histogram is zeroed before counting into it
    barrier_wait(&barriers[my_histo_id]);

    // Compute histogram: tasklets take BLOCK_SIZE-byte blocks round-robin
    for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){

        // Bound checking: the last block may be shorter than BLOCK_SIZE
        uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;

        // Load cache with current MRAM block
        mram_read((const __mram_ptr void*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);

        // Histogram in each tasklet (l_size_bytes >> DIV is the element count)
        histogram(my_histo, bins, cache_A, my_histo_id, l_size_bytes >> DIV);
    }

    // All groups must have finished counting before the merge
    barrier_wait(&my_barrier);

    // Merge the NR_HISTO histograms into the first one. Each bin is handled by
    // exactly one tasklet, which reads all partial counts before overwriting
    // message[0][i].
    uint32_t *merged_histo = message[0];
    for (unsigned int i = tasklet_id; i < bins; i += NR_TASKLETS){
        uint32_t b = 0;
        for (unsigned int j = 0; j < NR_HISTO; j++){
            b += *(message[j] + i);
        }
        merged_histo[i] = b;
    }

    // The merge must be complete before the result is written out
    barrier_wait(&my_barrier);

    // Write dpu histogram to MRAM in chunks of at most 2048 bytes (the chunk
    // size this kernel has always used for a single mram_write). The final
    // chunk carries whatever is left, so a histogram whose size is not a
    // multiple of 2048 bytes is written in full.
    if(tasklet_id == 0){
        const uint32_t max_chunk_bytes = 2048;
        const uint32_t histo_bytes = bins * sizeof(uint32_t);

        for(uint32_t offset = 0; offset < histo_bytes; offset += max_chunk_bytes){
            uint32_t remaining = histo_bytes - offset;
            uint32_t chunk = (remaining < max_chunk_bytes) ? remaining : max_chunk_bytes;
            mram_write(merged_histo + (offset / sizeof(uint32_t)), (__mram_ptr void*)(mram_base_addr_histo + offset), chunk);
        }
    }

    return 0;
}