| 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/*
 * SpMV (sparse matrix-vector multiplication) DPU kernel with multiple tasklets
 *
 */
#include <stdio.h>
#include <alloc.h>
#include <barrier.h>
#include <defs.h>
#include <mram.h>
#include <perfcounter.h>
#include <seqread.h>
#include "../support/common.h"
// Print a message through printf, prefixed with "ERROR:" in red (ANSI escape codes) and followed by a newline.
// fmt is pasted between adjacent string literals, so it must itself be a string literal.
// ##__VA_ARGS__ (GNU extension) drops the trailing comma when no extra arguments are passed.
#define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m   "fmt"\n", ##__VA_ARGS__)
// Smaller of x and y. The selected argument is evaluated twice, so do not pass expressions with side effects.
#define MIN(x, y)   (((x) < (y))?(x):(y))
// Barrier named my_barrier, sized for NR_TASKLETS participants; main() waits on it right after the heap reset.
// NR_TASKLETS is not defined in this file -- presumably supplied by the build; confirm against the Makefile.
BARRIER_INIT(my_barrier, NR_TASKLETS);
/*
 * Kernel entry point, executed by every tasklet.
 *
 * Each tasklet reads the DPUParams block stored at the start of the MRAM heap,
 * takes a contiguous slice of the rows assigned to this DPU, and for each row
 * accumulates value*inVector[col] over the row's nonzeros. Results are staged
 * in a WRAM tile and written back to the output vector in MRAM.
 *
 * Row pointers and nonzeros are streamed with sequential readers; the input
 * vector is accessed through a single cached tile that is reloaded whenever a
 * nonzero's column falls outside the tile currently held.
 *
 * Always returns 0. An odd row count is reported with PRINT_ERROR but the
 * computation is still attempted.
 */
int main(void) {
    // Tasklet 0 resets the heap; the barrier keeps the other tasklets from
    // allocating until that has happened
    if(me() == 0) {
        mem_reset();
    }
    barrier_wait(&my_barrier);

    // Load parameters from the start of the MRAM heap into WRAM
    uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
    uint32_t paramsBytes = ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams));
    struct DPUParams* params_w = (struct DPUParams*) mem_alloc(paramsBytes);
    mram_read((__mram_ptr void const*)params_m, params_w, paramsBytes);
    uint32_t numRows = params_w->dpuNumRows;

    // Sanity check (reported once, by tasklet 0)
    if(me() == 0 && numRows%2 != 0) {
        // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
        PRINT_ERROR("The number of rows is not a multiple of two!");
    }

    // Identify tasklet's rows
    // The per-tasklet count is a multiple of two so that accesses to rowPtrs and outVector stay 8-byte aligned
    uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/NR_TASKLETS + 1);
    uint32_t taskletRowsStart = me()*numRowsPerTasklet;
    // Tasklets whose slice starts at or beyond the last row (including every tasklet when numRows is 0) get no rows;
    // the last tasklet with work gets whatever rows remain
    uint32_t taskletNumRows = 0;
    if(taskletRowsStart < numRows) {
        uint32_t rowsLeft = numRows - taskletRowsStart;
        taskletNumRows = (rowsLeft < numRowsPerTasklet)?rowsLeft:numRowsPerTasklet;
    }

    // Only process tasklets with nonzero number of rows
    if(taskletNumRows > 0) {

        // Extract parameters; the *_m fields are offsets relative to the MRAM heap base
        uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
        uint32_t rowPtrs_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
        uint32_t nonzeros_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuNonzeros_m;
        uint32_t inVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuInVector_m;
        uint32_t outVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuOutVector_m;

        // Initialize row pointer sequential reader at this tasklet's first row
        uint32_t taskletRowPtrs_m = rowPtrs_m + taskletRowsStart*sizeof(uint32_t);
        seqreader_t rowPtrReader;
        uint32_t* taskletRowPtrs_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletRowPtrs_m, &rowPtrReader);
        uint32_t firstRowPtr = *taskletRowPtrs_w;

        // Initialize nonzeros sequential reader
        // Row pointers are rebased by rowPtrsOffset to index into this DPU's nonzeros array
        uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
        uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart*sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
        seqreader_t nonzerosReader;
        struct Nonzero* taskletNonzeros_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletNonzeros_m, &nonzerosReader);

        // Initialize input vector cache with tile 0
        // The transfer size is derived from the tile size so buffer and transfers cannot drift apart
        const uint32_t inVectorTileSize = 64;
        const uint32_t inVectorTileBytes = inVectorTileSize*sizeof(float);
        float* inVectorTile_w = mem_alloc(inVectorTileBytes);
        mram_read((__mram_ptr void const*)inVector_m, inVectorTile_w, inVectorTileBytes);
        uint32_t currInVectorTileIdx = 0;

        // Initialize output vector cache
        uint32_t taskletOutVector_m = outVector_m + taskletRowsStart*sizeof(float);
        const uint32_t outVectorTileSize = 64;
        const uint32_t outVectorTileBytes = outVectorTileSize*sizeof(float);
        float* outVectorTile_w = mem_alloc(outVectorTileBytes);

        // SpMV
        uint32_t nextRowPtr = firstRowPtr;
        for(uint32_t row = 0; row < taskletNumRows; ++row) {

            // Find row nonzeros: the row spans [rowPtr, nextRowPtr)
            taskletRowPtrs_w = seqread_get(taskletRowPtrs_w, sizeof(uint32_t), &rowPtrReader);
            uint32_t rowPtr = nextRowPtr;
            nextRowPtr = *taskletRowPtrs_w;
            uint32_t rowNNZ = nextRowPtr - rowPtr;

            // Multiply row with vector
            float outValue = 0.0f;
            for(uint32_t nzIdx = 0; nzIdx < rowNNZ; ++nzIdx) {

                // Get matrix value
                float matValue = taskletNonzeros_w->value;

                // Get input vector value, reloading the cached tile if the column lies in a different one
                uint32_t col = taskletNonzeros_w->col;
                uint32_t inVectorTileIdx = col/inVectorTileSize;
                uint32_t inVectorTileOffset = col%inVectorTileSize;
                if(inVectorTileIdx != currInVectorTileIdx) {
                    mram_read((__mram_ptr void const*)(inVector_m + inVectorTileIdx*inVectorTileBytes), inVectorTile_w, inVectorTileBytes);
                    currInVectorTileIdx = inVectorTileIdx;
                }
                float inValue = inVectorTile_w[inVectorTileOffset];

                // Multiply and add
                outValue += matValue*inValue;

                // Read next nonzero
                taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused

            }

            // Store output in the WRAM tile and flush it when full or when the tasklet is done
            uint32_t outVectorTileIdx = row/outVectorTileSize;
            uint32_t outVectorTileOffset = row%outVectorTileSize;
            outVectorTile_w[outVectorTileOffset] = outValue;
            uint32_t outVectorTile_m = taskletOutVector_m + outVectorTileIdx*outVectorTileBytes;
            if(outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
                mram_write(outVectorTile_w, (__mram_ptr void*)outVectorTile_m, outVectorTileBytes);
            } else if(row == taskletNumRows - 1) { // Last row for tasklet
                // Partial tile: only the elements filled so far (offset + 1 == taskletNumRows%outVectorTileSize here)
                mram_write(outVectorTile_w, (__mram_ptr void*)outVectorTile_m, (outVectorTileOffset + 1)*sizeof(float));
            }

        }

    }

    return 0;

}
 |