summaryrefslogtreecommitdiff
path: root/SpMV/dpu/task.c
diff options
context:
space:
mode:
Diffstat (limited to 'SpMV/dpu/task.c')
-rw-r--r--SpMV/dpu/task.c276
1 files changed, 160 insertions, 116 deletions
diff --git a/SpMV/dpu/task.c b/SpMV/dpu/task.c
index 589b6f4..501a62a 100644
--- a/SpMV/dpu/task.c
+++ b/SpMV/dpu/task.c
@@ -20,120 +20,164 @@
BARRIER_INIT(my_barrier, NR_TASKLETS);
// main
-int main() {
-
- if(me() == 0) {
- mem_reset(); // Reset the heap
- }
- // Barrier
- barrier_wait(&my_barrier);
-
- // Load parameters
- uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
- struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
- mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
- uint32_t numRows = params_w->dpuNumRows;
-
- // Sanity check
- if(me() == 0) {
- if(numRows%2 != 0) {
- // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
- PRINT_ERROR("The number of rows is not a multiple of two!");
- }
- }
-
- // Identify tasklet's rows
- uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/NR_TASKLETS + 1); // Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
- uint32_t taskletRowsStart = me()*numRowsPerTasklet;
- uint32_t taskletNumRows;
- if(taskletRowsStart > numRows) {
- taskletNumRows = 0;
- } else if(taskletRowsStart + numRowsPerTasklet > numRows) {
- taskletNumRows = numRows - taskletRowsStart;
- } else {
- taskletNumRows = numRowsPerTasklet;
- }
-
- // Only process tasklets with nonzero number of rows
- if(taskletNumRows > 0) {
-
- // Extract parameters
- uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
- uint32_t rowPtrs_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
- uint32_t nonzeros_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuNonzeros_m;
- uint32_t inVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuInVector_m;
- uint32_t outVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuOutVector_m;
-
- // Initialize row pointer sequential reader
- uint32_t taskletRowPtrs_m = rowPtrs_m + taskletRowsStart*sizeof(uint32_t);
- seqreader_t rowPtrReader;
- uint32_t* taskletRowPtrs_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletRowPtrs_m, &rowPtrReader);
- uint32_t firstRowPtr = *taskletRowPtrs_w;
-
- // Initialize nonzeros sequential reader
- uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
- uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart*sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
- seqreader_t nonzerosReader;
- struct Nonzero* taskletNonzeros_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletNonzeros_m, &nonzerosReader);
-
- // Initialize input vector cache
- uint32_t inVectorTileSize = 64;
- float* inVectorTile_w = mem_alloc(inVectorTileSize*sizeof(float));
- mram_read((__mram_ptr void const*)inVector_m, inVectorTile_w, 256);
- uint32_t currInVectorTileIdx = 0;
-
- // Initialize output vector cache
- uint32_t taskletOutVector_m = outVector_m + taskletRowsStart*sizeof(float);
- uint32_t outVectorTileSize = 64;
- float* outVectorTile_w = mem_alloc(outVectorTileSize*sizeof(float));
-
- // SpMV
- uint32_t nextRowPtr = firstRowPtr;
- for(uint32_t row = 0; row < taskletNumRows; ++row) {
-
- // Find row nonzeros
- taskletRowPtrs_w = seqread_get(taskletRowPtrs_w, sizeof(uint32_t), &rowPtrReader);
- uint32_t rowPtr = nextRowPtr;
- nextRowPtr = *taskletRowPtrs_w;
- uint32_t taskletNNZ = nextRowPtr - rowPtr;
-
- // Multiply row with vector
- float outValue = 0.0f;
- for(uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
-
- // Get matrix value
- float matValue = taskletNonzeros_w->value;
-
- // Get input vector value
- uint32_t col = taskletNonzeros_w->col;
- uint32_t inVectorTileIdx = col/inVectorTileSize;
- uint32_t inVectorTileOffset = col%inVectorTileSize;
- if(inVectorTileIdx != currInVectorTileIdx) {
- mram_read((__mram_ptr void const*)(inVector_m + inVectorTileIdx*inVectorTileSize*sizeof(float)), inVectorTile_w, 256);
- currInVectorTileIdx = inVectorTileIdx;
- }
- float inValue = inVectorTile_w[inVectorTileOffset];
-
- // Multiply and add
- outValue += matValue*inValue;
-
- // Read next nonzero
- taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused
-
- }
-
- // Store output
- uint32_t outVectorTileIdx = row/outVectorTileSize;
- uint32_t outVectorTileOffset = row%outVectorTileSize;
- outVectorTile_w[outVectorTileOffset] = outValue;
- if(outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
- mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), 256);
- } else if(row == taskletNumRows - 1) { // Last row for tasklet
- mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), (taskletNumRows%outVectorTileSize)*sizeof(float));
- }
-
- }
- }
-
- return 0;
+int main()
+{
+
+ if (me() == 0) {
+ mem_reset(); // Reset the heap
+ }
+ // Barrier
+ barrier_wait(&my_barrier);
+
+ // Load parameters
+ uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+ struct DPUParams *params_w =
+ (struct DPUParams *)
+ mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+ mram_read((__mram_ptr void const *)params_m, params_w,
+ ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+ uint32_t numRows = params_w->dpuNumRows;
+
+ // Sanity check
+ if (me() == 0) {
+ if (numRows % 2 != 0) {
+ // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
+ PRINT_ERROR
+ ("The number of rows is not a multiple of two!");
+ }
+ }
+ // Identify tasklet's rows
+ uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / NR_TASKLETS + 1); // Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
+ uint32_t taskletRowsStart = me() * numRowsPerTasklet;
+ uint32_t taskletNumRows;
+ if (taskletRowsStart > numRows) {
+ taskletNumRows = 0;
+ } else if (taskletRowsStart + numRowsPerTasklet > numRows) {
+ taskletNumRows = numRows - taskletRowsStart;
+ } else {
+ taskletNumRows = numRowsPerTasklet;
+ }
+
+ // Only process tasklets with nonzero number of rows
+ if (taskletNumRows > 0) {
+
+ // Extract parameters
+ uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
+ uint32_t rowPtrs_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
+ uint32_t nonzeros_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+ params_w->dpuNonzeros_m;
+ uint32_t inVector_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+ params_w->dpuInVector_m;
+ uint32_t outVector_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+ params_w->dpuOutVector_m;
+
+ // Initialize row pointer sequential reader
+ uint32_t taskletRowPtrs_m =
+ rowPtrs_m + taskletRowsStart * sizeof(uint32_t);
+ seqreader_t rowPtrReader;
+ uint32_t *taskletRowPtrs_w =
+ seqread_init(seqread_alloc(),
+ (__mram_ptr void *)taskletRowPtrs_m,
+ &rowPtrReader);
+ uint32_t firstRowPtr = *taskletRowPtrs_w;
+
+ // Initialize nonzeros sequential reader
+ uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
+ uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart * sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
+ seqreader_t nonzerosReader;
+ struct Nonzero *taskletNonzeros_w =
+ seqread_init(seqread_alloc(),
+ (__mram_ptr void *)taskletNonzeros_m,
+ &nonzerosReader);
+
+ // Initialize input vector cache
+ uint32_t inVectorTileSize = 64;
+ float *inVectorTile_w =
+ mem_alloc(inVectorTileSize * sizeof(float));
+ mram_read((__mram_ptr void const *)inVector_m, inVectorTile_w,
+ 256);
+ uint32_t currInVectorTileIdx = 0;
+
+ // Initialize output vector cache
+ uint32_t taskletOutVector_m =
+ outVector_m + taskletRowsStart * sizeof(float);
+ uint32_t outVectorTileSize = 64;
+ float *outVectorTile_w =
+ mem_alloc(outVectorTileSize * sizeof(float));
+
+ // SpMV
+ uint32_t nextRowPtr = firstRowPtr;
+ for (uint32_t row = 0; row < taskletNumRows; ++row) {
+
+ // Find row nonzeros
+ taskletRowPtrs_w =
+ seqread_get(taskletRowPtrs_w, sizeof(uint32_t),
+ &rowPtrReader);
+ uint32_t rowPtr = nextRowPtr;
+ nextRowPtr = *taskletRowPtrs_w;
+ uint32_t taskletNNZ = nextRowPtr - rowPtr;
+
+ // Multiply row with vector
+ float outValue = 0.0f;
+ for (uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
+
+ // Get matrix value
+ float matValue = taskletNonzeros_w->value;
+
+ // Get input vector value
+ uint32_t col = taskletNonzeros_w->col;
+ uint32_t inVectorTileIdx =
+ col / inVectorTileSize;
+ uint32_t inVectorTileOffset =
+ col % inVectorTileSize;
+ if (inVectorTileIdx != currInVectorTileIdx) {
+ mram_read((__mram_ptr void const
+ *)(inVector_m +
+ inVectorTileIdx *
+ inVectorTileSize *
+ sizeof(float)),
+ inVectorTile_w, 256);
+ currInVectorTileIdx = inVectorTileIdx;
+ }
+ float inValue =
+ inVectorTile_w[inVectorTileOffset];
+
+ // Multiply and add
+ outValue += matValue * inValue;
+
+ // Read next nonzero
+ taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused
+
+ }
+
+ // Store output
+ uint32_t outVectorTileIdx = row / outVectorTileSize;
+ uint32_t outVectorTileOffset = row % outVectorTileSize;
+ outVectorTile_w[outVectorTileOffset] = outValue;
+ if (outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
+ mram_write(outVectorTile_w,
+ (__mram_ptr void
+ *)(taskletOutVector_m +
+ outVectorTileIdx *
+ outVectorTileSize *
+ sizeof(float)), 256);
+ } else if (row == taskletNumRows - 1) { // Last row for tasklet
+ mram_write(outVectorTile_w,
+ (__mram_ptr void
+ *)(taskletOutVector_m +
+ outVectorTileIdx *
+ outVectorTileSize *
+ sizeof(float)),
+ (taskletNumRows %
+ outVectorTileSize) * sizeof(float));
+ }
+
+ }
+ }
+
+ return 0;
}