summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBirte Kristina Friesel <birte.friesel@uos.de>2025-01-17 09:34:47 +0100
committerBirte Kristina Friesel <birte.friesel@uos.de>2025-01-17 09:34:47 +0100
commit24dea071574167bcad2d553f6b6bc24f40ccbb48 (patch)
treefb21b3c4b855e09f4a79b4c1bfab1c7f4966cc97
parentb4d38f56de51e5ac7c2c813b420e9f6d42abc2e6 (diff)
SpMV: indent -linux
-rw-r--r--SpMV/baselines/cpu/app.c95
-rw-r--r--SpMV/dpu/task.c276
-rw-r--r--SpMV/host/app.c498
-rw-r--r--SpMV/host/mram-management.h46
-rw-r--r--SpMV/support/common.h17
-rw-r--r--SpMV/support/matrix.h199
-rw-r--r--SpMV/support/params.h67
-rw-r--r--SpMV/support/timer.h23
-rw-r--r--SpMV/support/utils.h1
9 files changed, 681 insertions, 541 deletions
diff --git a/SpMV/baselines/cpu/app.c b/SpMV/baselines/cpu/app.c
index 8d360ee..e33761f 100644
--- a/SpMV/baselines/cpu/app.c
+++ b/SpMV/baselines/cpu/app.c
@@ -13,60 +13,63 @@
#include "../../support/timer.h"
#include "../../support/utils.h"
-int main(int argc, char** argv) {
+int main(int argc, char **argv)
+{
- // Process parameters
- struct Params p = input_params(argc, argv);
+ // Process parameters
+ struct Params p = input_params(argc, argv);
- // Initialize SpMV data structures
- PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
- struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
- PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
- struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
- float* inVector = malloc(csrMatrix.numCols*sizeof(float));
- float* outVector = malloc(csrMatrix.numRows*sizeof(float));
- initVector(inVector, csrMatrix.numCols);
+ // Initialize SpMV data structures
+ PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+ struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+ PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros",
+ cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+ struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+ float *inVector = malloc(csrMatrix.numCols * sizeof(float));
+ float *outVector = malloc(csrMatrix.numRows * sizeof(float));
+ initVector(inVector, csrMatrix.numCols);
- // Calculating result on CPU
- PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
- //omp_set_num_threads(4);
- Timer timer;
- startTimer(&timer);
- #pragma omp parallel for
- for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
- float sum = 0.0f;
- for(uint32_t i = csrMatrix.rowPtrs[rowIdx]; i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
- uint32_t colIdx = csrMatrix.nonzeros[i].col;
- float value = csrMatrix.nonzeros[i].value;
- sum += inVector[colIdx]*value;
- }
- outVector[rowIdx] = sum;
- }
- stopTimer(&timer);
+ // Calculating result on CPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+ //omp_set_num_threads(4);
+ Timer timer;
+ startTimer(&timer);
+#pragma omp parallel for
+ for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+ float sum = 0.0f;
+ for (uint32_t i = csrMatrix.rowPtrs[rowIdx];
+ i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
+ uint32_t colIdx = csrMatrix.nonzeros[i].col;
+ float value = csrMatrix.nonzeros[i].value;
+ sum += inVector[colIdx] * value;
+ }
+ outVector[rowIdx] = sum;
+ }
+ stopTimer(&timer);
-
- unsigned int nr_threads = 0;
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
-
+ nr_threads++;
- // coomatrix / csrmatrix use uint32_t indexes and float values
- printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
- " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
- nr_threads, csrMatrix.numNonzeros,
- csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer)*1e6),
- csrMatrix.numNonzeros / (getElapsedTime(timer)*1e6),
- getElapsedTime(timer)*1e6);
- //if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
- PRINT_INFO(p.verbosity >= 1, " Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+ // coomatrix / csrmatrix use uint32_t indexes and float values
+ printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
+ " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
+ nr_threads, csrMatrix.numNonzeros,
+ csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer) *
+ 1e6),
+ csrMatrix.numNonzeros / (getElapsedTime(timer) * 1e6),
+ getElapsedTime(timer) * 1e6);
+ //if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+ PRINT_INFO(p.verbosity >= 1, " Elapsed time: %f ms",
+ getElapsedTime(timer) * 1e3);
- // Deallocate data structures
- freeCOOMatrix(cooMatrix);
- freeCSRMatrix(csrMatrix);
- free(inVector);
- free(outVector);
+ // Deallocate data structures
+ freeCOOMatrix(cooMatrix);
+ freeCSRMatrix(csrMatrix);
+ free(inVector);
+ free(outVector);
- return 0;
+ return 0;
}
diff --git a/SpMV/dpu/task.c b/SpMV/dpu/task.c
index 589b6f4..501a62a 100644
--- a/SpMV/dpu/task.c
+++ b/SpMV/dpu/task.c
@@ -20,120 +20,164 @@
BARRIER_INIT(my_barrier, NR_TASKLETS);
// main
-int main() {
-
- if(me() == 0) {
- mem_reset(); // Reset the heap
- }
- // Barrier
- barrier_wait(&my_barrier);
-
- // Load parameters
- uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
- struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
- mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
- uint32_t numRows = params_w->dpuNumRows;
-
- // Sanity check
- if(me() == 0) {
- if(numRows%2 != 0) {
- // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
- PRINT_ERROR("The number of rows is not a multiple of two!");
- }
- }
-
- // Identify tasklet's rows
- uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/NR_TASKLETS + 1); // Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
- uint32_t taskletRowsStart = me()*numRowsPerTasklet;
- uint32_t taskletNumRows;
- if(taskletRowsStart > numRows) {
- taskletNumRows = 0;
- } else if(taskletRowsStart + numRowsPerTasklet > numRows) {
- taskletNumRows = numRows - taskletRowsStart;
- } else {
- taskletNumRows = numRowsPerTasklet;
- }
-
- // Only process tasklets with nonzero number of rows
- if(taskletNumRows > 0) {
-
- // Extract parameters
- uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
- uint32_t rowPtrs_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
- uint32_t nonzeros_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuNonzeros_m;
- uint32_t inVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuInVector_m;
- uint32_t outVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuOutVector_m;
-
- // Initialize row pointer sequential reader
- uint32_t taskletRowPtrs_m = rowPtrs_m + taskletRowsStart*sizeof(uint32_t);
- seqreader_t rowPtrReader;
- uint32_t* taskletRowPtrs_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletRowPtrs_m, &rowPtrReader);
- uint32_t firstRowPtr = *taskletRowPtrs_w;
-
- // Initialize nonzeros sequential reader
- uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
- uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart*sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
- seqreader_t nonzerosReader;
- struct Nonzero* taskletNonzeros_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletNonzeros_m, &nonzerosReader);
-
- // Initialize input vector cache
- uint32_t inVectorTileSize = 64;
- float* inVectorTile_w = mem_alloc(inVectorTileSize*sizeof(float));
- mram_read((__mram_ptr void const*)inVector_m, inVectorTile_w, 256);
- uint32_t currInVectorTileIdx = 0;
-
- // Initialize output vector cache
- uint32_t taskletOutVector_m = outVector_m + taskletRowsStart*sizeof(float);
- uint32_t outVectorTileSize = 64;
- float* outVectorTile_w = mem_alloc(outVectorTileSize*sizeof(float));
-
- // SpMV
- uint32_t nextRowPtr = firstRowPtr;
- for(uint32_t row = 0; row < taskletNumRows; ++row) {
-
- // Find row nonzeros
- taskletRowPtrs_w = seqread_get(taskletRowPtrs_w, sizeof(uint32_t), &rowPtrReader);
- uint32_t rowPtr = nextRowPtr;
- nextRowPtr = *taskletRowPtrs_w;
- uint32_t taskletNNZ = nextRowPtr - rowPtr;
-
- // Multiply row with vector
- float outValue = 0.0f;
- for(uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
-
- // Get matrix value
- float matValue = taskletNonzeros_w->value;
-
- // Get input vector value
- uint32_t col = taskletNonzeros_w->col;
- uint32_t inVectorTileIdx = col/inVectorTileSize;
- uint32_t inVectorTileOffset = col%inVectorTileSize;
- if(inVectorTileIdx != currInVectorTileIdx) {
- mram_read((__mram_ptr void const*)(inVector_m + inVectorTileIdx*inVectorTileSize*sizeof(float)), inVectorTile_w, 256);
- currInVectorTileIdx = inVectorTileIdx;
- }
- float inValue = inVectorTile_w[inVectorTileOffset];
-
- // Multiply and add
- outValue += matValue*inValue;
-
- // Read next nonzero
- taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused
-
- }
-
- // Store output
- uint32_t outVectorTileIdx = row/outVectorTileSize;
- uint32_t outVectorTileOffset = row%outVectorTileSize;
- outVectorTile_w[outVectorTileOffset] = outValue;
- if(outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
- mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), 256);
- } else if(row == taskletNumRows - 1) { // Last row for tasklet
- mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), (taskletNumRows%outVectorTileSize)*sizeof(float));
- }
-
- }
- }
-
- return 0;
+int main()
+{
+
+ if (me() == 0) {
+ mem_reset(); // Reset the heap
+ }
+ // Barrier
+ barrier_wait(&my_barrier);
+
+ // Load parameters
+ uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+ struct DPUParams *params_w =
+ (struct DPUParams *)
+ mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+ mram_read((__mram_ptr void const *)params_m, params_w,
+ ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+ uint32_t numRows = params_w->dpuNumRows;
+
+ // Sanity check
+ if (me() == 0) {
+ if (numRows % 2 != 0) {
+ // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
+ PRINT_ERROR
+ ("The number of rows is not a multiple of two!");
+ }
+ }
+ // Identify tasklet's rows
+ uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / NR_TASKLETS + 1); // Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
+ uint32_t taskletRowsStart = me() * numRowsPerTasklet;
+ uint32_t taskletNumRows;
+ if (taskletRowsStart > numRows) {
+ taskletNumRows = 0;
+ } else if (taskletRowsStart + numRowsPerTasklet > numRows) {
+ taskletNumRows = numRows - taskletRowsStart;
+ } else {
+ taskletNumRows = numRowsPerTasklet;
+ }
+
+ // Only process tasklets with nonzero number of rows
+ if (taskletNumRows > 0) {
+
+ // Extract parameters
+ uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
+ uint32_t rowPtrs_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
+ uint32_t nonzeros_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+ params_w->dpuNonzeros_m;
+ uint32_t inVector_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+ params_w->dpuInVector_m;
+ uint32_t outVector_m =
+ ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+ params_w->dpuOutVector_m;
+
+ // Initialize row pointer sequential reader
+ uint32_t taskletRowPtrs_m =
+ rowPtrs_m + taskletRowsStart * sizeof(uint32_t);
+ seqreader_t rowPtrReader;
+ uint32_t *taskletRowPtrs_w =
+ seqread_init(seqread_alloc(),
+ (__mram_ptr void *)taskletRowPtrs_m,
+ &rowPtrReader);
+ uint32_t firstRowPtr = *taskletRowPtrs_w;
+
+ // Initialize nonzeros sequential reader
+ uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
+ uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart * sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
+ seqreader_t nonzerosReader;
+ struct Nonzero *taskletNonzeros_w =
+ seqread_init(seqread_alloc(),
+ (__mram_ptr void *)taskletNonzeros_m,
+ &nonzerosReader);
+
+ // Initialize input vector cache
+ uint32_t inVectorTileSize = 64;
+ float *inVectorTile_w =
+ mem_alloc(inVectorTileSize * sizeof(float));
+ mram_read((__mram_ptr void const *)inVector_m, inVectorTile_w,
+ 256);
+ uint32_t currInVectorTileIdx = 0;
+
+ // Initialize output vector cache
+ uint32_t taskletOutVector_m =
+ outVector_m + taskletRowsStart * sizeof(float);
+ uint32_t outVectorTileSize = 64;
+ float *outVectorTile_w =
+ mem_alloc(outVectorTileSize * sizeof(float));
+
+ // SpMV
+ uint32_t nextRowPtr = firstRowPtr;
+ for (uint32_t row = 0; row < taskletNumRows; ++row) {
+
+ // Find row nonzeros
+ taskletRowPtrs_w =
+ seqread_get(taskletRowPtrs_w, sizeof(uint32_t),
+ &rowPtrReader);
+ uint32_t rowPtr = nextRowPtr;
+ nextRowPtr = *taskletRowPtrs_w;
+ uint32_t taskletNNZ = nextRowPtr - rowPtr;
+
+ // Multiply row with vector
+ float outValue = 0.0f;
+ for (uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
+
+ // Get matrix value
+ float matValue = taskletNonzeros_w->value;
+
+ // Get input vector value
+ uint32_t col = taskletNonzeros_w->col;
+ uint32_t inVectorTileIdx =
+ col / inVectorTileSize;
+ uint32_t inVectorTileOffset =
+ col % inVectorTileSize;
+ if (inVectorTileIdx != currInVectorTileIdx) {
+ mram_read((__mram_ptr void const
+ *)(inVector_m +
+ inVectorTileIdx *
+ inVectorTileSize *
+ sizeof(float)),
+ inVectorTile_w, 256);
+ currInVectorTileIdx = inVectorTileIdx;
+ }
+ float inValue =
+ inVectorTile_w[inVectorTileOffset];
+
+ // Multiply and add
+ outValue += matValue * inValue;
+
+ // Read next nonzero
+ taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused
+
+ }
+
+ // Store output
+ uint32_t outVectorTileIdx = row / outVectorTileSize;
+ uint32_t outVectorTileOffset = row % outVectorTileSize;
+ outVectorTile_w[outVectorTileOffset] = outValue;
+ if (outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
+ mram_write(outVectorTile_w,
+ (__mram_ptr void
+ *)(taskletOutVector_m +
+ outVectorTileIdx *
+ outVectorTileSize *
+ sizeof(float)), 256);
+ } else if (row == taskletNumRows - 1) { // Last row for tasklet
+ mram_write(outVectorTile_w,
+ (__mram_ptr void
+ *)(taskletOutVector_m +
+ outVectorTileIdx *
+ outVectorTileSize *
+ sizeof(float)),
+ (taskletNumRows %
+ outVectorTileSize) * sizeof(float));
+ }
+
+ }
+ }
+
+ return 0;
}
diff --git a/SpMV/host/app.c b/SpMV/host/app.c
index be9ee37..ffccb70 100644
--- a/SpMV/host/app.c
+++ b/SpMV/host/app.c
@@ -33,228 +33,284 @@
#endif
// Main of the Host Application
-int main(int argc, char** argv) {
+int main(int argc, char **argv)
+{
- // Process parameters
- struct Params p = input_params(argc, argv);
+ // Process parameters
+ struct Params p = input_params(argc, argv);
- // Timing and profiling
- Timer timer;
- double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime = 0.0f, readTime = 0.0f, freeTime = 0.0f;
- #if ENERGY
- struct dpu_probe_t probe;
- DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
- #endif
-
- // Allocate DPUs and load binary
- struct dpu_set_t dpu_set, dpu;
- uint32_t numDPUs, numRanks;
-
- startTimer(&timer);
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- stopTimer(&timer);
- allocTime += getElapsedTime(timer);
-
- startTimer(&timer);
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- stopTimer(&timer);
- loadTime += getElapsedTime(timer);
-
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
- assert(numDPUs == NR_DPUS);
- PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
- // Initialize SpMV data structures
- PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
- struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
- PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
- struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
- uint32_t numRows = csrMatrix.numRows;
- uint32_t numCols = csrMatrix.numCols;
- uint32_t* rowPtrs = csrMatrix.rowPtrs;
- struct Nonzero* nonzeros = csrMatrix.nonzeros;
- float* inVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols*sizeof(float)));
- initVector(inVector, numCols);
- float* outVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows*sizeof(float)));
-
- // Partition data structure across DPUs
- uint32_t numRowsPerDPU = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/numDPUs + 1);
- PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU", numRowsPerDPU);
- struct DPUParams dpuParams[numDPUs];
- unsigned int dpuIdx = 0;
- PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
- DPU_FOREACH (dpu_set, dpu) {
-
- // Allocate parameters
- struct mram_heap_allocator_t allocator;
- init_allocator(&allocator);
- uint32_t dpuParams_m = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
- // Find DPU's rows
- uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
- uint32_t dpuNumRows;
- if(dpuStartRowIdx > numRows) {
- dpuNumRows = 0;
- } else if(dpuStartRowIdx + numRowsPerDPU > numRows) {
- dpuNumRows = numRows - dpuStartRowIdx;
- } else {
- dpuNumRows = numRowsPerDPU;
- }
- dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
- PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
- PRINT_INFO(p.verbosity >= 2, " Receives %u rows", dpuNumRows);
-
- // Partition nonzeros and copy data
- if(dpuNumRows > 0) {
-
- // Find DPU's CSR matrix partition
- uint32_t* dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
- uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
- struct Nonzero* dpuNonzeros_h = &nonzeros[dpuRowPtrsOffset];
- uint32_t dpuNumNonzeros = dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
-
- // Allocate MRAM
- uint32_t dpuRowPtrs_m = mram_heap_alloc(&allocator, (dpuNumRows + 1)*sizeof(uint32_t));
- uint32_t dpuNonzeros_m = mram_heap_alloc(&allocator, dpuNumNonzeros*sizeof(struct Nonzero));
- uint32_t dpuInVector_m = mram_heap_alloc(&allocator, numCols*sizeof(float));
- uint32_t dpuOutVector_m = mram_heap_alloc(&allocator, dpuNumRows*sizeof(float));
- assert((dpuNumRows*sizeof(float))%8 == 0 && "Output sub-vector must be a multiple of 8 bytes!");
- PRINT_INFO(p.verbosity >= 2, " Total memory allocated is %d bytes", allocator.totalAllocated);
-
- // Set up DPU parameters
- dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
- dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
- dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
- dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
- dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
-
- // Send data to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying data to DPU");
- startTimer(&timer);
- copyToDPU(dpu, (uint8_t*)dpuRowPtrs_h, dpuRowPtrs_m, (dpuNumRows + 1)*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)dpuNonzeros_h, dpuNonzeros_m, dpuNumNonzeros*sizeof(struct Nonzero));
- copyToDPU(dpu, (uint8_t*)inVector, dpuInVector_m, numCols*sizeof(float));
- stopTimer(&timer);
- writeTime += getElapsedTime(timer);
-
- }
-
- // Send parameters to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU");
- startTimer(&timer);
- copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m, sizeof(struct DPUParams));
- stopTimer(&timer);
- writeTime += getElapsedTime(timer);
-
- ++dpuIdx;
-
- }
- PRINT_INFO(p.verbosity >= 1, " CPU-DPU Time: %f ms", writeTime*1e3);
-
- // Run all DPUs
- PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
- startTimer(&timer);
- #if ENERGY
- DPU_ASSERT(dpu_probe_start(&probe));
- #endif
- DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- #if ENERGY
- DPU_ASSERT(dpu_probe_stop(&probe));
- double energy;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
- PRINT_INFO(p.verbosity >= 1, " DPU Energy: %f J", energy);
- #endif
- stopTimer(&timer);
- dpuTime += getElapsedTime(timer);
- PRINT_INFO(p.verbosity >= 1, " DPU Time: %f ms", dpuTime*1e3);
-
- // Copy back result
- PRINT_INFO(p.verbosity >= 1, "Copying back the result");
- startTimer(&timer);
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
- if(dpuNumRows > 0) {
- uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m, (uint8_t*)(outVector + dpuStartRowIdx), dpuNumRows*sizeof(float));
- }
- ++dpuIdx;
- }
- stopTimer(&timer);
- readTime += getElapsedTime(timer);
- PRINT_INFO(p.verbosity >= 1, " DPU-CPU Time: %f ms", readTime*1e3);
-
- // Calculating result on CPU
- PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
- float* outVectorReference = malloc(numRows*sizeof(float));
- for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
- float sum = 0.0f;
- for(uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
- uint32_t colIdx = nonzeros[i].col;
- float value = nonzeros[i].value;
- sum += inVector[colIdx]*value;
- }
- outVectorReference[rowIdx] = sum;
- }
-
- // Verify the result
- PRINT_INFO(p.verbosity >= 1, "Verifying the result");
- int status = 1;
- for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
- float diff = (outVectorReference[rowIdx] - outVector[rowIdx])/outVectorReference[rowIdx];
- const float tolerance = 0.00001;
- if(diff > tolerance || diff < -tolerance) {
- status = 0;
- PRINT_ERROR("Mismatch at index %u (CPU result = %f, DPU result = %f)", rowIdx, outVectorReference[rowIdx], outVector[rowIdx]);
- }
- }
-
- startTimer(&timer);
- DPU_ASSERT(dpu_free(dpu_set));
- stopTimer(&timer);
- freeTime += getElapsedTime(timer);
-
- if (status) {
- printf("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
- numDPUs, numRanks, NR_TASKLETS, "float", csrMatrix.numNonzeros);
- printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
- allocTime, loadTime, writeTime, dpuTime, readTime, freeTime);
- printf(" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
- // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
- csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
- csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
- printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
- csrMatrix.numNonzeros * sizeof(float) / ((writeTime + dpuTime + readTime) * 1e6),
- csrMatrix.numNonzeros * sizeof(float) / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
- csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
- printf(" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
- // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
- csrMatrix.numNonzeros / (dpuTime * 1e6),
- csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
- printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
- csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) * 1e6),
- csrMatrix.numNonzeros / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
- csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
- }
-
- // Display DPU Logs
- if(p.verbosity >= 2) {
- PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- PRINT("DPU %u:", dpuIdx);
- DPU_ASSERT(dpu_log_read(dpu, stdout));
- ++dpuIdx;
- }
- }
-
- // Deallocate data structures
- freeCOOMatrix(cooMatrix);
- freeCSRMatrix(csrMatrix);
- free(inVector);
- free(outVector);
- free(outVectorReference);
+ // Timing and profiling
+ Timer timer;
+ double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime =
+ 0.0f, readTime = 0.0f, freeTime = 0.0f;
+#if ENERGY
+ struct dpu_probe_t probe;
+ DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+#endif
- return 0;
+ // Allocate DPUs and load binary
+ struct dpu_set_t dpu_set, dpu;
+ uint32_t numDPUs, numRanks;
+
+ startTimer(&timer);
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ stopTimer(&timer);
+ allocTime += getElapsedTime(timer);
+
+ startTimer(&timer);
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ stopTimer(&timer);
+ loadTime += getElapsedTime(timer);
+
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+ assert(numDPUs == NR_DPUS);
+ PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+ // Initialize SpMV data structures
+ PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+ struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+ PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros",
+ cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+ struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+ uint32_t numRows = csrMatrix.numRows;
+ uint32_t numCols = csrMatrix.numCols;
+ uint32_t *rowPtrs = csrMatrix.rowPtrs;
+ struct Nonzero *nonzeros = csrMatrix.nonzeros;
+ float *inVector =
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float)));
+ initVector(inVector, numCols);
+ float *outVector =
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows * sizeof(float)));
+
+ // Partition data structure across DPUs
+ uint32_t numRowsPerDPU =
+ ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / numDPUs + 1);
+ PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU",
+ numRowsPerDPU);
+ struct DPUParams dpuParams[numDPUs];
+ unsigned int dpuIdx = 0;
+ PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
+ DPU_FOREACH(dpu_set, dpu) {
+
+ // Allocate parameters
+ struct mram_heap_allocator_t allocator;
+ init_allocator(&allocator);
+ uint32_t dpuParams_m =
+ mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+ // Find DPU's rows
+ uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+ uint32_t dpuNumRows;
+ if (dpuStartRowIdx > numRows) {
+ dpuNumRows = 0;
+ } else if (dpuStartRowIdx + numRowsPerDPU > numRows) {
+ dpuNumRows = numRows - dpuStartRowIdx;
+ } else {
+ dpuNumRows = numRowsPerDPU;
+ }
+ dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
+ PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
+ PRINT_INFO(p.verbosity >= 2, " Receives %u rows",
+ dpuNumRows);
+
+ // Partition nonzeros and copy data
+ if (dpuNumRows > 0) {
+
+ // Find DPU's CSR matrix partition
+ uint32_t *dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
+ uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
+ struct Nonzero *dpuNonzeros_h =
+ &nonzeros[dpuRowPtrsOffset];
+ uint32_t dpuNumNonzeros =
+ dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
+
+ // Allocate MRAM
+ uint32_t dpuRowPtrs_m =
+ mram_heap_alloc(&allocator,
+ (dpuNumRows +
+ 1) * sizeof(uint32_t));
+ uint32_t dpuNonzeros_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNonzeros *
+ sizeof(struct Nonzero));
+ uint32_t dpuInVector_m =
+ mram_heap_alloc(&allocator,
+ numCols * sizeof(float));
+ uint32_t dpuOutVector_m =
+ mram_heap_alloc(&allocator,
+ dpuNumRows * sizeof(float));
+ assert((dpuNumRows * sizeof(float)) % 8 == 0
+ &&
+ "Output sub-vector must be a multiple of 8 bytes!");
+ PRINT_INFO(p.verbosity >= 2,
+ " Total memory allocated is %d bytes",
+ allocator.totalAllocated);
+
+ // Set up DPU parameters
+ dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
+ dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
+ dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
+ dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
+ dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
+
+ // Send data to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying data to DPU");
+ startTimer(&timer);
+ copyToDPU(dpu, (uint8_t *) dpuRowPtrs_h, dpuRowPtrs_m,
+ (dpuNumRows + 1) * sizeof(uint32_t));
+ copyToDPU(dpu, (uint8_t *) dpuNonzeros_h, dpuNonzeros_m,
+ dpuNumNonzeros * sizeof(struct Nonzero));
+ copyToDPU(dpu, (uint8_t *) inVector, dpuInVector_m,
+ numCols * sizeof(float));
+ stopTimer(&timer);
+ writeTime += getElapsedTime(timer);
+
+ }
+ // Send parameters to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying parameters to DPU");
+ startTimer(&timer);
+ copyToDPU(dpu, (uint8_t *) & dpuParams[dpuIdx], dpuParams_m,
+ sizeof(struct DPUParams));
+ stopTimer(&timer);
+ writeTime += getElapsedTime(timer);
+
+ ++dpuIdx;
+
+ }
+ PRINT_INFO(p.verbosity >= 1, " CPU-DPU Time: %f ms",
+ writeTime * 1e3);
+
+ // Run all DPUs
+ PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
+ startTimer(&timer);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+ DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+#if ENERGY
+ DPU_ASSERT(dpu_probe_stop(&probe));
+ double energy;
+ DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+ PRINT_INFO(p.verbosity >= 1, " DPU Energy: %f J", energy);
+#endif
+ stopTimer(&timer);
+ dpuTime += getElapsedTime(timer);
+ PRINT_INFO(p.verbosity >= 1, " DPU Time: %f ms", dpuTime * 1e3);
+
+ // Copy back result
+ PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+ startTimer(&timer);
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
+ if (dpuNumRows > 0) {
+ uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+ copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m,
+ (uint8_t *) (outVector + dpuStartRowIdx),
+ dpuNumRows * sizeof(float));
+ }
+ ++dpuIdx;
+ }
+ stopTimer(&timer);
+ readTime += getElapsedTime(timer);
+ PRINT_INFO(p.verbosity >= 1, " DPU-CPU Time: %f ms", readTime * 1e3);
+
+ // Calculating result on CPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+ float *outVectorReference = malloc(numRows * sizeof(float));
+ for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+ float sum = 0.0f;
+ for (uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
+ uint32_t colIdx = nonzeros[i].col;
+ float value = nonzeros[i].value;
+ sum += inVector[colIdx] * value;
+ }
+ outVectorReference[rowIdx] = sum;
+ }
+
+ // Verify the result
+ PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+ int status = 1;
+ for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+ float diff =
+ (outVectorReference[rowIdx] -
+ outVector[rowIdx]) / outVectorReference[rowIdx];
+ const float tolerance = 0.00001;
+ if (diff > tolerance || diff < -tolerance) {
+ status = 0;
+ PRINT_ERROR
+ ("Mismatch at index %u (CPU result = %f, DPU result = %f)",
+ rowIdx, outVectorReference[rowIdx],
+ outVector[rowIdx]);
+ }
+ }
+
+ startTimer(&timer);
+ DPU_ASSERT(dpu_free(dpu_set));
+ stopTimer(&timer);
+ freeTime += getElapsedTime(timer);
+
+ if (status) {
+ printf
+ ("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
+ numDPUs, numRanks, NR_TASKLETS, "float",
+ csrMatrix.numNonzeros);
+ printf
+ ("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+ allocTime, loadTime, writeTime, dpuTime, readTime,
+ freeTime);
+ printf
+ (" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+ // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+ csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
+ csrMatrix.numNonzeros * sizeof(float) /
+ ((allocTime + loadTime + writeTime + dpuTime + readTime +
+ freeTime) * 1e6));
+ printf
+ (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+ csrMatrix.numNonzeros * sizeof(float) /
+ ((writeTime + dpuTime + readTime) * 1e6),
+ csrMatrix.numNonzeros * sizeof(float) /
+ ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+ csrMatrix.numNonzeros * sizeof(float) /
+ ((allocTime + loadTime + writeTime + dpuTime +
+ readTime) * 1e6));
+ printf
+ (" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+ // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+ csrMatrix.numNonzeros / (dpuTime * 1e6),
+ csrMatrix.numNonzeros /
+ ((allocTime + loadTime + writeTime + dpuTime + readTime +
+ freeTime) * 1e6));
+ printf
+ (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+ csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) *
+ 1e6),
+ csrMatrix.numNonzeros /
+ ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+ csrMatrix.numNonzeros /
+ ((allocTime + loadTime + writeTime + dpuTime +
+ readTime) * 1e6));
+ }
+ // Display DPU Logs
+ if (p.verbosity >= 2) {
+ PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ PRINT("DPU %u:", dpuIdx);
+ DPU_ASSERT(dpu_log_read(dpu, stdout));
+ ++dpuIdx;
+ }
+ }
+ // Deallocate data structures
+ freeCOOMatrix(cooMatrix);
+ freeCSRMatrix(csrMatrix);
+ free(inVector);
+ free(outVector);
+ free(outVectorReference);
+
+ return 0;
}
diff --git a/SpMV/host/mram-management.h b/SpMV/host/mram-management.h
index 627dfde..f2ee031 100644
--- a/SpMV/host/mram-management.h
+++ b/SpMV/host/mram-management.h
@@ -5,33 +5,45 @@
#include "../support/common.h"
#include "../support/utils.h"
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
struct mram_heap_allocator_t {
- uint32_t totalAllocated;
+ uint32_t totalAllocated;
};
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
- allocator->totalAllocated = 0;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+ allocator->totalAllocated = 0;
}
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
- uint32_t ret = allocator->totalAllocated;
- allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
- if(allocator->totalAllocated > DPU_CAPACITY) {
- PRINT_ERROR(" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
- exit(0);
- }
- return ret;
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
+ uint32_t size)
+{
+ uint32_t ret = allocator->totalAllocated;
+ allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
+ if (allocator->totalAllocated > DPU_CAPACITY) {
+ PRINT_ERROR
+ (" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!",
+ allocator->totalAllocated, DPU_CAPACITY);
+ exit(0);
+ }
+ return ret;
}
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
- DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyToDPU(struct dpu_set_t dpu, uint8_t *hostPtr, uint32_t mramIdx,
+ uint32_t size)
+{
+ DPU_ASSERT(dpu_copy_to
+ (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+ ROUND_UP_TO_MULTIPLE_OF_8(size)));
}
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
- DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx,
+ uint8_t *hostPtr, uint32_t size)
+{
+ DPU_ASSERT(dpu_copy_from
+ (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+ ROUND_UP_TO_MULTIPLE_OF_8(size)));
}
#endif
-
diff --git a/SpMV/support/common.h b/SpMV/support/common.h
index 58fede8..6118814 100644
--- a/SpMV/support/common.h
+++ b/SpMV/support/common.h
@@ -8,18 +8,17 @@
#define ROUND_UP_TO_MULTIPLE_OF_8(x) ((((x) + 7)/8)*8)
struct DPUParams {
- uint32_t dpuNumRows; /* Number of rows assigned to the DPU */
- uint32_t dpuRowPtrsOffset; /* Offset of the row pointers */
- uint32_t dpuRowPtrs_m;
- uint32_t dpuNonzeros_m;
- uint32_t dpuInVector_m;
- uint32_t dpuOutVector_m;
+ uint32_t dpuNumRows; /* Number of rows assigned to the DPU */
+ uint32_t dpuRowPtrsOffset; /* Offset of the row pointers */
+ uint32_t dpuRowPtrs_m;
+ uint32_t dpuNonzeros_m;
+ uint32_t dpuInVector_m;
+ uint32_t dpuOutVector_m;
};
struct Nonzero {
- uint32_t col;
- float value;
+ uint32_t col;
+ float value;
};
#endif
-
diff --git a/SpMV/support/matrix.h b/SpMV/support/matrix.h
index d25da1b..ce8745e 100644
--- a/SpMV/support/matrix.h
+++ b/SpMV/support/matrix.h
@@ -9,111 +9,130 @@
#include "utils.h"
struct COOMatrix {
- uint32_t numRows;
- uint32_t numCols;
- uint32_t numNonzeros;
- uint32_t* rowIdxs;
- struct Nonzero* nonzeros;
+ uint32_t numRows;
+ uint32_t numCols;
+ uint32_t numNonzeros;
+ uint32_t *rowIdxs;
+ struct Nonzero *nonzeros;
};
struct CSRMatrix {
- uint32_t numRows;
- uint32_t numCols;
- uint32_t numNonzeros;
- uint32_t* rowPtrs;
- struct Nonzero* nonzeros;
+ uint32_t numRows;
+ uint32_t numCols;
+ uint32_t numNonzeros;
+ uint32_t *rowPtrs;
+ struct Nonzero *nonzeros;
};
-static struct COOMatrix readCOOMatrix(const char* fileName) {
-
- struct COOMatrix cooMatrix;
-
- // Initialize fields
- FILE* fp = fopen(fileName, "r");
- assert(fscanf(fp, "%u", &cooMatrix.numRows));
- if(cooMatrix.numRows%2 == 1) {
- PRINT_WARNING("Reading matrix %s: number of rows must be even. Padding with an extra row.", fileName);
- cooMatrix.numRows++;
- }
- assert(fscanf(fp, "%u", &cooMatrix.numCols));
- assert(fscanf(fp, "%u", &cooMatrix.numNonzeros));
- cooMatrix.rowIdxs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(uint32_t)));
- cooMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(struct Nonzero)));
-
- // Read the nonzeros
- for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
- uint32_t rowIdx;
- assert(fscanf(fp, "%u", &rowIdx));
- cooMatrix.rowIdxs[i] = rowIdx - 1; // File format indexes begin at 1
- uint32_t colIdx;
- assert(fscanf(fp, "%u", &colIdx));
- cooMatrix.nonzeros[i].col = colIdx - 1; // File format indexes begin at 1
- cooMatrix.nonzeros[i].value = 1.0f;
- }
-
- return cooMatrix;
+static struct COOMatrix readCOOMatrix(const char *fileName)
+{
+
+ struct COOMatrix cooMatrix;
+
+ // Initialize fields
+ FILE *fp = fopen(fileName, "r");
+ assert(fscanf(fp, "%u", &cooMatrix.numRows));
+ if (cooMatrix.numRows % 2 == 1) {
+ PRINT_WARNING
+ ("Reading matrix %s: number of rows must be even. Padding with an extra row.",
+ fileName);
+ cooMatrix.numRows++;
+ }
+ assert(fscanf(fp, "%u", &cooMatrix.numCols));
+ assert(fscanf(fp, "%u", &cooMatrix.numNonzeros));
+ cooMatrix.rowIdxs =
+ (uint32_t *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ (cooMatrix.numNonzeros * sizeof(uint32_t)));
+ cooMatrix.nonzeros =
+ (struct Nonzero *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ (cooMatrix.numNonzeros * sizeof(struct Nonzero)));
+
+ // Read the nonzeros
+ for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+ uint32_t rowIdx;
+ assert(fscanf(fp, "%u", &rowIdx));
+ cooMatrix.rowIdxs[i] = rowIdx - 1; // File format indexes begin at 1
+ uint32_t colIdx;
+ assert(fscanf(fp, "%u", &colIdx));
+ cooMatrix.nonzeros[i].col = colIdx - 1; // File format indexes begin at 1
+ cooMatrix.nonzeros[i].value = 1.0f;
+ }
+
+ return cooMatrix;
}
-static void freeCOOMatrix(struct COOMatrix cooMatrix) {
- free(cooMatrix.rowIdxs);
- free(cooMatrix.nonzeros);
+static void freeCOOMatrix(struct COOMatrix cooMatrix)
+{
+ free(cooMatrix.rowIdxs);
+ free(cooMatrix.nonzeros);
}
-static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix) {
-
- struct CSRMatrix csrMatrix;
-
- // Initialize fields
- csrMatrix.numRows = cooMatrix.numRows;
- csrMatrix.numCols = cooMatrix.numCols;
- csrMatrix.numNonzeros = cooMatrix.numNonzeros;
- csrMatrix.rowPtrs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8((csrMatrix.numRows + 1)*sizeof(uint32_t)));
- csrMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrMatrix.numNonzeros*sizeof(struct Nonzero)));
-
- // Histogram rowIdxs
- memset(csrMatrix.rowPtrs, 0, (csrMatrix.numRows + 1)*sizeof(uint32_t));
- for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
- uint32_t rowIdx = cooMatrix.rowIdxs[i];
- csrMatrix.rowPtrs[rowIdx]++;
- }
-
- // Prefix sum rowPtrs
- uint32_t sumBeforeNextRow = 0;
- for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
- uint32_t sumBeforeRow = sumBeforeNextRow;
- sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
- csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
- }
- csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
-
- // Bin the nonzeros
- for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
- uint32_t rowIdx = cooMatrix.rowIdxs[i];
- uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
- csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
- }
-
- // Restore rowPtrs
- for(uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
- csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
- }
- csrMatrix.rowPtrs[0] = 0;
-
- return csrMatrix;
+static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix)
+{
+
+ struct CSRMatrix csrMatrix;
+
+ // Initialize fields
+ csrMatrix.numRows = cooMatrix.numRows;
+ csrMatrix.numCols = cooMatrix.numCols;
+ csrMatrix.numNonzeros = cooMatrix.numNonzeros;
+ csrMatrix.rowPtrs =
+ (uint32_t *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ ((csrMatrix.numRows + 1) * sizeof(uint32_t)));
+ csrMatrix.nonzeros =
+ (struct Nonzero *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ (csrMatrix.numNonzeros * sizeof(struct Nonzero)));
+
+ // Histogram rowIdxs
+ memset(csrMatrix.rowPtrs, 0,
+ (csrMatrix.numRows + 1) * sizeof(uint32_t));
+ for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+ uint32_t rowIdx = cooMatrix.rowIdxs[i];
+ csrMatrix.rowPtrs[rowIdx]++;
+ }
+
+ // Prefix sum rowPtrs
+ uint32_t sumBeforeNextRow = 0;
+ for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+ uint32_t sumBeforeRow = sumBeforeNextRow;
+ sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
+ csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
+ }
+ csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
+
+ // Bin the nonzeros
+ for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+ uint32_t rowIdx = cooMatrix.rowIdxs[i];
+ uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
+ csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
+ }
+
+ // Restore rowPtrs
+ for (uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
+ csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
+ }
+ csrMatrix.rowPtrs[0] = 0;
+
+ return csrMatrix;
}
-static void freeCSRMatrix(struct CSRMatrix csrMatrix) {
- free(csrMatrix.rowPtrs);
- free(csrMatrix.nonzeros);
+static void freeCSRMatrix(struct CSRMatrix csrMatrix)
+{
+ free(csrMatrix.rowPtrs);
+ free(csrMatrix.nonzeros);
}
-static void initVector(float* vec, uint32_t size) {
- for(uint32_t i = 0; i < size; ++i) {
- vec[i] = 1.0f;
- }
+static void initVector(float *vec, uint32_t size)
+{
+ for (uint32_t i = 0; i < size; ++i) {
+ vec[i] = 1.0f;
+ }
}
#endif
-
diff --git a/SpMV/support/params.h b/SpMV/support/params.h
index b4b696c..bf60e79 100644
--- a/SpMV/support/params.h
+++ b/SpMV/support/params.h
@@ -5,42 +5,47 @@
#include "common.h"
#include "utils.h"
-static void usage() {
- PRINT( "\nUsage: ./program [options]"
- "\n"
- "\nBenchmark-specific options:"
- "\n -f <F> input matrix file name (default=data/bcsstk30.mtx)"
- "\n"
- "\nGeneral options:"
- "\n -v <V> verbosity"
- "\n -h help"
- "\n\n");
+static void usage()
+{
+ PRINT("\nUsage: ./program [options]"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -f <F> input matrix file name (default=data/bcsstk30.mtx)"
+ "\n"
+ "\nGeneral options:"
+ "\n -v <V> verbosity" "\n -h help" "\n\n");
}
typedef struct Params {
- const char* fileName;
- unsigned int verbosity;
+ const char *fileName;
+ unsigned int verbosity;
} Params;
-static struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.fileName = "data/bcsstk30.mtx";
- p.verbosity = 1;
- int opt;
- while((opt = getopt(argc, argv, "f:v:h")) >= 0) {
- switch(opt) {
- case 'f': p.fileName = optarg; break;
- case 'v': p.verbosity = atoi(optarg); break;
- case 'h': usage(); exit(0);
- default:
- PRINT_ERROR("Unrecognized option!");
- usage();
- exit(0);
- }
- }
-
- return p;
+static struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.fileName = "data/bcsstk30.mtx";
+ p.verbosity = 1;
+ int opt;
+ while ((opt = getopt(argc, argv, "f:v:h")) >= 0) {
+ switch (opt) {
+ case 'f':
+ p.fileName = optarg;
+ break;
+ case 'v':
+ p.verbosity = atoi(optarg);
+ break;
+ case 'h':
+ usage();
+ exit(0);
+ default:
+ PRINT_ERROR("Unrecognized option!");
+ usage();
+ exit(0);
+ }
+ }
+
+ return p;
}
#endif
-
diff --git a/SpMV/support/timer.h b/SpMV/support/timer.h
index 66e9842..7367b11 100644
--- a/SpMV/support/timer.h
+++ b/SpMV/support/timer.h
@@ -6,22 +6,25 @@
#include <sys/time.h>
typedef struct Timer {
- struct timeval startTime;
- struct timeval endTime;
+ struct timeval startTime;
+ struct timeval endTime;
} Timer;
-static void startTimer(Timer* timer) {
- gettimeofday(&(timer->startTime), NULL);
+static void startTimer(Timer *timer)
+{
+ gettimeofday(&(timer->startTime), NULL);
}
-static void stopTimer(Timer* timer) {
- gettimeofday(&(timer->endTime), NULL);
+static void stopTimer(Timer *timer)
+{
+ gettimeofday(&(timer->endTime), NULL);
}
-static double getElapsedTime(Timer timer) {
- return ((double) ((timer.endTime.tv_sec - timer.startTime.tv_sec)
- + (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
+static double getElapsedTime(Timer timer)
+{
+ return ((double)((timer.endTime.tv_sec - timer.startTime.tv_sec)
+ + (timer.endTime.tv_usec -
+ timer.startTime.tv_usec) / 1.0e6));
}
#endif
-
diff --git a/SpMV/support/utils.h b/SpMV/support/utils.h
index ddb1e2c..ccd8fbd 100644
--- a/SpMV/support/utils.h
+++ b/SpMV/support/utils.h
@@ -8,4 +8,3 @@
#define PRINT(fmt, ...) printf(fmt "\n", ##__VA_ARGS__)
#endif
-