summary | refs | log | tree | commit | diff
path: root/SpMV/host
diff options
context:
space:
mode:
Diffstat (limited to 'SpMV/host')
-rw-r--r-- SpMV/host/app.c 217
-rw-r--r-- SpMV/host/mram-management.h 37
2 files changed, 254 insertions, 0 deletions
diff --git a/SpMV/host/app.c b/SpMV/host/app.c
new file mode 100644
index 0000000..8887410
--- /dev/null
+++ b/SpMV/host/app.c
@@ -0,0 +1,217 @@
+/**
+* app.c
+* SpMV Host Application Source File
+*
+*/
+#include <dpu.h>
+#include <dpu_log.h>
+
+#include <assert.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "mram-management.h"
+#include "../support/common.h"
+#include "../support/matrix.h"
+#include "../support/params.h"
+#include "../support/timer.h"
+#include "../support/utils.h"
+
+#define DPU_BINARY "./bin/dpu_code"
+
+#ifndef ENERGY
+#define ENERGY 0
+#endif
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+/**
+ * Allocates a host buffer for a float vector, terminating the program on failure.
+ *
+ * @param numBytes Size of the buffer in bytes.
+ * @param what     Short description of the buffer, used in the error message.
+ * @return Pointer to the buffer (may be NULL only when numBytes is 0).
+ */
+static float* allocVector(size_t numBytes, const char* what) {
+    float* vector = malloc(numBytes);
+    // malloc(0) may legitimately return NULL, so only a non-empty request counts as a failure
+    if(vector == NULL && numBytes > 0) {
+        PRINT_ERROR("Failed to allocate %zu bytes for the %s", numBytes, what);
+        exit(EXIT_FAILURE);
+    }
+    return vector;
+}
+
+/**
+ * Main of the Host Application
+ *
+ * Reads a sparse matrix, converts it to CSR, assigns a contiguous range of rows
+ * to each allocated DPU, copies each DPU its partition together with the whole
+ * input vector, launches the DPU binary, copies the output sub-vectors back and
+ * compares them against the same product computed on the CPU.
+ *
+ * Mismatches are reported through PRINT_ERROR; the process still returns 0.
+ */
+int main(int argc, char** argv) {
+
+    // Process parameters
+    struct Params p = input_params(argc, argv);
+
+    // Timing and profiling
+    Timer timer;
+    float loadTime = 0.0f, dpuTime = 0.0f, retrieveTime = 0.0f;
+    #if ENERGY
+    struct dpu_probe_t probe;
+    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+    #endif
+
+    // Allocate DPUs and load binary
+    struct dpu_set_t dpu_set, dpu;
+    uint32_t numDPUs;
+    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+    PRINT_INFO(p.verbosity >= 1, "Allocated %u DPU(s)", numDPUs);
+
+    // Initialize SpMV data structures
+    PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+    struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+    PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+    struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+    uint32_t numRows = csrMatrix.numRows;
+    uint32_t numCols = csrMatrix.numCols;
+    uint32_t* rowPtrs = csrMatrix.rowPtrs;
+    struct Nonzero* nonzeros = csrMatrix.nonzeros;
+    // Both vectors are padded to a multiple of 8 bytes because the DPU copy helpers round transfer sizes up
+    float* inVector = allocVector(ROUND_UP_TO_MULTIPLE_OF_8(numCols*sizeof(float)), "input vector");
+    initVector(inVector, numCols);
+    float* outVector = allocVector(ROUND_UP_TO_MULTIPLE_OF_8(numRows*sizeof(float)), "output vector");
+
+    // Partition data structure across DPUs
+    uint32_t numRowsPerDPU = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/numDPUs + 1);
+    PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU", numRowsPerDPU);
+    struct DPUParams dpuParams[numDPUs];
+    // DPUs that receive no rows only get dpuNumRows set below; zero the rest so no indeterminate bytes are sent
+    memset(dpuParams, 0, sizeof(dpuParams));
+    unsigned int dpuIdx = 0;
+    PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
+    DPU_FOREACH (dpu_set, dpu) {
+
+        // Allocate parameters (always the first allocation of this DPU's MRAM heap)
+        struct mram_heap_allocator_t allocator;
+        init_allocator(&allocator);
+        uint32_t dpuParams_m = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+        // Find DPU's rows
+        uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
+        uint32_t dpuNumRows;
+        if(dpuStartRowIdx >= numRows) {
+            dpuNumRows = 0; // More DPUs than row chunks: this DPU stays idle
+        } else if(dpuStartRowIdx + numRowsPerDPU > numRows) {
+            dpuNumRows = numRows - dpuStartRowIdx; // Last, partially filled chunk
+        } else {
+            dpuNumRows = numRowsPerDPU;
+        }
+        dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
+        PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
+        PRINT_INFO(p.verbosity >= 2, " Receives %u rows", dpuNumRows);
+
+        // Partition nonzeros and copy data
+        if(dpuNumRows > 0) {
+
+            // Find DPU's CSR matrix partition
+            uint32_t* dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
+            uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
+            struct Nonzero* dpuNonzeros_h = &nonzeros[dpuRowPtrsOffset];
+            uint32_t dpuNumNonzeros = dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
+
+            // Allocate MRAM
+            uint32_t dpuRowPtrs_m = mram_heap_alloc(&allocator, (dpuNumRows + 1)*sizeof(uint32_t));
+            uint32_t dpuNonzeros_m = mram_heap_alloc(&allocator, dpuNumNonzeros*sizeof(struct Nonzero));
+            uint32_t dpuInVector_m = mram_heap_alloc(&allocator, numCols*sizeof(float));
+            uint32_t dpuOutVector_m = mram_heap_alloc(&allocator, dpuNumRows*sizeof(float));
+            assert((dpuNumRows*sizeof(float))%8 == 0 && "Output sub-vector must be a multiple of 8 bytes!");
+            PRINT_INFO(p.verbosity >= 2, " Total memory allocated is %u bytes", allocator.totalAllocated);
+
+            // Set up DPU parameters
+            dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
+            dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
+            dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
+            dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
+            dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
+
+            // Send data to DPU
+            PRINT_INFO(p.verbosity >= 2, " Copying data to DPU");
+            startTimer(&timer);
+            copyToDPU(dpu, (uint8_t*)dpuRowPtrs_h, dpuRowPtrs_m, (dpuNumRows + 1)*sizeof(uint32_t));
+            copyToDPU(dpu, (uint8_t*)dpuNonzeros_h, dpuNonzeros_m, dpuNumNonzeros*sizeof(struct Nonzero));
+            copyToDPU(dpu, (uint8_t*)inVector, dpuInVector_m, numCols*sizeof(float));
+            stopTimer(&timer);
+            loadTime += getElapsedTime(timer);
+
+        }
+
+        // Send parameters to DPU
+        PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU");
+        startTimer(&timer);
+        copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m, sizeof(struct DPUParams));
+        stopTimer(&timer);
+        loadTime += getElapsedTime(timer);
+
+        ++dpuIdx;
+
+    }
+    PRINT_INFO(p.verbosity >= 1, " CPU-DPU Time: %f ms", loadTime*1e3);
+
+    // Run all DPUs
+    PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
+    startTimer(&timer);
+    #if ENERGY
+    DPU_ASSERT(dpu_probe_start(&probe));
+    #endif
+    DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+    // Stop the clock right after the launch so probe readout and printing do not inflate the kernel time
+    stopTimer(&timer);
+    dpuTime += getElapsedTime(timer);
+    #if ENERGY
+    DPU_ASSERT(dpu_probe_stop(&probe));
+    double energy;
+    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+    PRINT_INFO(p.verbosity >= 1, " DPU Energy: %f J", energy);
+    #endif
+    PRINT_INFO(p.verbosity >= 1, " DPU Time: %f ms", dpuTime*1e3);
+
+    // Copy back result
+    PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+    startTimer(&timer);
+    dpuIdx = 0;
+    DPU_FOREACH (dpu_set, dpu) {
+        unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
+        if(dpuNumRows > 0) {
+            uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
+            copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m, (uint8_t*)(outVector + dpuStartRowIdx), dpuNumRows*sizeof(float));
+        }
+        ++dpuIdx;
+    }
+    stopTimer(&timer);
+    retrieveTime += getElapsedTime(timer);
+    PRINT_INFO(p.verbosity >= 1, " DPU-CPU Time: %f ms", retrieveTime*1e3);
+    if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, retrieveTime*1e3);
+
+    // Calculating result on CPU
+    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+    float* outVectorReference = allocVector(numRows*sizeof(float), "reference output vector");
+    for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+        float sum = 0.0f;
+        for(uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
+            uint32_t colIdx = nonzeros[i].col;
+            float value = nonzeros[i].value;
+            sum += inVector[colIdx]*value;
+        }
+        outVectorReference[rowIdx] = sum;
+    }
+
+    // Verify the result
+    PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+    const float tolerance = 0.00001;
+    for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+        float expected = outVectorReference[rowIdx];
+        float actual = outVector[rowIdx];
+        // Exact matches (including 0 == 0) are accepted without dividing, which avoids a 0/0 relative error
+        if(expected == actual) {
+            continue;
+        }
+        float diff = (expected - actual)/expected;
+        // Written as a negated "within tolerance" test so that a NaN result is reported instead of silently passing
+        if(!(diff <= tolerance && diff >= -tolerance)) {
+            PRINT_ERROR("Mismatch at index %u (CPU result = %f, DPU result = %f)", rowIdx, expected, actual);
+        }
+    }
+
+    // Display DPU Logs
+    if(p.verbosity >= 2) {
+        PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+        dpuIdx = 0;
+        DPU_FOREACH (dpu_set, dpu) {
+            PRINT("DPU %u:", dpuIdx);
+            DPU_ASSERT(dpu_log_read(dpu, stdout));
+            ++dpuIdx;
+        }
+    }
+
+    // Deallocate data structures
+    freeCOOMatrix(cooMatrix);
+    freeCSRMatrix(csrMatrix);
+    free(inVector);
+    free(outVector);
+    free(outVectorReference);
+
+    // Release the DPUs reserved by dpu_alloc
+    DPU_ASSERT(dpu_free(dpu_set));
+
+    return 0;
+}
diff --git a/SpMV/host/mram-management.h b/SpMV/host/mram-management.h
new file mode 100644
index 0000000..627dfde
--- /dev/null
+++ b/SpMV/host/mram-management.h
@@ -0,0 +1,37 @@
+
+#ifndef _MRAM_MANAGEMENT_H_
+#define _MRAM_MANAGEMENT_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "../support/common.h"
+#include "../support/utils.h"
+
+#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+
+struct mram_heap_allocator_t { // Host-side bookkeeping for one DPU's MRAM heap; reset with init_allocator()
+ uint32_t totalAllocated; // Bytes reserved so far; mram_heap_alloc() returns this value as the next offset
+};
+
+/**
+ * Resets an MRAM heap allocator so that the next allocation starts at offset 0.
+ *
+ * @param allocator Allocator state to reset; it is dereferenced unconditionally.
+ */
+static void init_allocator(struct mram_heap_allocator_t* allocator) {
+    *allocator = (struct mram_heap_allocator_t){ .totalAllocated = 0 };
+}
+
+/**
+ * Reserves a region of a DPU's MRAM heap.
+ *
+ * The requested size is rounded up to a multiple of 8 bytes and appended after
+ * everything reserved so far. Nothing is ever released; call init_allocator()
+ * to start over.
+ *
+ * @param allocator Allocator state for the target DPU; updated on success.
+ * @param size      Number of bytes requested.
+ * @return Byte offset of the reserved region from the start of the heap.
+ *
+ * If the heap would grow beyond DPU_CAPACITY, an error is printed and the
+ * process exits with a failure status.
+ */
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
+    uint32_t ret = allocator->totalAllocated;
+    // Round up and accumulate in 64 bits so a huge request cannot wrap around and slip past the capacity check
+    uint64_t roundedSize = ((uint64_t)size + 7u) & ~(uint64_t)7u;
+    uint64_t newTotal = (uint64_t)ret + roundedSize;
+    if(newTotal > (uint64_t)DPU_CAPACITY) {
+        PRINT_ERROR(" Total memory allocated is %llu bytes which exceeds the DPU capacity (%d bytes)!", (unsigned long long)newTotal, DPU_CAPACITY);
+        exit(EXIT_FAILURE); // Running out of MRAM is an error, so do not report success to the caller's shell
+    }
+    allocator->totalAllocated = (uint32_t)newTotal; // Fits: newTotal <= DPU_CAPACITY
+    return ret;
+}
+
+/**
+ * Copies a host buffer into a DPU's MRAM heap.
+ *
+ * The transfer length is `size` rounded up to a multiple of 8 bytes, so the
+ * host buffer must be readable for that rounded length.
+ *
+ * @param dpu     Target DPU.
+ * @param hostPtr Source buffer on the host.
+ * @param mramIdx Destination byte offset within the MRAM heap.
+ * @param size    Number of bytes to copy, before rounding.
+ */
+static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
+    DPU_ASSERT(dpu_copy_to(
+        dpu,
+        DPU_MRAM_HEAP_POINTER_NAME,
+        mramIdx,
+        hostPtr,
+        ROUND_UP_TO_MULTIPLE_OF_8(size)
+    ));
+}
+
+/**
+ * Copies a region of a DPU's MRAM heap into a host buffer.
+ *
+ * The transfer length is `size` rounded up to a multiple of 8 bytes, so the
+ * host buffer must be writable for that rounded length.
+ *
+ * @param dpu     Source DPU.
+ * @param mramIdx Source byte offset within the MRAM heap.
+ * @param hostPtr Destination buffer on the host.
+ * @param size    Number of bytes to copy, before rounding.
+ */
+static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
+    DPU_ASSERT(dpu_copy_from(
+        dpu,
+        DPU_MRAM_HEAP_POINTER_NAME,
+        mramIdx,
+        hostPtr,
+        ROUND_UP_TO_MULTIPLE_OF_8(size)
+    ));
+}
+
+#endif
+