diff options
Diffstat (limited to 'SpMV/host')
-rw-r--r-- | SpMV/host/app.c | 217 | ||||
-rw-r--r-- | SpMV/host/mram-management.h | 37 |
2 files changed, 254 insertions, 0 deletions
diff --git a/SpMV/host/app.c b/SpMV/host/app.c new file mode 100644 index 0000000..8887410 --- /dev/null +++ b/SpMV/host/app.c @@ -0,0 +1,217 @@ +/** +* app.c +* SpMV Host Application Source File +* +*/ +#include <dpu.h> +#include <dpu_log.h> + +#include <assert.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "mram-management.h" +#include "../support/common.h" +#include "../support/matrix.h" +#include "../support/params.h" +#include "../support/timer.h" +#include "../support/utils.h" + +#define DPU_BINARY "./bin/dpu_code" + +#ifndef ENERGY +#define ENERGY 0 +#endif +#if ENERGY +#include <dpu_probe.h> +#endif + +// Main of the Host Application +int main(int argc, char** argv) { + + // Process parameters + struct Params p = input_params(argc, argv); + + // Timing and profiling + Timer timer; + float loadTime = 0.0f, dpuTime = 0.0f, retrieveTime = 0.0f; + #if ENERGY + struct dpu_probe_t probe; + DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); + #endif + + // Allocate DPUs and load binary + struct dpu_set_t dpu_set, dpu; + uint32_t numDPUs; + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs)); + PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs); + + // Initialize SpMV data structures + PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName); + struct COOMatrix cooMatrix = readCOOMatrix(p.fileName); + PRINT_INFO(p.verbosity >= 1, " %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros); + struct CSRMatrix csrMatrix = coo2csr(cooMatrix); + uint32_t numRows = csrMatrix.numRows; + uint32_t numCols = csrMatrix.numCols; + uint32_t* rowPtrs = csrMatrix.rowPtrs; + struct Nonzero* nonzeros = csrMatrix.nonzeros; + float* inVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols*sizeof(float))); + initVector(inVector, numCols); + float* outVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows*sizeof(float))); + + // Partition data structure across DPUs + uint32_t numRowsPerDPU = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/numDPUs + 1); + PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU", numRowsPerDPU); + struct DPUParams dpuParams[numDPUs]; + unsigned int dpuIdx = 0; + PRINT_INFO(p.verbosity == 1, "Copying data to DPUs"); + DPU_FOREACH (dpu_set, dpu) { + + // Allocate parameters + struct mram_heap_allocator_t allocator; + init_allocator(&allocator); + uint32_t dpuParams_m = mram_heap_alloc(&allocator, sizeof(struct DPUParams)); + + // Find DPU's rows + uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU; + uint32_t dpuNumRows; + if(dpuStartRowIdx > numRows) { + dpuNumRows = 0; + } else if(dpuStartRowIdx + numRowsPerDPU > numRows) { + dpuNumRows = numRows - dpuStartRowIdx; + } else { + dpuNumRows = numRowsPerDPU; + } + dpuParams[dpuIdx].dpuNumRows = dpuNumRows; + PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx); + PRINT_INFO(p.verbosity >= 2, " Receives %u rows", dpuNumRows); + + // Partition nonzeros and copy data + if(dpuNumRows > 0) { + + // Find DPU's CSR matrix partition + uint32_t* dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx]; + uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0]; + struct Nonzero* dpuNonzeros_h = &nonzeros[dpuRowPtrsOffset]; + uint32_t dpuNumNonzeros = dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset; + + // Allocate MRAM + uint32_t dpuRowPtrs_m = mram_heap_alloc(&allocator, (dpuNumRows + 1)*sizeof(uint32_t)); + uint32_t dpuNonzeros_m = mram_heap_alloc(&allocator, dpuNumNonzeros*sizeof(struct Nonzero)); + uint32_t dpuInVector_m = mram_heap_alloc(&allocator, numCols*sizeof(float)); + uint32_t dpuOutVector_m = mram_heap_alloc(&allocator, dpuNumRows*sizeof(float)); + assert((dpuNumRows*sizeof(float))%8 == 0 && "Output sub-vector must be a multiple of 8 bytes!"); + PRINT_INFO(p.verbosity >= 2, " Total memory allocated is %d bytes", allocator.totalAllocated); + + // Set up DPU parameters + dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset; + dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m; + dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m; + dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m; + dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m; + + // Send data to DPU + PRINT_INFO(p.verbosity >= 2, " Copying data to DPU"); + startTimer(&timer); + copyToDPU(dpu, (uint8_t*)dpuRowPtrs_h, dpuRowPtrs_m, (dpuNumRows + 1)*sizeof(uint32_t)); + copyToDPU(dpu, (uint8_t*)dpuNonzeros_h, dpuNonzeros_m, dpuNumNonzeros*sizeof(struct Nonzero)); + copyToDPU(dpu, (uint8_t*)inVector, dpuInVector_m, numCols*sizeof(float)); + stopTimer(&timer); + loadTime += getElapsedTime(timer); + + } + + // Send parameters to DPU + PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU"); + startTimer(&timer); + copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m, sizeof(struct DPUParams)); + stopTimer(&timer); + loadTime += getElapsedTime(timer); + + ++dpuIdx; + + } + PRINT_INFO(p.verbosity >= 1, " CPU-DPU Time: %f ms", loadTime*1e3); + + // Run all DPUs + PRINT_INFO(p.verbosity >= 1, "Booting DPUs"); + startTimer(&timer); + #if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); + #endif + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + #if ENERGY + DPU_ASSERT(dpu_probe_stop(&probe)); + double energy; + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); + PRINT_INFO(p.verbosity >= 1, " DPU Energy: %f J", energy); + #endif + stopTimer(&timer); + dpuTime += getElapsedTime(timer); + PRINT_INFO(p.verbosity >= 1, " DPU Time: %f ms", dpuTime*1e3); + + // Copy back result + PRINT_INFO(p.verbosity >= 1, "Copying back the result"); + startTimer(&timer); + dpuIdx = 0; + DPU_FOREACH (dpu_set, dpu) { + unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows; + if(dpuNumRows > 0) { + uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU; + copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m, (uint8_t*)(outVector + dpuStartRowIdx), dpuNumRows*sizeof(float)); + } + ++dpuIdx; + } + stopTimer(&timer); + retrieveTime += getElapsedTime(timer); + PRINT_INFO(p.verbosity >= 1, " DPU-CPU Time: %f ms", retrieveTime*1e3); + if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, retrieveTime*1e3); + + // Calculating result on CPU + PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); + float* outVectorReference = malloc(numRows*sizeof(float)); + for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) { + float sum = 0.0f; + for(uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) { + uint32_t colIdx = nonzeros[i].col; + float value = nonzeros[i].value; + sum += inVector[colIdx]*value; + } + outVectorReference[rowIdx] = sum; + } + + // Verify the result + PRINT_INFO(p.verbosity >= 1, "Verifying the result"); + for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) { + float diff = (outVectorReference[rowIdx] - outVector[rowIdx])/outVectorReference[rowIdx]; + const float tolerance = 0.00001; + if(diff > tolerance || diff < -tolerance) { + PRINT_ERROR("Mismatch at index %u (CPU result = %f, DPU result = %f)", rowIdx, outVectorReference[rowIdx], outVector[rowIdx]); + } + } + + // Display DPU Logs + if(p.verbosity >= 2) { + PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:"); + dpuIdx = 0; + DPU_FOREACH (dpu_set, dpu) { + PRINT("DPU %u:", dpuIdx); + DPU_ASSERT(dpu_log_read(dpu, stdout)); + ++dpuIdx; + } + } + + // Deallocate data structures + freeCOOMatrix(cooMatrix); + freeCSRMatrix(csrMatrix); + free(inVector); + free(outVector); + free(outVectorReference); + + return 0; +} diff --git a/SpMV/host/mram-management.h b/SpMV/host/mram-management.h new file mode 100644 index 0000000..627dfde --- /dev/null +++ b/SpMV/host/mram-management.h @@ -0,0 +1,37 @@ + +#ifndef _MRAM_MANAGEMENT_H_ +#define _MRAM_MANAGEMENT_H_ + +#include "../support/common.h" +#include "../support/utils.h" + +#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB + +struct mram_heap_allocator_t { + uint32_t totalAllocated; +}; + +static void init_allocator(struct mram_heap_allocator_t* allocator) { + allocator->totalAllocated = 0; +} + +static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) { + uint32_t ret = allocator->totalAllocated; + allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size); + if(allocator->totalAllocated > DPU_CAPACITY) { + PRINT_ERROR(" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY); + exit(0); + } + return ret; +} + +static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) { + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size))); +} + +static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) { + DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size))); +} + +#endif + |