9 files changed, 681 insertions, 541 deletions
diff --git a/SpMV/baselines/cpu/app.c b/SpMV/baselines/cpu/app.c
index 8d360ee..e33761f 100644
--- a/SpMV/baselines/cpu/app.c
+++ b/SpMV/baselines/cpu/app.c
@@ -13,60 +13,63 @@
 #include "../../support/timer.h"
 #include "../../support/utils.h"
 
-int main(int argc, char** argv) {
+int main(int argc, char **argv)
+{
 
-    // Process parameters
-    struct Params p = input_params(argc, argv);
+	// Process parameters
+	struct Params p = input_params(argc, argv);
 
-    // Initialize SpMV data structures
-    PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
-    struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
-    PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
-    struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
-    float* inVector = malloc(csrMatrix.numCols*sizeof(float));
-    float* outVector = malloc(csrMatrix.numRows*sizeof(float));
-    initVector(inVector, csrMatrix.numCols);
+	// Initialize SpMV data structures
+	PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+	struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+	PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros",
+		   cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+	struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+	float *inVector = malloc(csrMatrix.numCols * sizeof(float));
+	float *outVector = malloc(csrMatrix.numRows * sizeof(float));
+	initVector(inVector, csrMatrix.numCols);
 
-    // Calculating result on CPU
-    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
-    //omp_set_num_threads(4);
-    Timer timer;
-    startTimer(&timer);
-    #pragma omp parallel for
-    for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
-        float sum = 0.0f;
-        for(uint32_t i = csrMatrix.rowPtrs[rowIdx]; i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
-            uint32_t colIdx = csrMatrix.nonzeros[i].col;
-            float value = csrMatrix.nonzeros[i].value;
-            sum += inVector[colIdx]*value;
-        }
-        outVector[rowIdx] = sum;
-    }
-    stopTimer(&timer);
+	// Calculating result on CPU
+	PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+	//omp_set_num_threads(4);
+	Timer timer;
+	startTimer(&timer);
+#pragma omp parallel for
+	for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+		float sum = 0.0f;
+		for (uint32_t i = csrMatrix.rowPtrs[rowIdx];
+		     i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
+			uint32_t colIdx = csrMatrix.nonzeros[i].col;
+			float value = csrMatrix.nonzeros[i].value;
+			sum += inVector[colIdx] * value;
+		}
+		outVector[rowIdx] = sum;
+	}
+	stopTimer(&timer);
 
-
-    unsigned int nr_threads = 0;
+	unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-    nr_threads++;
-
+	nr_threads++;
 
-    // coomatrix / csrmatrix use uint32_t indexes and float values
-    printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
-        " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
-        nr_threads, csrMatrix.numNonzeros,
-        csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer)*1e6),
-        csrMatrix.numNonzeros / (getElapsedTime(timer)*1e6),
-        getElapsedTime(timer)*1e6);
-    //if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
-    PRINT_INFO(p.verbosity >= 1, "    Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+	// coomatrix / csrmatrix use uint32_t indexes and float values
+	printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
+	       " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
+	       nr_threads, csrMatrix.numNonzeros,
+	       csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer) *
+							1e6),
+	       csrMatrix.numNonzeros / (getElapsedTime(timer) * 1e6),
+	       getElapsedTime(timer) * 1e6);
+	//if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+	PRINT_INFO(p.verbosity >= 1, "    Elapsed time: %f ms",
+		   getElapsedTime(timer) * 1e3);
 
-    // Deallocate data structures
-    freeCOOMatrix(cooMatrix);
-    freeCSRMatrix(csrMatrix);
-    free(inVector);
-    free(outVector);
+	// Deallocate data structures
+	freeCOOMatrix(cooMatrix);
+	freeCSRMatrix(csrMatrix);
+	free(inVector);
+	free(outVector);
 
-    return 0;
+	return 0;
 
 }
diff --git a/SpMV/dpu/task.c b/SpMV/dpu/task.c
index 589b6f4..501a62a 100644
--- a/SpMV/dpu/task.c
+++ b/SpMV/dpu/task.c
@@ -20,120 +20,164 @@
 BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 // main
-int main() {
-
-    if(me() == 0) {
-        mem_reset(); // Reset the heap
-    }
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    // Load parameters
-    uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
-    struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-    mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-    uint32_t numRows = params_w->dpuNumRows;
-
-    // Sanity check
-    if(me() == 0) {
-        if(numRows%2 != 0) {
-            // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
-            PRINT_ERROR("The number of rows is not a multiple of two!");
-        }
-    }
-
-    // Identify tasklet's rows
-    uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/NR_TASKLETS + 1); // Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
-    uint32_t taskletRowsStart = me()*numRowsPerTasklet;
-    uint32_t taskletNumRows;
-    if(taskletRowsStart > numRows) {
-        taskletNumRows = 0;
-    } else if(taskletRowsStart + numRowsPerTasklet > numRows) {
-        taskletNumRows = numRows - taskletRowsStart;
-    } else {
-        taskletNumRows = numRowsPerTasklet;
-    }
-
-    // Only process tasklets with nonzero number of rows
-    if(taskletNumRows > 0) {
-
-        // Extract parameters
-        uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
-        uint32_t rowPtrs_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
-        uint32_t nonzeros_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuNonzeros_m;
-        uint32_t inVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuInVector_m;
-        uint32_t outVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuOutVector_m;
-
-        // Initialize row pointer sequential reader
-        uint32_t taskletRowPtrs_m = rowPtrs_m + taskletRowsStart*sizeof(uint32_t);
-        seqreader_t rowPtrReader;
-        uint32_t* taskletRowPtrs_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletRowPtrs_m, &rowPtrReader);
-        uint32_t firstRowPtr = *taskletRowPtrs_w;
-
-        // Initialize nonzeros sequential reader
-        uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
-        uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart*sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
-        seqreader_t nonzerosReader;
-        struct Nonzero* taskletNonzeros_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletNonzeros_m, &nonzerosReader);
-
-        // Initialize input vector cache
-        uint32_t inVectorTileSize = 64;
-        float* inVectorTile_w = mem_alloc(inVectorTileSize*sizeof(float));
-        mram_read((__mram_ptr void const*)inVector_m, inVectorTile_w, 256);
-        uint32_t currInVectorTileIdx = 0;
-
-        // Initialize output vector cache
-        uint32_t taskletOutVector_m = outVector_m + taskletRowsStart*sizeof(float);
-        uint32_t outVectorTileSize = 64;
-        float* outVectorTile_w = mem_alloc(outVectorTileSize*sizeof(float));
-
-        // SpMV
-        uint32_t nextRowPtr = firstRowPtr;
-        for(uint32_t row = 0; row < taskletNumRows; ++row) {
-
-            // Find row nonzeros
-            taskletRowPtrs_w = seqread_get(taskletRowPtrs_w, sizeof(uint32_t), &rowPtrReader);
-            uint32_t rowPtr = nextRowPtr;
-            nextRowPtr = *taskletRowPtrs_w;
-            uint32_t taskletNNZ = nextRowPtr - rowPtr;
-
-            // Multiply row with vector
-            float outValue = 0.0f;
-            for(uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
-
-                // Get matrix value
-                float matValue = taskletNonzeros_w->value;
-
-                // Get input vector value
-                uint32_t col = taskletNonzeros_w->col;
-                uint32_t inVectorTileIdx = col/inVectorTileSize;
-                uint32_t inVectorTileOffset = col%inVectorTileSize;
-                if(inVectorTileIdx != currInVectorTileIdx) {
-                    mram_read((__mram_ptr void const*)(inVector_m + inVectorTileIdx*inVectorTileSize*sizeof(float)), inVectorTile_w, 256);
-                    currInVectorTileIdx = inVectorTileIdx;
-                }
-                float inValue = inVectorTile_w[inVectorTileOffset];
-
-                // Multiply and add
-                outValue += matValue*inValue;
-
-                // Read next nonzero
-                taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused
-
-            }
-
-            // Store output
-            uint32_t outVectorTileIdx = row/outVectorTileSize;
-            uint32_t outVectorTileOffset = row%outVectorTileSize;
-            outVectorTile_w[outVectorTileOffset] = outValue;
-            if(outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
-                mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), 256);
-            } else if(row == taskletNumRows - 1) { // Last row for tasklet
-                mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), (taskletNumRows%outVectorTileSize)*sizeof(float));
-            }
-
-        }
-    }
-
-    return 0;
+int main()
+{
+
+	if (me() == 0) {
+		mem_reset();	// Reset the heap
+	}
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	// Load parameters
+	uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+	struct DPUParams *params_w =
+	    (struct DPUParams *)
+	    mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+	mram_read((__mram_ptr void const *)params_m, params_w,
+		  ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+	uint32_t numRows = params_w->dpuNumRows;
+
+	// Sanity check
+	if (me() == 0) {
+		if (numRows % 2 != 0) {
+			// The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
+			PRINT_ERROR
+			    ("The number of rows is not a multiple of two!");
+		}
+	}
+	// Identify tasklet's rows
+	uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / NR_TASKLETS + 1);	// Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
+	uint32_t taskletRowsStart = me() * numRowsPerTasklet;
+	uint32_t taskletNumRows;
+	if (taskletRowsStart > numRows) {
+		taskletNumRows = 0;
+	} else if (taskletRowsStart + numRowsPerTasklet > numRows) {
+		taskletNumRows = numRows - taskletRowsStart;
+	} else {
+		taskletNumRows = numRowsPerTasklet;
+	}
+
+	// Only process tasklets with nonzero number of rows
+	if (taskletNumRows > 0) {
+
+		// Extract parameters
+		uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
+		uint32_t rowPtrs_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
+		uint32_t nonzeros_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+		    params_w->dpuNonzeros_m;
+		uint32_t inVector_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+		    params_w->dpuInVector_m;
+		uint32_t outVector_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+		    params_w->dpuOutVector_m;
+
+		// Initialize row pointer sequential reader
+		uint32_t taskletRowPtrs_m =
+		    rowPtrs_m + taskletRowsStart * sizeof(uint32_t);
+		seqreader_t rowPtrReader;
+		uint32_t *taskletRowPtrs_w =
+		    seqread_init(seqread_alloc(),
+				 (__mram_ptr void *)taskletRowPtrs_m,
+				 &rowPtrReader);
+		uint32_t firstRowPtr = *taskletRowPtrs_w;
+
+		// Initialize nonzeros sequential reader
+		uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
+		uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart * sizeof(struct Nonzero);	// 8-byte aligned because Nonzero is 8 bytes
+		seqreader_t nonzerosReader;
+		struct Nonzero *taskletNonzeros_w =
+		    seqread_init(seqread_alloc(),
+				 (__mram_ptr void *)taskletNonzeros_m,
+				 &nonzerosReader);
+
+		// Initialize input vector cache
+		uint32_t inVectorTileSize = 64;
+		float *inVectorTile_w =
+		    mem_alloc(inVectorTileSize * sizeof(float));
+		mram_read((__mram_ptr void const *)inVector_m, inVectorTile_w,
+			  256);
+		uint32_t currInVectorTileIdx = 0;
+
+		// Initialize output vector cache
+		uint32_t taskletOutVector_m =
+		    outVector_m + taskletRowsStart * sizeof(float);
+		uint32_t outVectorTileSize = 64;
+		float *outVectorTile_w =
+		    mem_alloc(outVectorTileSize * sizeof(float));
+
+		// SpMV
+		uint32_t nextRowPtr = firstRowPtr;
+		for (uint32_t row = 0; row < taskletNumRows; ++row) {
+
+			// Find row nonzeros
+			taskletRowPtrs_w =
+			    seqread_get(taskletRowPtrs_w, sizeof(uint32_t),
+					&rowPtrReader);
+			uint32_t rowPtr = nextRowPtr;
+			nextRowPtr = *taskletRowPtrs_w;
+			uint32_t taskletNNZ = nextRowPtr - rowPtr;
+
+			// Multiply row with vector
+			float outValue = 0.0f;
+			for (uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
+
+				// Get matrix value
+				float matValue = taskletNonzeros_w->value;
+
+				// Get input vector value
+				uint32_t col = taskletNonzeros_w->col;
+				uint32_t inVectorTileIdx =
+				    col / inVectorTileSize;
+				uint32_t inVectorTileOffset =
+				    col % inVectorTileSize;
+				if (inVectorTileIdx != currInVectorTileIdx) {
+					mram_read((__mram_ptr void const
+						   *)(inVector_m +
+						      inVectorTileIdx *
+						      inVectorTileSize *
+						      sizeof(float)),
+						  inVectorTile_w, 256);
+					currInVectorTileIdx = inVectorTileIdx;
+				}
+				float inValue =
+				    inVectorTile_w[inVectorTileOffset];
+
+				// Multiply and add
+				outValue += matValue * inValue;
+
+				// Read next nonzero
+				taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader);	// Last read will be out of bounds and unused
+
+			}
+
+			// Store output
+			uint32_t outVectorTileIdx = row / outVectorTileSize;
+			uint32_t outVectorTileOffset = row % outVectorTileSize;
+			outVectorTile_w[outVectorTileOffset] = outValue;
+			if (outVectorTileOffset == outVectorTileSize - 1) {	// Last element in tile
+				mram_write(outVectorTile_w,
+					   (__mram_ptr void
+					    *)(taskletOutVector_m +
+					       outVectorTileIdx *
+					       outVectorTileSize *
+					       sizeof(float)), 256);
+			} else if (row == taskletNumRows - 1) {	// Last row for tasklet
+				mram_write(outVectorTile_w,
+					   (__mram_ptr void
+					    *)(taskletOutVector_m +
+					       outVectorTileIdx *
+					       outVectorTileSize *
+					       sizeof(float)),
+					   (taskletNumRows %
+					    outVectorTileSize) * sizeof(float));
+			}
+
+		}
+	}
+
+	return 0;
 }
diff --git a/SpMV/host/app.c b/SpMV/host/app.c
index be9ee37..ffccb70 100644
--- a/SpMV/host/app.c
+++ b/SpMV/host/app.c
@@ -33,228 +33,284 @@
 #endif
 
 // Main of the Host Application
-int main(int argc, char** argv) {
+int main(int argc, char **argv)
+{
 
-    // Process parameters
-    struct Params p = input_params(argc, argv);
+	// Process parameters
+	struct Params p = input_params(argc, argv);
 
-    // Timing and profiling
-    Timer timer;
-    double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime = 0.0f, readTime = 0.0f, freeTime = 0.0f;
-    #if ENERGY
-    struct dpu_probe_t probe;
-    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
-    #endif
-
-    // Allocate DPUs and load binary
-    struct dpu_set_t dpu_set, dpu;
-    uint32_t numDPUs, numRanks;
-
-    startTimer(&timer);
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    stopTimer(&timer);
-    allocTime += getElapsedTime(timer);
-
-    startTimer(&timer);
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    stopTimer(&timer);
-    loadTime += getElapsedTime(timer);
-
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
-    assert(numDPUs == NR_DPUS);
-    PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
-    // Initialize SpMV data structures
-    PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
-    struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
-    PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
-    struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
-    uint32_t numRows = csrMatrix.numRows;
-    uint32_t numCols = csrMatrix.numCols;
-    uint32_t* rowPtrs = csrMatrix.rowPtrs;
-    struct Nonzero* nonzeros = csrMatrix.nonzeros;
-    float* inVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols*sizeof(float)));
-    initVector(inVector, numCols);
-    float* outVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows*sizeof(float)));
-
-    // Partition data structure across DPUs
-    uint32_t numRowsPerDPU = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/numDPUs + 1);
-    PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU", numRowsPerDPU);
-    struct DPUParams dpuParams[numDPUs];
-    unsigned int dpuIdx = 0;
-    PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
-    DPU_FOREACH (dpu_set, dpu) {
-
-        // Allocate parameters
-        struct mram_heap_allocator_t allocator;
-        init_allocator(&allocator);
-        uint32_t dpuParams_m = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
-        // Find DPU's rows
-        uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
-        uint32_t dpuNumRows;
-        if(dpuStartRowIdx > numRows) {
-            dpuNumRows = 0;
-        } else if(dpuStartRowIdx + numRowsPerDPU > numRows) {
-            dpuNumRows = numRows - dpuStartRowIdx;
-        } else {
-            dpuNumRows = numRowsPerDPU;
-        }
-        dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
-        PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
-        PRINT_INFO(p.verbosity >= 2, "        Receives %u rows", dpuNumRows);
-
-        // Partition nonzeros and copy data
-        if(dpuNumRows > 0) {
-
-            // Find DPU's CSR matrix partition
-            uint32_t* dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
-            uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
-            struct Nonzero* dpuNonzeros_h = &nonzeros[dpuRowPtrsOffset];
-            uint32_t dpuNumNonzeros = dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
-
-            // Allocate MRAM
-            uint32_t dpuRowPtrs_m = mram_heap_alloc(&allocator, (dpuNumRows + 1)*sizeof(uint32_t));
-            uint32_t dpuNonzeros_m = mram_heap_alloc(&allocator, dpuNumNonzeros*sizeof(struct Nonzero));
-            uint32_t dpuInVector_m = mram_heap_alloc(&allocator, numCols*sizeof(float));
-            uint32_t dpuOutVector_m = mram_heap_alloc(&allocator, dpuNumRows*sizeof(float));
-            assert((dpuNumRows*sizeof(float))%8 == 0 && "Output sub-vector must be a multiple of 8 bytes!");
-            PRINT_INFO(p.verbosity >= 2, "        Total memory allocated is %d bytes", allocator.totalAllocated);
-
-            // Set up DPU parameters
-            dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
-            dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
-            dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
-            dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
-            dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
-
-            // Send data to DPU
-            PRINT_INFO(p.verbosity >= 2, "        Copying data to DPU");
-            startTimer(&timer);
-            copyToDPU(dpu, (uint8_t*)dpuRowPtrs_h, dpuRowPtrs_m, (dpuNumRows + 1)*sizeof(uint32_t));
-            copyToDPU(dpu, (uint8_t*)dpuNonzeros_h, dpuNonzeros_m, dpuNumNonzeros*sizeof(struct Nonzero));
-            copyToDPU(dpu, (uint8_t*)inVector, dpuInVector_m, numCols*sizeof(float));
-            stopTimer(&timer);
-            writeTime += getElapsedTime(timer);
-
-        }
-
-        // Send parameters to DPU
-        PRINT_INFO(p.verbosity >= 2, "        Copying parameters to DPU");
-        startTimer(&timer);
-        copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m, sizeof(struct DPUParams));
-        stopTimer(&timer);
-        writeTime += getElapsedTime(timer);
-
-        ++dpuIdx;
-
-    }
-    PRINT_INFO(p.verbosity >= 1, "    CPU-DPU Time: %f ms", writeTime*1e3);
-
-    // Run all DPUs
-    PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
-    startTimer(&timer);
-    #if ENERGY
-    DPU_ASSERT(dpu_probe_start(&probe));
-    #endif
-    DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
-    #if ENERGY
-    DPU_ASSERT(dpu_probe_stop(&probe));
-    double energy;
-    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
-    PRINT_INFO(p.verbosity >= 1, "    DPU Energy: %f J", energy);
-    #endif
-    stopTimer(&timer);
-    dpuTime += getElapsedTime(timer);
-    PRINT_INFO(p.verbosity >= 1, "    DPU Time: %f ms", dpuTime*1e3);
-
-    // Copy back result
-    PRINT_INFO(p.verbosity >= 1, "Copying back the result");
-    startTimer(&timer);
-    dpuIdx = 0;
-    DPU_FOREACH (dpu_set, dpu) {
-        unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
-        if(dpuNumRows > 0) {
-            uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
-            copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m, (uint8_t*)(outVector + dpuStartRowIdx), dpuNumRows*sizeof(float));
-        }
-        ++dpuIdx;
-    }
-    stopTimer(&timer);
-    readTime += getElapsedTime(timer);
-    PRINT_INFO(p.verbosity >= 1, "    DPU-CPU Time: %f ms", readTime*1e3);
-
-    // Calculating result on CPU
-    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
-    float* outVectorReference = malloc(numRows*sizeof(float));
-    for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
-        float sum = 0.0f;
-        for(uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
-            uint32_t colIdx = nonzeros[i].col;
-            float value = nonzeros[i].value;
-            sum += inVector[colIdx]*value;
-        }
-        outVectorReference[rowIdx] = sum;
-    }
-
-    // Verify the result
-    PRINT_INFO(p.verbosity >= 1, "Verifying the result");
-    int status = 1;
-    for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
-        float diff = (outVectorReference[rowIdx] - outVector[rowIdx])/outVectorReference[rowIdx];
-        const float tolerance = 0.00001;
-        if(diff > tolerance || diff < -tolerance) {
-            status = 0;
-            PRINT_ERROR("Mismatch at index %u (CPU result = %f, DPU result = %f)", rowIdx, outVectorReference[rowIdx], outVector[rowIdx]);
-        }
-    }
-
-    startTimer(&timer);
-    DPU_ASSERT(dpu_free(dpu_set));
-    stopTimer(&timer);
-    freeTime += getElapsedTime(timer);
-
-    if (status) {
-        printf("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
-            numDPUs, numRanks, NR_TASKLETS, "float", csrMatrix.numNonzeros);
-        printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
-            allocTime, loadTime, writeTime, dpuTime, readTime, freeTime);
-        printf(" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-            // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
-            csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
-            csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
-        printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-            csrMatrix.numNonzeros * sizeof(float) / ((writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros * sizeof(float) / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
-        printf(" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-            // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
-            csrMatrix.numNonzeros / (dpuTime * 1e6),
-            csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
-        printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
-            csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
-    }
-
-    // Display DPU Logs
-    if(p.verbosity >= 2) {
-        PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
-        dpuIdx = 0;
-        DPU_FOREACH (dpu_set, dpu) {
-            PRINT("DPU %u:", dpuIdx);
-            DPU_ASSERT(dpu_log_read(dpu, stdout));
-            ++dpuIdx;
-        }
-    }
-
-    // Deallocate data structures
-    freeCOOMatrix(cooMatrix);
-    freeCSRMatrix(csrMatrix);
-    free(inVector);
-    free(outVector);
-    free(outVectorReference);
+	// Timing and profiling
+	Timer timer;
+	double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime =
+	    0.0f, readTime = 0.0f, freeTime = 0.0f;
+#if ENERGY
+	struct dpu_probe_t probe;
+	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+#endif
 
-    return 0;
+	// Allocate DPUs and load binary
+	struct dpu_set_t dpu_set, dpu;
+	uint32_t numDPUs, numRanks;
+
+	startTimer(&timer);
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+	stopTimer(&timer);
+	allocTime += getElapsedTime(timer);
+
+	startTimer(&timer);
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	stopTimer(&timer);
+	loadTime += getElapsedTime(timer);
+
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+	assert(numDPUs == NR_DPUS);
+	PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+	// Initialize SpMV data structures
+	PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+	struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+	PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros",
+		   cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+	struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+	uint32_t numRows = csrMatrix.numRows;
+	uint32_t numCols = csrMatrix.numCols;
+	uint32_t *rowPtrs = csrMatrix.rowPtrs;
+	struct Nonzero *nonzeros = csrMatrix.nonzeros;
+	float *inVector =
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float)));
+	initVector(inVector, numCols);
+	float *outVector =
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows * sizeof(float)));
+
+	// Partition data structure across DPUs
+	uint32_t numRowsPerDPU =
+	    ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / numDPUs + 1);
+	PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU",
+		   numRowsPerDPU);
+	struct DPUParams dpuParams[numDPUs];
+	unsigned int dpuIdx = 0;
+	PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
+	DPU_FOREACH(dpu_set, dpu) {
+
+		// Allocate parameters
+		struct mram_heap_allocator_t allocator;
+		init_allocator(&allocator);
+		uint32_t dpuParams_m =
+		    mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+		// Find DPU's rows
+		uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+		uint32_t dpuNumRows;
+		if (dpuStartRowIdx > numRows) {
+			dpuNumRows = 0;
+		} else if (dpuStartRowIdx + numRowsPerDPU > numRows) {
+			dpuNumRows = numRows - dpuStartRowIdx;
+		} else {
+			dpuNumRows = numRowsPerDPU;
+		}
+		dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
+		PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
+		PRINT_INFO(p.verbosity >= 2, "        Receives %u rows",
+			   dpuNumRows);
+
+		// Partition nonzeros and copy data
+		if (dpuNumRows > 0) {
+
+			// Find DPU's CSR matrix partition
+			uint32_t *dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
+			uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
+			struct Nonzero *dpuNonzeros_h =
+			    &nonzeros[dpuRowPtrsOffset];
+			uint32_t dpuNumNonzeros =
+			    dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
+
+			// Allocate MRAM
+			uint32_t dpuRowPtrs_m =
+			    mram_heap_alloc(&allocator,
+					    (dpuNumRows +
+					     1) * sizeof(uint32_t));
+			uint32_t dpuNonzeros_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumNonzeros *
+					    sizeof(struct Nonzero));
+			uint32_t dpuInVector_m =
+			    mram_heap_alloc(&allocator,
+					    numCols * sizeof(float));
+			uint32_t dpuOutVector_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumRows * sizeof(float));
+			assert((dpuNumRows * sizeof(float)) % 8 == 0
+			       &&
+			       "Output sub-vector must be a multiple of 8 bytes!");
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Total memory allocated is %d bytes",
+				   allocator.totalAllocated);
+
+			// Set up DPU parameters
+			dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
+			dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
+			dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
+			dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
+			dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
+
+			// Send data to DPU
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Copying data to DPU");
+			startTimer(&timer);
+			copyToDPU(dpu, (uint8_t *) dpuRowPtrs_h, dpuRowPtrs_m,
+				  (dpuNumRows + 1) * sizeof(uint32_t));
+			copyToDPU(dpu, (uint8_t *) dpuNonzeros_h, dpuNonzeros_m,
+				  dpuNumNonzeros * sizeof(struct Nonzero));
+			copyToDPU(dpu, (uint8_t *) inVector, dpuInVector_m,
+				  numCols * sizeof(float));
+			stopTimer(&timer);
+			writeTime += getElapsedTime(timer);
+
+		}
+		// Send parameters to DPU
+		PRINT_INFO(p.verbosity >= 2,
+			   "        Copying parameters to DPU");
+		startTimer(&timer);
+		copyToDPU(dpu, (uint8_t *) & dpuParams[dpuIdx], dpuParams_m,
+			  sizeof(struct DPUParams));
+		stopTimer(&timer);
+		writeTime += getElapsedTime(timer);
+
+		++dpuIdx;
+
+	}
+	PRINT_INFO(p.verbosity >= 1, "    CPU-DPU Time: %f ms",
+		   writeTime * 1e3);
+
+	// Run all DPUs
+	PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
+	startTimer(&timer);
+#if ENERGY
+	DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+	DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+#if ENERGY
+	DPU_ASSERT(dpu_probe_stop(&probe));
+	double energy;
+	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+	PRINT_INFO(p.verbosity >= 1, "    DPU Energy: %f J", energy);
+#endif
+	stopTimer(&timer);
+	dpuTime += getElapsedTime(timer);
+	PRINT_INFO(p.verbosity >= 1, "    DPU Time: %f ms", dpuTime * 1e3);
+
+	// Copy back result
+	PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+	startTimer(&timer);
+	dpuIdx = 0;
+	DPU_FOREACH(dpu_set, dpu) {
+		unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
+		if (dpuNumRows > 0) {
+			uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+			copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m,
+				    (uint8_t *) (outVector + dpuStartRowIdx),
+				    dpuNumRows * sizeof(float));
+		}
+		++dpuIdx;
+	}
+	stopTimer(&timer);
+	readTime += getElapsedTime(timer);
+	PRINT_INFO(p.verbosity >= 1, "    DPU-CPU Time: %f ms", readTime * 1e3);
+
+	// Calculating result on CPU
+	PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+	float *outVectorReference = malloc(numRows * sizeof(float));
+	for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+		float sum = 0.0f;
+		for (uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
+			uint32_t colIdx = nonzeros[i].col;
+			float value = nonzeros[i].value;
+			sum += inVector[colIdx] * value;
+		}
+		outVectorReference[rowIdx] = sum;
+	}
+
+	// Verify the result
+	PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+	int status = 1;
+	for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+		float diff =
+		    (outVectorReference[rowIdx] -
+		     outVector[rowIdx]) / outVectorReference[rowIdx];
+		const float tolerance = 0.00001;
+		if (diff > tolerance || diff < -tolerance) {
+			status = 0;
+			PRINT_ERROR
+			    ("Mismatch at index %u (CPU result = %f, DPU result = %f)",
+			     rowIdx, outVectorReference[rowIdx],
+			     outVector[rowIdx]);
+		}
+	}
+
+	startTimer(&timer);
+	DPU_ASSERT(dpu_free(dpu_set));
+	stopTimer(&timer);
+	freeTime += getElapsedTime(timer);
+
+	if (status) {
+		printf
+		    ("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
+		     numDPUs, numRanks, NR_TASKLETS, "float",
+		     csrMatrix.numNonzeros);
+		printf
+		    ("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+		     allocTime, loadTime, writeTime, dpuTime, readTime,
+		     freeTime);
+		printf
+		    (" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+		     // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+		     csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((allocTime + loadTime + writeTime + dpuTime + readTime +
+		       freeTime) * 1e6));
+		printf
+		    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((allocTime + loadTime + writeTime + dpuTime +
+		       readTime) * 1e6));
+		printf
+		    (" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+		     // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+		     csrMatrix.numNonzeros / (dpuTime * 1e6),
+		     csrMatrix.numNonzeros /
+		     ((allocTime + loadTime + writeTime + dpuTime + readTime +
+		       freeTime) * 1e6));
+		printf
+		    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+		     csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) *
+					      1e6),
+		     csrMatrix.numNonzeros /
+		     ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros /
+		     ((allocTime + loadTime + writeTime + dpuTime +
+		       readTime) * 1e6));
+	}
+	// Display DPU Logs
+	if (p.verbosity >= 2) {
+		PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+		dpuIdx = 0;
+		DPU_FOREACH(dpu_set, dpu) {
+			PRINT("DPU %u:", dpuIdx);
+			DPU_ASSERT(dpu_log_read(dpu, stdout));
+			++dpuIdx;
+		}
+	}
+	// Deallocate data structures
+	freeCOOMatrix(cooMatrix);
+	freeCSRMatrix(csrMatrix);
+	free(inVector);
+	free(outVector);
+	free(outVectorReference);
+
+	return 0;
 }
diff --git a/SpMV/host/mram-management.h b/SpMV/host/mram-management.h
index 627dfde..f2ee031 100644
--- a/SpMV/host/mram-management.h
+++ b/SpMV/host/mram-management.h
@@ -5,33 +5,45 @@
 #include "../support/common.h"
 #include "../support/utils.h"
 
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20)	// A DPU's capacity is 64 MiB
 
 struct mram_heap_allocator_t {
-    uint32_t totalAllocated;
+	uint32_t totalAllocated;
 };
 
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
-    allocator->totalAllocated = 0;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+	allocator->totalAllocated = 0;
 }
 
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
-    uint32_t ret = allocator->totalAllocated;
-    allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
-    if(allocator->totalAllocated > DPU_CAPACITY) {
-        PRINT_ERROR("        Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
-        exit(0);
-    }
-    return ret;
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
+				uint32_t size)
+{
+	uint32_t ret = allocator->totalAllocated;
+	allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
+	if (allocator->totalAllocated > DPU_CAPACITY) {
+		PRINT_ERROR
+		    ("        Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!",
+		     allocator->totalAllocated, DPU_CAPACITY);
+		exit(0);
+	}
+	return ret;
 }
 
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
-    DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyToDPU(struct dpu_set_t dpu, uint8_t *hostPtr, uint32_t mramIdx,
+		      uint32_t size)
+{
+	DPU_ASSERT(dpu_copy_to
+		   (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+		    ROUND_UP_TO_MULTIPLE_OF_8(size)));
 }
 
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
-    DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx,
+			uint8_t *hostPtr, uint32_t size)
+{
+	DPU_ASSERT(dpu_copy_from
+		   (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+		    ROUND_UP_TO_MULTIPLE_OF_8(size)));
 }
 
 #endif
-
diff --git a/SpMV/support/common.h b/SpMV/support/common.h
index 58fede8..6118814 100644
--- a/SpMV/support/common.h
+++ b/SpMV/support/common.h
@@ -8,18 +8,17 @@
 #define ROUND_UP_TO_MULTIPLE_OF_8(x)    ((((x) + 7)/8)*8)
 
 struct DPUParams {
-    uint32_t dpuNumRows; /* Number of rows assigned to the DPU */
-    uint32_t dpuRowPtrsOffset; /* Offset of the row pointers */
-    uint32_t dpuRowPtrs_m;
-    uint32_t dpuNonzeros_m;
-    uint32_t dpuInVector_m;
-    uint32_t dpuOutVector_m;
+	uint32_t dpuNumRows;	/* Number of rows assigned to the DPU */
+	uint32_t dpuRowPtrsOffset;	/* Offset of the row pointers */
+	uint32_t dpuRowPtrs_m;
+	uint32_t dpuNonzeros_m;
+	uint32_t dpuInVector_m;
+	uint32_t dpuOutVector_m;
 };
 
 struct Nonzero {
-    uint32_t col;
-    float value;
+	uint32_t col;
+	float value;
 };
 
 #endif
-
diff --git a/SpMV/support/matrix.h b/SpMV/support/matrix.h
index d25da1b..ce8745e 100644
--- a/SpMV/support/matrix.h
+++ b/SpMV/support/matrix.h
@@ -9,111 +9,130 @@
 #include "utils.h"
 
 struct COOMatrix {
-    uint32_t numRows;
-    uint32_t numCols;
-    uint32_t numNonzeros;
-    uint32_t* rowIdxs;
-    struct Nonzero* nonzeros;
+	uint32_t numRows;
+	uint32_t numCols;
+	uint32_t numNonzeros;
+	uint32_t *rowIdxs;
+	struct Nonzero *nonzeros;
 };
 
 struct CSRMatrix {
-    uint32_t numRows;
-    uint32_t numCols;
-    uint32_t numNonzeros;
-    uint32_t* rowPtrs;
-    struct Nonzero* nonzeros;
+	uint32_t numRows;
+	uint32_t numCols;
+	uint32_t numNonzeros;
+	uint32_t *rowPtrs;
+	struct Nonzero *nonzeros;
 };
 
-static struct COOMatrix readCOOMatrix(const char* fileName) {
-
-    struct COOMatrix cooMatrix;
-
-    // Initialize fields
-    FILE* fp = fopen(fileName, "r");
-    assert(fscanf(fp, "%u", &cooMatrix.numRows));
-    if(cooMatrix.numRows%2 == 1) {
-        PRINT_WARNING("Reading matrix %s: number of rows must be even. Padding with an extra row.", fileName);
-        cooMatrix.numRows++;
-    }
-    assert(fscanf(fp, "%u", &cooMatrix.numCols));
-    assert(fscanf(fp, "%u", &cooMatrix.numNonzeros));
-    cooMatrix.rowIdxs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(uint32_t)));
-    cooMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(struct Nonzero)));
-
-    // Read the nonzeros
-    for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
-        uint32_t rowIdx;
-        assert(fscanf(fp, "%u", &rowIdx));
-        cooMatrix.rowIdxs[i] = rowIdx - 1; // File format indexes begin at 1
-        uint32_t colIdx;
-        assert(fscanf(fp, "%u", &colIdx));
-        cooMatrix.nonzeros[i].col = colIdx - 1; // File format indexes begin at 1
-        cooMatrix.nonzeros[i].value = 1.0f;
-    }
-
-    return cooMatrix;
+static struct COOMatrix readCOOMatrix(const char *fileName)
+{
+
+	struct COOMatrix cooMatrix;
+
+	// Initialize fields
+	FILE *fp = fopen(fileName, "r");
+	assert(fscanf(fp, "%u", &cooMatrix.numRows));
+	if (cooMatrix.numRows % 2 == 1) {
+		PRINT_WARNING
+		    ("Reading matrix %s: number of rows must be even. Padding with an extra row.",
+		     fileName);
+		cooMatrix.numRows++;
+	}
+	assert(fscanf(fp, "%u", &cooMatrix.numCols));
+	assert(fscanf(fp, "%u", &cooMatrix.numNonzeros));
+	cooMatrix.rowIdxs =
+	    (uint32_t *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   (cooMatrix.numNonzeros * sizeof(uint32_t)));
+	cooMatrix.nonzeros =
+	    (struct Nonzero *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   (cooMatrix.numNonzeros * sizeof(struct Nonzero)));
+
+	// Read the nonzeros
+	for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+		uint32_t rowIdx;
+		assert(fscanf(fp, "%u", &rowIdx));
+		cooMatrix.rowIdxs[i] = rowIdx - 1;	// File format indexes begin at 1
+		uint32_t colIdx;
+		assert(fscanf(fp, "%u", &colIdx));
+		cooMatrix.nonzeros[i].col = colIdx - 1;	// File format indexes begin at 1
+		cooMatrix.nonzeros[i].value = 1.0f;
+	}
+
+	return cooMatrix;
 
 }
 
-static void freeCOOMatrix(struct COOMatrix cooMatrix) {
-    free(cooMatrix.rowIdxs);
-    free(cooMatrix.nonzeros);
+static void freeCOOMatrix(struct COOMatrix cooMatrix)
+{
+	free(cooMatrix.rowIdxs);
+	free(cooMatrix.nonzeros);
 }
 
-static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix) {
-
-    struct CSRMatrix csrMatrix;
-
-    // Initialize fields
-    csrMatrix.numRows = cooMatrix.numRows;
-    csrMatrix.numCols = cooMatrix.numCols;
-    csrMatrix.numNonzeros = cooMatrix.numNonzeros;
-    csrMatrix.rowPtrs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8((csrMatrix.numRows + 1)*sizeof(uint32_t)));
-    csrMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrMatrix.numNonzeros*sizeof(struct Nonzero)));
-
-    // Histogram rowIdxs
-    memset(csrMatrix.rowPtrs, 0, (csrMatrix.numRows + 1)*sizeof(uint32_t));
-    for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
-        uint32_t rowIdx = cooMatrix.rowIdxs[i];
-        csrMatrix.rowPtrs[rowIdx]++;
-    }
-
-    // Prefix sum rowPtrs
-    uint32_t sumBeforeNextRow = 0;
-    for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
-        uint32_t sumBeforeRow = sumBeforeNextRow;
-        sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
-        csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
-    }
-    csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
-
-    // Bin the nonzeros
-    for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
-        uint32_t rowIdx = cooMatrix.rowIdxs[i];
-        uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
-        csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
-    }
-
-    // Restore rowPtrs
-    for(uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
-        csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
-    }
-    csrMatrix.rowPtrs[0] = 0;
-
-    return csrMatrix;
+static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix)
+{
+
+	struct CSRMatrix csrMatrix;
+
+	// Initialize fields
+	csrMatrix.numRows = cooMatrix.numRows;
+	csrMatrix.numCols = cooMatrix.numCols;
+	csrMatrix.numNonzeros = cooMatrix.numNonzeros;
+	csrMatrix.rowPtrs =
+	    (uint32_t *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   ((csrMatrix.numRows + 1) * sizeof(uint32_t)));
+	csrMatrix.nonzeros =
+	    (struct Nonzero *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   (csrMatrix.numNonzeros * sizeof(struct Nonzero)));
+
+	// Histogram rowIdxs
+	memset(csrMatrix.rowPtrs, 0,
+	       (csrMatrix.numRows + 1) * sizeof(uint32_t));
+	for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+		uint32_t rowIdx = cooMatrix.rowIdxs[i];
+		csrMatrix.rowPtrs[rowIdx]++;
+	}
+
+	// Prefix sum rowPtrs
+	uint32_t sumBeforeNextRow = 0;
+	for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+		uint32_t sumBeforeRow = sumBeforeNextRow;
+		sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
+		csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
+	}
+	csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
+
+	// Bin the nonzeros
+	for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+		uint32_t rowIdx = cooMatrix.rowIdxs[i];
+		uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
+		csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
+	}
+
+	// Restore rowPtrs
+	for (uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
+		csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
+	}
+	csrMatrix.rowPtrs[0] = 0;
+
+	return csrMatrix;
 
 }
 
-static void freeCSRMatrix(struct CSRMatrix csrMatrix) {
-    free(csrMatrix.rowPtrs);
-    free(csrMatrix.nonzeros);
+static void freeCSRMatrix(struct CSRMatrix csrMatrix)
+{
+	free(csrMatrix.rowPtrs);
+	free(csrMatrix.nonzeros);
 }
 
-static void initVector(float* vec, uint32_t size) {
-    for(uint32_t i = 0; i < size; ++i) {
-        vec[i] = 1.0f;
-    }
+static void initVector(float *vec, uint32_t size)
+{
+	for (uint32_t i = 0; i < size; ++i) {
+		vec[i] = 1.0f;
+	}
 }
 
 #endif
-
diff --git a/SpMV/support/params.h b/SpMV/support/params.h
index b4b696c..bf60e79 100644
--- a/SpMV/support/params.h
+++ b/SpMV/support/params.h
@@ -5,42 +5,47 @@
 #include "common.h"
 #include "utils.h"
 
-static void usage() {
-    PRINT(  "\nUsage:  ./program [options]"
-            "\n"
-            "\nBenchmark-specific options:"
-            "\n    -f <F>    input matrix file name (default=data/bcsstk30.mtx)"
-            "\n"
-            "\nGeneral options:"
-            "\n    -v <V>    verbosity"
-            "\n    -h        help"
-            "\n\n");
+static void usage()
+{
+	PRINT("\nUsage:  ./program [options]"
+	      "\n"
+	      "\nBenchmark-specific options:"
+	      "\n    -f <F>    input matrix file name (default=data/bcsstk30.mtx)"
+	      "\n"
+	      "\nGeneral options:"
+	      "\n    -v <V>    verbosity" "\n    -h        help" "\n\n");
 }
 
 typedef struct Params {
-  const char* fileName;
-  unsigned int verbosity;
+	const char *fileName;
+	unsigned int verbosity;
 } Params;
 
-static struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.fileName      = "data/bcsstk30.mtx";
-    p.verbosity     = 1;
-    int opt;
-    while((opt = getopt(argc, argv, "f:v:h")) >= 0) {
-        switch(opt) {
-            case 'f': p.fileName    = optarg;       break;
-            case 'v': p.verbosity   = atoi(optarg); break;
-            case 'h': usage(); exit(0);
-            default:
-                      PRINT_ERROR("Unrecognized option!");
-                      usage();
-                      exit(0);
-        }
-    }
-
-    return p;
+static struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.fileName = "data/bcsstk30.mtx";
+	p.verbosity = 1;
+	int opt;
+	while ((opt = getopt(argc, argv, "f:v:h")) >= 0) {
+		switch (opt) {
+		case 'f':
+			p.fileName = optarg;
+			break;
+		case 'v':
+			p.verbosity = atoi(optarg);
+			break;
+		case 'h':
+			usage();
+			exit(0);
+		default:
+			PRINT_ERROR("Unrecognized option!");
+			usage();
+			exit(0);
+		}
+	}
+
+	return p;
 }
 
 #endif
-
diff --git a/SpMV/support/timer.h b/SpMV/support/timer.h
index 66e9842..7367b11 100644
--- a/SpMV/support/timer.h
+++ b/SpMV/support/timer.h
@@ -6,22 +6,25 @@
 #include <sys/time.h>
 
 typedef struct Timer {
-    struct timeval startTime;
-    struct timeval endTime;
+	struct timeval startTime;
+	struct timeval endTime;
 } Timer;
 
-static void startTimer(Timer* timer) {
-    gettimeofday(&(timer->startTime), NULL);
+static void startTimer(Timer *timer)
+{
+	gettimeofday(&(timer->startTime), NULL);
 }
 
-static void stopTimer(Timer* timer) {
-    gettimeofday(&(timer->endTime), NULL);
+static void stopTimer(Timer *timer)
+{
+	gettimeofday(&(timer->endTime), NULL);
 }
 
-static double getElapsedTime(Timer timer) {
-    return ((double) ((timer.endTime.tv_sec - timer.startTime.tv_sec)
-                   + (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
+static double getElapsedTime(Timer timer)
+{
+	return ((double)((timer.endTime.tv_sec - timer.startTime.tv_sec)
+			 + (timer.endTime.tv_usec -
+			    timer.startTime.tv_usec) / 1.0e6));
 }
 
 #endif
-
diff --git a/SpMV/support/utils.h b/SpMV/support/utils.h
index ddb1e2c..ccd8fbd 100644
--- a/SpMV/support/utils.h
+++ b/SpMV/support/utils.h
@@ -8,4 +8,3 @@
 #define PRINT(fmt, ...)             printf(fmt "\n", ##__VA_ARGS__)
 
 #endif
-