22 files changed, 946 insertions, 746 deletions
diff --git a/SpMV/Makefile b/SpMV/Makefile
index 0e7a70c..c2d9d50 100644
--- a/SpMV/Makefile
+++ b/SpMV/Makefile
@@ -1,21 +1,31 @@
 NR_TASKLETS ?= 16
 NR_DPUS ?= 1
 
-COMMON_INCLUDES := support
-HOST_SOURCES := $(wildcard host/*.c)
-DPU_SOURCES := $(wildcard dpu/*.c)
-CPU_BASE_SOURCES := $(wildcard baselines/cpu/*.c)
-GPU_BASE_SOURCES := $(wildcard baselines/gpu/*.cu)
-
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing}
 DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
-CPU_BASE_FLAGS := -O3 -fopenmp
-GPU_BASE_FLAGS := -O3
+
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
 
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1)
 	QUIET =
 endif
 
@@ -24,19 +34,13 @@ all: bin/host_code bin/dpu_code
 bin:
 	${QUIET}mkdir -p bin
 
-gpu: bin/gpu_baseline
-
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
-
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
-
-bin/cpu_baseline: ${CPU_BASE_SOURCES}
-	${QUIET}${CC} -o $@ ${CPU_BASE_SOURCES} ${CPU_BASE_FLAGS}
+bin/host_code: host/app.c include bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ host/app.c ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
 
-bin/gpu_baseline: ${GPU_BASE_SOURCES}
-	${QUIET}nvcc -o $@ ${GPU_BASE_SOURCES} ${GPU_BASE_FLAGS}
+bin/dpu_code: dpu/task.c include bin
+	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ dpu/task.c
 
 clean:
 	${QUIET}rm -rf bin
diff --git a/SpMV/baselines/cpu/Makefile b/SpMV/baselines/cpu/Makefile
index 5b2367b..a24b764 100644
--- a/SpMV/baselines/cpu/Makefile
+++ b/SpMV/baselines/cpu/Makefile
@@ -1,7 +1,15 @@
+native ?= 1
+
+CFLAGS =
+
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
+
 all: spmv
 
 spmv: app.c
-	gcc -Wall -Wextra -pedantic -march=native -O2 -o spmv -fopenmp app.c
+	gcc -Wall -Wextra -pedantic ${CFLAGS} -O3 -o spmv -fopenmp app.c
 
 spmv_O0: app.c
 	gcc -o spmv_O0 -fopenmp app.c
diff --git a/SpMV/baselines/cpu/app.c b/SpMV/baselines/cpu/app.c
index 8d360ee..e33761f 100644
--- a/SpMV/baselines/cpu/app.c
+++ b/SpMV/baselines/cpu/app.c
@@ -13,60 +13,63 @@
 #include "../../support/timer.h"
 #include "../../support/utils.h"
 
-int main(int argc, char** argv) {
+int main(int argc, char **argv)
+{
 
-    // Process parameters
-    struct Params p = input_params(argc, argv);
+	// Process parameters
+	struct Params p = input_params(argc, argv);
 
-    // Initialize SpMV data structures
-    PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
-    struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
-    PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
-    struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
-    float* inVector = malloc(csrMatrix.numCols*sizeof(float));
-    float* outVector = malloc(csrMatrix.numRows*sizeof(float));
-    initVector(inVector, csrMatrix.numCols);
+	// Initialize SpMV data structures
+	PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+	struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+	PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros",
+		   cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+	struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+	float *inVector = malloc(csrMatrix.numCols * sizeof(float));
+	float *outVector = malloc(csrMatrix.numRows * sizeof(float));
+	initVector(inVector, csrMatrix.numCols);
 
-    // Calculating result on CPU
-    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
-    //omp_set_num_threads(4);
-    Timer timer;
-    startTimer(&timer);
-    #pragma omp parallel for
-    for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
-        float sum = 0.0f;
-        for(uint32_t i = csrMatrix.rowPtrs[rowIdx]; i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
-            uint32_t colIdx = csrMatrix.nonzeros[i].col;
-            float value = csrMatrix.nonzeros[i].value;
-            sum += inVector[colIdx]*value;
-        }
-        outVector[rowIdx] = sum;
-    }
-    stopTimer(&timer);
+	// Calculating result on CPU
+	PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+	//omp_set_num_threads(4);
+	Timer timer;
+	startTimer(&timer);
+#pragma omp parallel for
+	for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+		float sum = 0.0f;
+		for (uint32_t i = csrMatrix.rowPtrs[rowIdx];
+		     i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
+			uint32_t colIdx = csrMatrix.nonzeros[i].col;
+			float value = csrMatrix.nonzeros[i].value;
+			sum += inVector[colIdx] * value;
+		}
+		outVector[rowIdx] = sum;
+	}
+	stopTimer(&timer);
 
-
-    unsigned int nr_threads = 0;
+	unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-    nr_threads++;
-
+	nr_threads++;
 
-    // coomatrix / csrmatrix use uint32_t indexes and float values
-    printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
-        " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
-        nr_threads, csrMatrix.numNonzeros,
-        csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer)*1e6),
-        csrMatrix.numNonzeros / (getElapsedTime(timer)*1e6),
-        getElapsedTime(timer)*1e6);
-    //if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
-    PRINT_INFO(p.verbosity >= 1, "    Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+	// coomatrix / csrmatrix use uint32_t indexes and float values
+	printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
+	       " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
+	       nr_threads, csrMatrix.numNonzeros,
+	       csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer) *
+							1e6),
+	       csrMatrix.numNonzeros / (getElapsedTime(timer) * 1e6),
+	       getElapsedTime(timer) * 1e6);
+	//if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+	PRINT_INFO(p.verbosity >= 1, "    Elapsed time: %f ms",
+		   getElapsedTime(timer) * 1e3);
 
-    // Deallocate data structures
-    freeCOOMatrix(cooMatrix);
-    freeCSRMatrix(csrMatrix);
-    free(inVector);
-    free(outVector);
+	// Deallocate data structures
+	freeCOOMatrix(cooMatrix);
+	freeCSRMatrix(csrMatrix);
+	free(inVector);
+	free(outVector);
 
-    return 0;
+	return 0;
 
 }
diff --git a/SpMV/baselines/cpu/run-perf.sh b/SpMV/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..714498d
--- /dev/null
+++ b/SpMV/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+
+make -B
+
+OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run
+OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run
diff --git a/SpMV/benchmark-scripts/ccmcc25-sim.sh b/SpMV/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..9d1af4e
--- /dev/null
+++ b/SpMV/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -v 0 -f data/${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+cd data/generate
+for i in 4 8 16; do
+	./replicate ../bcsstk30.mtx $i ../bcsstk30.${i}.mtx
+done
+cd ../..
+
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+
+source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks  SpMV  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} \
+	::: data bcsstk30.mtx bcsstk30.4.mtx bcsstk30.8.mtx bcsstk30.16.mtx \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+>> ${fn}.txt
+
+rm -f data/bcsstk30.*.mtx
diff --git a/SpMV/benchmark-scripts/ccmcc25.sh b/SpMV/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..176ea99
--- /dev/null
+++ b/SpMV/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -v 0 -f data/${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+cd data/generate
+for i in 8 32 64; do
+	./replicate ../bcsstk30.mtx $i ../bcsstk30.${i}.mtx
+done
+cd ../..
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+	echo "prim-benchmarks  SpMV  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} numa_rank={numa_rank} \
+		::: i $(seq 0 10) \
+		::: data bcsstk30.mtx bcsstk30.8.mtx bcsstk30.32.mtx bcsstk30.64.mtx \
+		::: numa_rank any \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+	>> ${fn}.txt
+
+done
+
+rm -f data/bcsstk30.*.mtx
diff --git a/SpMV/dpu/task.c b/SpMV/dpu/task.c
index 589b6f4..305a645 100644
--- a/SpMV/dpu/task.c
+++ b/SpMV/dpu/task.c
@@ -11,7 +11,7 @@
 #include <perfcounter.h>
 #include <seqread.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 #define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m   "fmt"\n", ##__VA_ARGS__)
 
@@ -20,120 +20,164 @@
 BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 // main
-int main() {
-
-    if(me() == 0) {
-        mem_reset(); // Reset the heap
-    }
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    // Load parameters
-    uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
-    struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-    mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-    uint32_t numRows = params_w->dpuNumRows;
-
-    // Sanity check
-    if(me() == 0) {
-        if(numRows%2 != 0) {
-            // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
-            PRINT_ERROR("The number of rows is not a multiple of two!");
-        }
-    }
-
-    // Identify tasklet's rows
-    uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/NR_TASKLETS + 1); // Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
-    uint32_t taskletRowsStart = me()*numRowsPerTasklet;
-    uint32_t taskletNumRows;
-    if(taskletRowsStart > numRows) {
-        taskletNumRows = 0;
-    } else if(taskletRowsStart + numRowsPerTasklet > numRows) {
-        taskletNumRows = numRows - taskletRowsStart;
-    } else {
-        taskletNumRows = numRowsPerTasklet;
-    }
-
-    // Only process tasklets with nonzero number of rows
-    if(taskletNumRows > 0) {
-
-        // Extract parameters
-        uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
-        uint32_t rowPtrs_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
-        uint32_t nonzeros_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuNonzeros_m;
-        uint32_t inVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuInVector_m;
-        uint32_t outVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuOutVector_m;
-
-        // Initialize row pointer sequential reader
-        uint32_t taskletRowPtrs_m = rowPtrs_m + taskletRowsStart*sizeof(uint32_t);
-        seqreader_t rowPtrReader;
-        uint32_t* taskletRowPtrs_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletRowPtrs_m, &rowPtrReader);
-        uint32_t firstRowPtr = *taskletRowPtrs_w;
-
-        // Initialize nonzeros sequential reader
-        uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
-        uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart*sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
-        seqreader_t nonzerosReader;
-        struct Nonzero* taskletNonzeros_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletNonzeros_m, &nonzerosReader);
-
-        // Initialize input vector cache
-        uint32_t inVectorTileSize = 64;
-        float* inVectorTile_w = mem_alloc(inVectorTileSize*sizeof(float));
-        mram_read((__mram_ptr void const*)inVector_m, inVectorTile_w, 256);
-        uint32_t currInVectorTileIdx = 0;
-
-        // Initialize output vector cache
-        uint32_t taskletOutVector_m = outVector_m + taskletRowsStart*sizeof(float);
-        uint32_t outVectorTileSize = 64;
-        float* outVectorTile_w = mem_alloc(outVectorTileSize*sizeof(float));
-
-        // SpMV
-        uint32_t nextRowPtr = firstRowPtr;
-        for(uint32_t row = 0; row < taskletNumRows; ++row) {
-
-            // Find row nonzeros
-            taskletRowPtrs_w = seqread_get(taskletRowPtrs_w, sizeof(uint32_t), &rowPtrReader);
-            uint32_t rowPtr = nextRowPtr;
-            nextRowPtr = *taskletRowPtrs_w;
-            uint32_t taskletNNZ = nextRowPtr - rowPtr;
-
-            // Multiply row with vector
-            float outValue = 0.0f;
-            for(uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
-
-                // Get matrix value
-                float matValue = taskletNonzeros_w->value;
-
-                // Get input vector value
-                uint32_t col = taskletNonzeros_w->col;
-                uint32_t inVectorTileIdx = col/inVectorTileSize;
-                uint32_t inVectorTileOffset = col%inVectorTileSize;
-                if(inVectorTileIdx != currInVectorTileIdx) {
-                    mram_read((__mram_ptr void const*)(inVector_m + inVectorTileIdx*inVectorTileSize*sizeof(float)), inVectorTile_w, 256);
-                    currInVectorTileIdx = inVectorTileIdx;
-                }
-                float inValue = inVectorTile_w[inVectorTileOffset];
-
-                // Multiply and add
-                outValue += matValue*inValue;
-
-                // Read next nonzero
-                taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused
-
-            }
-
-            // Store output
-            uint32_t outVectorTileIdx = row/outVectorTileSize;
-            uint32_t outVectorTileOffset = row%outVectorTileSize;
-            outVectorTile_w[outVectorTileOffset] = outValue;
-            if(outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
-                mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), 256);
-            } else if(row == taskletNumRows - 1) { // Last row for tasklet
-                mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), (taskletNumRows%outVectorTileSize)*sizeof(float));
-            }
-
-        }
-    }
-
-    return 0;
+int main()
+{
+
+	if (me() == 0) {
+		mem_reset();	// Reset the heap
+	}
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	// Load parameters
+	uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+	struct DPUParams *params_w =
+	    (struct DPUParams *)
+	    mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+	mram_read((__mram_ptr void const *)params_m, params_w,
+		  ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+	uint32_t numRows = params_w->dpuNumRows;
+
+	// Sanity check
+	if (me() == 0) {
+		if (numRows % 2 != 0) {
+			// The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
+			PRINT_ERROR
+			    ("The number of rows is not a multiple of two!");
+		}
+	}
+	// Identify tasklet's rows
+	uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / NR_TASKLETS + 1);	// Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
+	uint32_t taskletRowsStart = me() * numRowsPerTasklet;
+	uint32_t taskletNumRows;
+	if (taskletRowsStart > numRows) {
+		taskletNumRows = 0;
+	} else if (taskletRowsStart + numRowsPerTasklet > numRows) {
+		taskletNumRows = numRows - taskletRowsStart;
+	} else {
+		taskletNumRows = numRowsPerTasklet;
+	}
+
+	// Only process tasklets with nonzero number of rows
+	if (taskletNumRows > 0) {
+
+		// Extract parameters
+		uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
+		uint32_t rowPtrs_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
+		uint32_t nonzeros_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+		    params_w->dpuNonzeros_m;
+		uint32_t inVector_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+		    params_w->dpuInVector_m;
+		uint32_t outVector_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+		    params_w->dpuOutVector_m;
+
+		// Initialize row pointer sequential reader
+		uint32_t taskletRowPtrs_m =
+		    rowPtrs_m + taskletRowsStart * sizeof(uint32_t);
+		seqreader_t rowPtrReader;
+		uint32_t *taskletRowPtrs_w =
+		    seqread_init(seqread_alloc(),
+				 (__mram_ptr void *)taskletRowPtrs_m,
+				 &rowPtrReader);
+		uint32_t firstRowPtr = *taskletRowPtrs_w;
+
+		// Initialize nonzeros sequential reader
+		uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
+		uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart * sizeof(struct Nonzero);	// 8-byte aligned because Nonzero is 8 bytes
+		seqreader_t nonzerosReader;
+		struct Nonzero *taskletNonzeros_w =
+		    seqread_init(seqread_alloc(),
+				 (__mram_ptr void *)taskletNonzeros_m,
+				 &nonzerosReader);
+
+		// Initialize input vector cache
+		uint32_t inVectorTileSize = 64;
+		float *inVectorTile_w =
+		    mem_alloc(inVectorTileSize * sizeof(float));
+		mram_read((__mram_ptr void const *)inVector_m, inVectorTile_w,
+			  256);
+		uint32_t currInVectorTileIdx = 0;
+
+		// Initialize output vector cache
+		uint32_t taskletOutVector_m =
+		    outVector_m + taskletRowsStart * sizeof(float);
+		uint32_t outVectorTileSize = 64;
+		float *outVectorTile_w =
+		    mem_alloc(outVectorTileSize * sizeof(float));
+
+		// SpMV
+		uint32_t nextRowPtr = firstRowPtr;
+		for (uint32_t row = 0; row < taskletNumRows; ++row) {
+
+			// Find row nonzeros
+			taskletRowPtrs_w =
+			    seqread_get(taskletRowPtrs_w, sizeof(uint32_t),
+					&rowPtrReader);
+			uint32_t rowPtr = nextRowPtr;
+			nextRowPtr = *taskletRowPtrs_w;
+			uint32_t taskletNNZ = nextRowPtr - rowPtr;
+
+			// Multiply row with vector
+			float outValue = 0.0f;
+			for (uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
+
+				// Get matrix value
+				float matValue = taskletNonzeros_w->value;
+
+				// Get input vector value
+				uint32_t col = taskletNonzeros_w->col;
+				uint32_t inVectorTileIdx =
+				    col / inVectorTileSize;
+				uint32_t inVectorTileOffset =
+				    col % inVectorTileSize;
+				if (inVectorTileIdx != currInVectorTileIdx) {
+					mram_read((__mram_ptr void const
+						   *)(inVector_m +
+						      inVectorTileIdx *
+						      inVectorTileSize *
+						      sizeof(float)),
+						  inVectorTile_w, 256);
+					currInVectorTileIdx = inVectorTileIdx;
+				}
+				float inValue =
+				    inVectorTile_w[inVectorTileOffset];
+
+				// Multiply and add
+				outValue += matValue * inValue;
+
+				// Read next nonzero
+				taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader);	// Last read will be out of bounds and unused
+
+			}
+
+			// Store output
+			uint32_t outVectorTileIdx = row / outVectorTileSize;
+			uint32_t outVectorTileOffset = row % outVectorTileSize;
+			outVectorTile_w[outVectorTileOffset] = outValue;
+			if (outVectorTileOffset == outVectorTileSize - 1) {	// Last element in tile
+				mram_write(outVectorTile_w,
+					   (__mram_ptr void
+					    *)(taskletOutVector_m +
+					       outVectorTileIdx *
+					       outVectorTileSize *
+					       sizeof(float)), 256);
+			} else if (row == taskletNumRows - 1) {	// Last row for tasklet
+				mram_write(outVectorTile_w,
+					   (__mram_ptr void
+					    *)(taskletOutVector_m +
+					       outVectorTileIdx *
+					       outVectorTileSize *
+					       sizeof(float)),
+					   (taskletNumRows %
+					    outVectorTileSize) * sizeof(float));
+			}
+
+		}
+	}
+
+	return 0;
 }
diff --git a/SpMV/host/app.c b/SpMV/host/app.c
index fe9c751..6cf2861 100644
--- a/SpMV/host/app.c
+++ b/SpMV/host/app.c
@@ -3,9 +3,24 @@
 * SpMV Host Application Source File
 *
 */
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
 
+#ifndef ENERGY
+#define ENERGY 0
+#endif
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #include <assert.h>
 #include <getopt.h>
 #include <stdio.h>
@@ -14,247 +29,301 @@
 #include <unistd.h>
 
 #include "mram-management.h"
-#include "../support/common.h"
-#include "../support/matrix.h"
-#include "../support/params.h"
-#include "../support/timer.h"
-#include "../support/utils.h"
+#include "common.h"
+#include "matrix.h"
+#include "params.h"
+#include "timer.h"
+#include "utils.h"
 
 #define DPU_BINARY "./bin/dpu_code"
 
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
-#ifndef ENERGY
-#define ENERGY 0
-#endif
+// Main of the Host Application
+int main(int argc, char **argv)
+{
+
+	// Process parameters
+	struct Params p = input_params(argc, argv);
+
+	// Timing and profiling
+	Timer timer;
+	double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime =
+	    0.0f, readTime = 0.0f, freeTime = 0.0f;
 #if ENERGY
-#include <dpu_probe.h>
+	struct dpu_probe_t probe;
+	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
 #endif
 
-// Main of the Host Application
-int main(int argc, char** argv) {
-
-    // Process parameters
-    struct Params p = input_params(argc, argv);
-
-    // Timing and profiling
-    Timer timer;
-    double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime = 0.0f, readTime = 0.0f, freeTime = 0.0f;
-    #if ENERGY
-    struct dpu_probe_t probe;
-    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
-    #endif
-
-    // Allocate DPUs and load binary
-    struct dpu_set_t dpu_set, dpu;
-    uint32_t numDPUs, numRanks;
-
-    startTimer(&timer);
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    stopTimer(&timer);
-    allocTime += getElapsedTime(timer);
-
-    startTimer(&timer);
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    stopTimer(&timer);
-    loadTime += getElapsedTime(timer);
-
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
-    assert(numDPUs == NR_DPUS);
-    PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
-    // Initialize SpMV data structures
-    PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
-    struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
-    PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
-    struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
-    uint32_t numRows = csrMatrix.numRows;
-    uint32_t numCols = csrMatrix.numCols;
-    uint32_t* rowPtrs = csrMatrix.rowPtrs;
-    struct Nonzero* nonzeros = csrMatrix.nonzeros;
-    float* inVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols*sizeof(float)));
-    initVector(inVector, numCols);
-    float* outVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows*sizeof(float)));
-
-    // Partition data structure across DPUs
-    uint32_t numRowsPerDPU = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/numDPUs + 1);
-    PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU", numRowsPerDPU);
-    struct DPUParams dpuParams[numDPUs];
-    unsigned int dpuIdx = 0;
-    PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
-    DPU_FOREACH (dpu_set, dpu) {
-
-        // Allocate parameters
-        struct mram_heap_allocator_t allocator;
-        init_allocator(&allocator);
-        uint32_t dpuParams_m = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
-        // Find DPU's rows
-        uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
-        uint32_t dpuNumRows;
-        if(dpuStartRowIdx > numRows) {
-            dpuNumRows = 0;
-        } else if(dpuStartRowIdx + numRowsPerDPU > numRows) {
-            dpuNumRows = numRows - dpuStartRowIdx;
-        } else {
-            dpuNumRows = numRowsPerDPU;
-        }
-        dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
-        PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
-        PRINT_INFO(p.verbosity >= 2, "        Receives %u rows", dpuNumRows);
-
-        // Partition nonzeros and copy data
-        if(dpuNumRows > 0) {
-
-            // Find DPU's CSR matrix partition
-            uint32_t* dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
-            uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
-            struct Nonzero* dpuNonzeros_h = &nonzeros[dpuRowPtrsOffset];
-            uint32_t dpuNumNonzeros = dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
-
-            // Allocate MRAM
-            uint32_t dpuRowPtrs_m = mram_heap_alloc(&allocator, (dpuNumRows + 1)*sizeof(uint32_t));
-            uint32_t dpuNonzeros_m = mram_heap_alloc(&allocator, dpuNumNonzeros*sizeof(struct Nonzero));
-            uint32_t dpuInVector_m = mram_heap_alloc(&allocator, numCols*sizeof(float));
-            uint32_t dpuOutVector_m = mram_heap_alloc(&allocator, dpuNumRows*sizeof(float));
-            assert((dpuNumRows*sizeof(float))%8 == 0 && "Output sub-vector must be a multiple of 8 bytes!");
-            PRINT_INFO(p.verbosity >= 2, "        Total memory allocated is %d bytes", allocator.totalAllocated);
-
-            // Set up DPU parameters
-            dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
-            dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
-            dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
-            dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
-            dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
-
-            // Send data to DPU
-            PRINT_INFO(p.verbosity >= 2, "        Copying data to DPU");
-            startTimer(&timer);
-            copyToDPU(dpu, (uint8_t*)dpuRowPtrs_h, dpuRowPtrs_m, (dpuNumRows + 1)*sizeof(uint32_t));
-            copyToDPU(dpu, (uint8_t*)dpuNonzeros_h, dpuNonzeros_m, dpuNumNonzeros*sizeof(struct Nonzero));
-            copyToDPU(dpu, (uint8_t*)inVector, dpuInVector_m, numCols*sizeof(float));
-            stopTimer(&timer);
-            writeTime += getElapsedTime(timer);
-
-        }
-
-        // Send parameters to DPU
-        PRINT_INFO(p.verbosity >= 2, "        Copying parameters to DPU");
-        startTimer(&timer);
-        copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m, sizeof(struct DPUParams));
-        stopTimer(&timer);
-        writeTime += getElapsedTime(timer);
-
-        ++dpuIdx;
-
-    }
-    PRINT_INFO(p.verbosity >= 1, "    CPU-DPU Time: %f ms", writeTime*1e3);
-
-    // Run all DPUs
-    PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
-    startTimer(&timer);
-    #if ENERGY
-    DPU_ASSERT(dpu_probe_start(&probe));
-    #endif
-    DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
-    #if ENERGY
-    DPU_ASSERT(dpu_probe_stop(&probe));
-    double energy;
-    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
-    PRINT_INFO(p.verbosity >= 1, "    DPU Energy: %f J", energy);
-    #endif
-    stopTimer(&timer);
-    dpuTime += getElapsedTime(timer);
-    PRINT_INFO(p.verbosity >= 1, "    DPU Time: %f ms", dpuTime*1e3);
-
-    // Copy back result
-    PRINT_INFO(p.verbosity >= 1, "Copying back the result");
-    startTimer(&timer);
-    dpuIdx = 0;
-    DPU_FOREACH (dpu_set, dpu) {
-        unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
-        if(dpuNumRows > 0) {
-            uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
-            copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m, (uint8_t*)(outVector + dpuStartRowIdx), dpuNumRows*sizeof(float));
-        }
-        ++dpuIdx;
-    }
-    stopTimer(&timer);
-    readTime += getElapsedTime(timer);
-    PRINT_INFO(p.verbosity >= 1, "    DPU-CPU Time: %f ms", readTime*1e3);
-
-    // Calculating result on CPU
-    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
-    float* outVectorReference = malloc(numRows*sizeof(float));
-    for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
-        float sum = 0.0f;
-        for(uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
-            uint32_t colIdx = nonzeros[i].col;
-            float value = nonzeros[i].value;
-            sum += inVector[colIdx]*value;
-        }
-        outVectorReference[rowIdx] = sum;
-    }
-
-    // Verify the result
-    PRINT_INFO(p.verbosity >= 1, "Verifying the result");
-    int status = 1;
-    for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
-        float diff = (outVectorReference[rowIdx] - outVector[rowIdx])/outVectorReference[rowIdx];
-        const float tolerance = 0.00001;
-        if(diff > tolerance || diff < -tolerance) {
-            status = 0;
-            PRINT_ERROR("Mismatch at index %u (CPU result = %f, DPU result = %f)", rowIdx, outVectorReference[rowIdx], outVector[rowIdx]);
-        }
-    }
-
-    startTimer(&timer);
-    DPU_ASSERT(dpu_free(dpu_set));
-    stopTimer(&timer);
-    freeTime += getElapsedTime(timer);
-
-    if (status) {
-        printf("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
-            numDPUs, numRanks, NR_TASKLETS, "float", csrMatrix.numNonzeros);
-        printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
-            allocTime, loadTime, writeTime, dpuTime, readTime, freeTime);
-        printf(" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-            // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
-            csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
-            csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
-        printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-            csrMatrix.numNonzeros * sizeof(float) / ((writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros * sizeof(float) / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
-        printf(" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-            // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
-            csrMatrix.numNonzeros / (dpuTime * 1e6),
-            csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
-        printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f",
-            csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
-    }
-
-    // Display DPU Logs
-    if(p.verbosity >= 2) {
-        PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
-        dpuIdx = 0;
-        DPU_FOREACH (dpu_set, dpu) {
-            PRINT("DPU %u:", dpuIdx);
-            DPU_ASSERT(dpu_log_read(dpu, stdout));
-            ++dpuIdx;
-        }
-    }
-
-    // Deallocate data structures
-    freeCOOMatrix(cooMatrix);
-    freeCSRMatrix(csrMatrix);
-    free(inVector);
-    free(outVector);
-    free(outVectorReference);
-
-    return 0;
+	// Allocate DPUs and load binary
+	struct dpu_set_t dpu_set, dpu;
+	uint32_t numDPUs, numRanks;
+
+	startTimer(&timer);
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+	stopTimer(&timer);
+	allocTime += getElapsedTime(timer);
+
+	startTimer(&timer);
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	stopTimer(&timer);
+	loadTime += getElapsedTime(timer);
+
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+	assert(numDPUs == NR_DPUS);
+	PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+	// Initialize SpMV data structures
+	PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+	struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+	PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros",
+		   cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+	struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+	uint32_t numRows = csrMatrix.numRows;
+	uint32_t numCols = csrMatrix.numCols;
+	uint32_t *rowPtrs = csrMatrix.rowPtrs;
+	struct Nonzero *nonzeros = csrMatrix.nonzeros;
+	float *inVector =
+	    (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float)));
+	initVector(inVector, numCols);
+	float *outVector =
+	    (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows * sizeof(float)));
+
+	// Partition data structure across DPUs
+	uint32_t numRowsPerDPU =
+	    ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / numDPUs + 1);
+	PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU",
+		   numRowsPerDPU);
+	struct DPUParams dpuParams[numDPUs];
+	unsigned int dpuIdx = 0;
+	PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
+	DPU_FOREACH(dpu_set, dpu) {
+
+		// Allocate parameters
+		struct mram_heap_allocator_t allocator;
+		init_allocator(&allocator);
+		uint32_t dpuParams_m =
+		    mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+		// Find DPU's rows
+		uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+		uint32_t dpuNumRows;
+		if (dpuStartRowIdx > numRows) {
+			dpuNumRows = 0;
+		} else if (dpuStartRowIdx + numRowsPerDPU > numRows) {
+			dpuNumRows = numRows - dpuStartRowIdx;
+		} else {
+			dpuNumRows = numRowsPerDPU;
+		}
+		dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
+		PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
+		PRINT_INFO(p.verbosity >= 2, "        Receives %u rows",
+			   dpuNumRows);
+
+		// Partition nonzeros and copy data
+		if (dpuNumRows > 0) {
+
+			// Find DPU's CSR matrix partition
+			uint32_t *dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
+			uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
+			struct Nonzero *dpuNonzeros_h =
+			    &nonzeros[dpuRowPtrsOffset];
+			uint32_t dpuNumNonzeros =
+			    dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
+
+			// Allocate MRAM
+			uint32_t dpuRowPtrs_m =
+			    mram_heap_alloc(&allocator,
+					    (dpuNumRows +
+					     1) * sizeof(uint32_t));
+			uint32_t dpuNonzeros_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumNonzeros *
+					    sizeof(struct Nonzero));
+			uint32_t dpuInVector_m =
+			    mram_heap_alloc(&allocator,
+					    numCols * sizeof(float));
+			uint32_t dpuOutVector_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumRows * sizeof(float));
+			assert((dpuNumRows * sizeof(float)) % 8 == 0
+			       &&
+			       "Output sub-vector must be a multiple of 8 bytes!");
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Total memory allocated is %d bytes",
+				   allocator.totalAllocated);
+
+			// Set up DPU parameters
+			dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
+			dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
+			dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
+			dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
+			dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
+
+			// Send data to DPU
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Copying data to DPU");
+			startTimer(&timer);
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuRowPtrs_m, (uint8_t *) dpuRowPtrs_h,
+						ROUND_UP_TO_MULTIPLE_OF_8((dpuNumRows + 1) * sizeof(uint32_t))));
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuNonzeros_m, (uint8_t *) dpuNonzeros_h,
+						ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNonzeros * sizeof(struct Nonzero))));
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuInVector_m, (uint8_t *) inVector,
+						ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float))));
+			stopTimer(&timer);
+			writeTime += getElapsedTime(timer);
+		}
+		// Send parameters to DPU
+		PRINT_INFO(p.verbosity >= 2,
+			   "        Copying parameters to DPU");
+		startTimer(&timer);
+		DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+					dpuParams_m, (uint8_t *) & dpuParams[dpuIdx],
+					ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))));
+		stopTimer(&timer);
+		writeTime += getElapsedTime(timer);
+
+		++dpuIdx;
+
+	}
+	PRINT_INFO(p.verbosity >= 1, "    CPU-DPU Time: %f ms",
+		   writeTime * 1e3);
+
+	// Run all DPUs
+	PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
+	startTimer(&timer);
+#if ENERGY
+	DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+	DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+#if ENERGY
+	DPU_ASSERT(dpu_probe_stop(&probe));
+	double energy;
+	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+	PRINT_INFO(p.verbosity >= 1, "    DPU Energy: %f J", energy);
+#endif
+	stopTimer(&timer);
+	dpuTime += getElapsedTime(timer);
+	PRINT_INFO(p.verbosity >= 1, "    DPU Time: %f ms", dpuTime * 1e3);
+
+	// Copy back result
+	PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+	startTimer(&timer);
+	dpuIdx = 0;
+
+	DPU_FOREACH(dpu_set, dpu) {
+		unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
+		if (dpuNumRows > 0) {
+			uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+			DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuParams[dpuIdx].dpuOutVector_m,
+						(uint8_t *) (outVector + dpuStartRowIdx),
+						ROUND_UP_TO_MULTIPLE_OF_8(dpuNumRows * sizeof(float))));
+		}
+		++dpuIdx;
+	}
+	stopTimer(&timer);
+	readTime += getElapsedTime(timer);
+	PRINT_INFO(p.verbosity >= 1, "    DPU-CPU Time: %f ms", readTime * 1e3);
+
+	// Calculating result on CPU
+	PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+	float *outVectorReference = (float*)malloc(numRows * sizeof(float));
+	for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+		float sum = 0.0f;
+		for (uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
+			uint32_t colIdx = nonzeros[i].col;
+			float value = nonzeros[i].value;
+			sum += inVector[colIdx] * value;
+		}
+		outVectorReference[rowIdx] = sum;
+	}
+
+	// Verify the result
+	PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+	int status = 1;
+	for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+		float diff =
+		    (outVectorReference[rowIdx] -
+		     outVector[rowIdx]) / outVectorReference[rowIdx];
+		const float tolerance = 0.00001;
+		if (diff > tolerance || diff < -tolerance) {
+			status = 0;
+			PRINT_ERROR
+			    ("Mismatch at index %u (CPU result = %f, DPU result = %f)",
+			     rowIdx, outVectorReference[rowIdx],
+			     outVector[rowIdx]);
+		}
+	}
+
+	startTimer(&timer);
+	DPU_ASSERT(dpu_free(dpu_set));
+	stopTimer(&timer);
+	freeTime += getElapsedTime(timer);
+
+	if (status) {
+		dfatool_printf
+		    ("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
+		     numDPUs, numRanks, NR_TASKLETS, "float",
+		     csrMatrix.numNonzeros);
+		dfatool_printf
+		    ("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+		     allocTime, loadTime, writeTime, dpuTime, readTime,
+		     freeTime);
+		dfatool_printf
+		    (" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+		     // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+		     csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((allocTime + loadTime + writeTime + dpuTime + readTime +
+		       freeTime) * 1e6));
+		dfatool_printf
+		    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((allocTime + loadTime + writeTime + dpuTime +
+		       readTime) * 1e6));
+		dfatool_printf
+		    (" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+		     // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+		     csrMatrix.numNonzeros / (dpuTime * 1e6),
+		     csrMatrix.numNonzeros /
+		     ((allocTime + loadTime + writeTime + dpuTime + readTime +
+		       freeTime) * 1e6));
+		dfatool_printf
+		    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+		     csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) *
+					      1e6),
+		     csrMatrix.numNonzeros /
+		     ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros /
+		     ((allocTime + loadTime + writeTime + dpuTime +
+		       readTime) * 1e6));
+	}
+	// Display DPU Logs
+	if (p.verbosity >= 2) {
+		PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+		dpuIdx = 0;
+		DPU_FOREACH(dpu_set, dpu) {
+			PRINT("DPU %u:", dpuIdx);
+			DPU_ASSERT(dpu_log_read(dpu, stdout));
+			++dpuIdx;
+		}
+	}
+	// Deallocate data structures
+	freeCOOMatrix(cooMatrix);
+	freeCSRMatrix(csrMatrix);
+	free(inVector);
+	free(outVector);
+	free(outVectorReference);
+
+	return 0;
 }
diff --git a/SpMV/host/mram-management.h b/SpMV/host/mram-management.h
index 627dfde..a953d6a 100644
--- a/SpMV/host/mram-management.h
+++ b/SpMV/host/mram-management.h
@@ -1,37 +1,29 @@
+#pragma once
 
-#ifndef _MRAM_MANAGEMENT_H_
-#define _MRAM_MANAGEMENT_H_
+#include "common.h"
+#include "utils.h"
 
-#include "../support/common.h"
-#include "../support/utils.h"
-
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20)	// A DPU's capacity is 64 MiB
 
 struct mram_heap_allocator_t {
-    uint32_t totalAllocated;
+	uint32_t totalAllocated;
 };
 
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
-    allocator->totalAllocated = 0;
-}
-
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
-    uint32_t ret = allocator->totalAllocated;
-    allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
-    if(allocator->totalAllocated > DPU_CAPACITY) {
-        PRINT_ERROR("        Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
-        exit(0);
-    }
-    return ret;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+	allocator->totalAllocated = 0;
 }
 
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
-    DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
+				uint32_t size)
+{
+	uint32_t ret = allocator->totalAllocated;
+	allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
+	if (allocator->totalAllocated > DPU_CAPACITY) {
+		PRINT_ERROR
+		    ("        Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!",
+		     allocator->totalAllocated, DPU_CAPACITY);
+		exit(0);
+	}
+	return ret;
 }
-
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
-    DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
-}
-
-#endif
-
diff --git a/SpMV/include/common.h b/SpMV/include/common.h
new file mode 100644
index 0000000..6118814
--- /dev/null
+++ b/SpMV/include/common.h
@@ -0,0 +1,24 @@
+
+/* Common data structures between host and DPUs */
+
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#define ROUND_UP_TO_MULTIPLE_OF_2(x)    ((((x) + 1)/2)*2)
+#define ROUND_UP_TO_MULTIPLE_OF_8(x)    ((((x) + 7)/8)*8)
+
+struct DPUParams {
+	uint32_t dpuNumRows;	/* Number of rows assigned to the DPU */
+	uint32_t dpuRowPtrsOffset;	/* Offset of the row pointers */
+	uint32_t dpuRowPtrs_m;
+	uint32_t dpuNonzeros_m;
+	uint32_t dpuInVector_m;
+	uint32_t dpuOutVector_m;
+};
+
+struct Nonzero {
+	uint32_t col;
+	float value;
+};
+
+#endif
diff --git a/SpMV/include/dfatool_host.ah b/SpMV/include/dfatool_host.ah
new file mode 100644
index 0000000..91d44bd
--- /dev/null
+++ b/SpMV/include/dfatool_host.ah
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+	unsigned long n_rows, n_cols, n_nonzero;
+	unsigned int element_size;
+
+	virtual int getKernel() { return 1; }
+
+	DfatoolHostTiming() {
+		element_size = sizeof(float);
+	}
+
+	advice call("% input_params(...)"): after() {
+		printf("[>>] SpMV | n_dpus=%u\n", NR_DPUS);
+	}
+
+	advice call("% readCOOMatrix(...)") : after() {
+		struct COOMatrix* c = tjp->result();
+		n_rows = c->numRows;
+		n_cols = c->numCols;
+		n_nonzero = c->numNonzeros;
+		printf("[--] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero);
+	}
+
+	advice execution("% main(...)") : after() {
+		printf("[<<] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero);
+	}
+};
diff --git a/SpMV/include/matrix.h b/SpMV/include/matrix.h
new file mode 100644
index 0000000..ce8745e
--- /dev/null
+++ b/SpMV/include/matrix.h
@@ -0,0 +1,138 @@
+
+#ifndef _MATRIX_H_
+#define _MATRIX_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "common.h"
+#include "utils.h"
+
+struct COOMatrix {
+	uint32_t numRows;
+	uint32_t numCols;
+	uint32_t numNonzeros;
+	uint32_t *rowIdxs;
+	struct Nonzero *nonzeros;
+};
+
+struct CSRMatrix {
+	uint32_t numRows;
+	uint32_t numCols;
+	uint32_t numNonzeros;
+	uint32_t *rowPtrs;
+	struct Nonzero *nonzeros;
+};
+
+static struct COOMatrix readCOOMatrix(const char *fileName)
+{
+
+	struct COOMatrix cooMatrix;
+
+	// Initialize fields
+	FILE *fp = fopen(fileName, "r");
+	assert(fscanf(fp, "%u", &cooMatrix.numRows));
+	if (cooMatrix.numRows % 2 == 1) {
+		PRINT_WARNING
+		    ("Reading matrix %s: number of rows must be even. Padding with an extra row.",
+		     fileName);
+		cooMatrix.numRows++;
+	}
+	assert(fscanf(fp, "%u", &cooMatrix.numCols));
+	assert(fscanf(fp, "%u", &cooMatrix.numNonzeros));
+	cooMatrix.rowIdxs =
+	    (uint32_t *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   (cooMatrix.numNonzeros * sizeof(uint32_t)));
+	cooMatrix.nonzeros =
+	    (struct Nonzero *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   (cooMatrix.numNonzeros * sizeof(struct Nonzero)));
+
+	// Read the nonzeros
+	for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+		uint32_t rowIdx;
+		assert(fscanf(fp, "%u", &rowIdx));
+		cooMatrix.rowIdxs[i] = rowIdx - 1;	// File format indexes begin at 1
+		uint32_t colIdx;
+		assert(fscanf(fp, "%u", &colIdx));
+		cooMatrix.nonzeros[i].col = colIdx - 1;	// File format indexes begin at 1
+		cooMatrix.nonzeros[i].value = 1.0f;
+	}
+
+	return cooMatrix;
+
+}
+
+static void freeCOOMatrix(struct COOMatrix cooMatrix)
+{
+	free(cooMatrix.rowIdxs);
+	free(cooMatrix.nonzeros);
+}
+
+static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix)
+{
+
+	struct CSRMatrix csrMatrix;
+
+	// Initialize fields
+	csrMatrix.numRows = cooMatrix.numRows;
+	csrMatrix.numCols = cooMatrix.numCols;
+	csrMatrix.numNonzeros = cooMatrix.numNonzeros;
+	csrMatrix.rowPtrs =
+	    (uint32_t *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   ((csrMatrix.numRows + 1) * sizeof(uint32_t)));
+	csrMatrix.nonzeros =
+	    (struct Nonzero *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   (csrMatrix.numNonzeros * sizeof(struct Nonzero)));
+
+	// Histogram rowIdxs
+	memset(csrMatrix.rowPtrs, 0,
+	       (csrMatrix.numRows + 1) * sizeof(uint32_t));
+	for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+		uint32_t rowIdx = cooMatrix.rowIdxs[i];
+		csrMatrix.rowPtrs[rowIdx]++;
+	}
+
+	// Prefix sum rowPtrs
+	uint32_t sumBeforeNextRow = 0;
+	for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+		uint32_t sumBeforeRow = sumBeforeNextRow;
+		sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
+		csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
+	}
+	csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
+
+	// Bin the nonzeros
+	for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+		uint32_t rowIdx = cooMatrix.rowIdxs[i];
+		uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
+		csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
+	}
+
+	// Restore rowPtrs
+	for (uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
+		csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
+	}
+	csrMatrix.rowPtrs[0] = 0;
+
+	return csrMatrix;
+
+}
+
+static void freeCSRMatrix(struct CSRMatrix csrMatrix)
+{
+	free(csrMatrix.rowPtrs);
+	free(csrMatrix.nonzeros);
+}
+
+static void initVector(float *vec, uint32_t size)
+{
+	for (uint32_t i = 0; i < size; ++i) {
+		vec[i] = 1.0f;
+	}
+}
+
+#endif
diff --git a/SpMV/include/params.h b/SpMV/include/params.h
new file mode 100644
index 0000000..bf60e79
--- /dev/null
+++ b/SpMV/include/params.h
@@ -0,0 +1,51 @@
+
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+#include "utils.h"
+
+static void usage()
+{
+	PRINT("\nUsage:  ./program [options]"
+	      "\n"
+	      "\nBenchmark-specific options:"
+	      "\n    -f <F>    input matrix file name (default=data/bcsstk30.mtx)"
+	      "\n"
+	      "\nGeneral options:"
+	      "\n    -v <V>    verbosity" "\n    -h        help" "\n\n");
+}
+
+typedef struct Params {
+	const char *fileName;
+	unsigned int verbosity;
+} Params;
+
+static struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.fileName = "data/bcsstk30.mtx";
+	p.verbosity = 1;
+	int opt;
+	while ((opt = getopt(argc, argv, "f:v:h")) >= 0) {
+		switch (opt) {
+		case 'f':
+			p.fileName = optarg;
+			break;
+		case 'v':
+			p.verbosity = atoi(optarg);
+			break;
+		case 'h':
+			usage();
+			exit(0);
+		default:
+			PRINT_ERROR("Unrecognized option!");
+			usage();
+			exit(0);
+		}
+	}
+
+	return p;
+}
+
+#endif
diff --git a/SpMV/include/timer.h b/SpMV/include/timer.h
new file mode 100644
index 0000000..cb513cb
--- /dev/null
+++ b/SpMV/include/timer.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <stdio.h>
+#include <sys/time.h>
+
+#if DFATOOL_TIMING
+
+#define dfatool_printf(fmt, ...) do { printf(fmt, __VA_ARGS__); } while (0)
+
+typedef struct Timer {
+	struct timeval startTime;
+	struct timeval endTime;
+} Timer;
+
+static void startTimer(Timer *timer)
+{
+	gettimeofday(&(timer->startTime), NULL);
+}
+
+static void stopTimer(Timer *timer)
+{
+	gettimeofday(&(timer->endTime), NULL);
+}
+
+static double getElapsedTime(Timer timer)
+{
+	return ((double)((timer.endTime.tv_sec - timer.startTime.tv_sec)
+			 + (timer.endTime.tv_usec -
+			    timer.startTime.tv_usec) / 1.0e6));
+}
+
+#else
+
+#define dfatool_printf(fmt, ...) do {} while (0)
+
+typedef int Timer;
+
+static void startTimer(Timer* timer)
+{
+	(void)timer;
+}
+
+static void stopTimer(Timer* timer)
+{
+	(void)timer;
+}
+
+static double getElapsedTime(Timer timer)
+{
+	(void)timer;
+	return 0.0;
+}
+
+#endif
diff --git a/SpMV/support/utils.h b/SpMV/include/utils.h
index ddb1e2c..ccd8fbd 100644
--- a/SpMV/support/utils.h
+++ b/SpMV/include/utils.h
@@ -8,4 +8,3 @@
 #define PRINT(fmt, ...)             printf(fmt "\n", ##__VA_ARGS__)
 
 #endif
-
diff --git a/SpMV/run-paper-strong-full.sh b/SpMV/run-paper-strong-full.sh
deleted file mode 100755
index 09b7085..0000000
--- a/SpMV/run-paper-strong-full.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SpMV strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-cd data/generate
-./replicate ../bcsstk30.mtx 64 ../bcsstk30.mtx.64.mtx
-cd ../..
-
-# >2048 is not in upstream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
-			# repetition is not part of upstream setup
-			for i in `seq 1 50`; do
-				timeout --foreground -k 1m 3m bin/host_code -v 0 -f data/bcsstk30.mtx.64.mtx || true
-			done
-		fi
-	done
-done
-) | tee log-paper-strong-full.txt
-
-rm -f data/bcsstk30.mtx.64.mtx
diff --git a/SpMV/run-paper-strong-rank.sh b/SpMV/run-paper-strong-rank.sh
deleted file mode 100755
index c73a6a0..0000000
--- a/SpMV/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SpMV strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >64 are not part of upstream
-for nr_dpus in 128 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
-			# repetition is not part of upstream setup
-			for i in `seq 1 50`; do
-				timeout --foreground -k 1m 3m bin/host_code -v 0 || true
-			done
-		fi
-	done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/SpMV/run-paper-weak.sh b/SpMV/run-paper-weak.sh
deleted file mode 100755
index 74683cc..0000000
--- a/SpMV/run-paper-weak.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SpMV weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 4 16 64; do
-	cd data/generate
-	make
-	./replicate ../bcsstk30.mtx ${nr_dpus} /tmp/bcsstk30.mtx.${nr_dpus}.mtx
-	cd ../..
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
-			# repetition is not part of upstream setup
-			for i in `seq 1 50`; do
-				timeout --foreground -k 1m 3m bin/host_code -v 0 -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx || true
-			done
-		fi
-	done
-	rm -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx
-done |
-) tee log-paper-weak.txt
diff --git a/SpMV/support/common.h b/SpMV/support/common.h
deleted file mode 100644
index 58fede8..0000000
--- a/SpMV/support/common.h
+++ /dev/null
@@ -1,25 +0,0 @@
-
-/* Common data structures between host and DPUs */
-
-#ifndef _COMMON_H_
-#define _COMMON_H_
-
-#define ROUND_UP_TO_MULTIPLE_OF_2(x)    ((((x) + 1)/2)*2)
-#define ROUND_UP_TO_MULTIPLE_OF_8(x)    ((((x) + 7)/8)*8)
-
-struct DPUParams {
-    uint32_t dpuNumRows; /* Number of rows assigned to the DPU */
-    uint32_t dpuRowPtrsOffset; /* Offset of the row pointers */
-    uint32_t dpuRowPtrs_m;
-    uint32_t dpuNonzeros_m;
-    uint32_t dpuInVector_m;
-    uint32_t dpuOutVector_m;
-};
-
-struct Nonzero {
-    uint32_t col;
-    float value;
-};
-
-#endif
-
diff --git a/SpMV/support/matrix.h b/SpMV/support/matrix.h
deleted file mode 100644
index d25da1b..0000000
--- a/SpMV/support/matrix.h
+++ /dev/null
@@ -1,119 +0,0 @@
-
-#ifndef _MATRIX_H_
-#define _MATRIX_H_
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "common.h"
-#include "utils.h"
-
-struct COOMatrix {
-    uint32_t numRows;
-    uint32_t numCols;
-    uint32_t numNonzeros;
-    uint32_t* rowIdxs;
-    struct Nonzero* nonzeros;
-};
-
-struct CSRMatrix {
-    uint32_t numRows;
-    uint32_t numCols;
-    uint32_t numNonzeros;
-    uint32_t* rowPtrs;
-    struct Nonzero* nonzeros;
-};
-
-static struct COOMatrix readCOOMatrix(const char* fileName) {
-
-    struct COOMatrix cooMatrix;
-
-    // Initialize fields
-    FILE* fp = fopen(fileName, "r");
-    assert(fscanf(fp, "%u", &cooMatrix.numRows));
-    if(cooMatrix.numRows%2 == 1) {
-        PRINT_WARNING("Reading matrix %s: number of rows must be even. Padding with an extra row.", fileName);
-        cooMatrix.numRows++;
-    }
-    assert(fscanf(fp, "%u", &cooMatrix.numCols));
-    assert(fscanf(fp, "%u", &cooMatrix.numNonzeros));
-    cooMatrix.rowIdxs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(uint32_t)));
-    cooMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(struct Nonzero)));
-
-    // Read the nonzeros
-    for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
-        uint32_t rowIdx;
-        assert(fscanf(fp, "%u", &rowIdx));
-        cooMatrix.rowIdxs[i] = rowIdx - 1; // File format indexes begin at 1
-        uint32_t colIdx;
-        assert(fscanf(fp, "%u", &colIdx));
-        cooMatrix.nonzeros[i].col = colIdx - 1; // File format indexes begin at 1
-        cooMatrix.nonzeros[i].value = 1.0f;
-    }
-
-    return cooMatrix;
-
-}
-
-static void freeCOOMatrix(struct COOMatrix cooMatrix) {
-    free(cooMatrix.rowIdxs);
-    free(cooMatrix.nonzeros);
-}
-
-static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix) {
-
-    struct CSRMatrix csrMatrix;
-
-    // Initialize fields
-    csrMatrix.numRows = cooMatrix.numRows;
-    csrMatrix.numCols = cooMatrix.numCols;
-    csrMatrix.numNonzeros = cooMatrix.numNonzeros;
-    csrMatrix.rowPtrs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8((csrMatrix.numRows + 1)*sizeof(uint32_t)));
-    csrMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrMatrix.numNonzeros*sizeof(struct Nonzero)));
-
-    // Histogram rowIdxs
-    memset(csrMatrix.rowPtrs, 0, (csrMatrix.numRows + 1)*sizeof(uint32_t));
-    for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
-        uint32_t rowIdx = cooMatrix.rowIdxs[i];
-        csrMatrix.rowPtrs[rowIdx]++;
-    }
-
-    // Prefix sum rowPtrs
-    uint32_t sumBeforeNextRow = 0;
-    for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
-        uint32_t sumBeforeRow = sumBeforeNextRow;
-        sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
-        csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
-    }
-    csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
-
-    // Bin the nonzeros
-    for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
-        uint32_t rowIdx = cooMatrix.rowIdxs[i];
-        uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
-        csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
-    }
-
-    // Restore rowPtrs
-    for(uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
-        csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
-    }
-    csrMatrix.rowPtrs[0] = 0;
-
-    return csrMatrix;
-
-}
-
-static void freeCSRMatrix(struct CSRMatrix csrMatrix) {
-    free(csrMatrix.rowPtrs);
-    free(csrMatrix.nonzeros);
-}
-
-static void initVector(float* vec, uint32_t size) {
-    for(uint32_t i = 0; i < size; ++i) {
-        vec[i] = 1.0f;
-    }
-}
-
-#endif
-
diff --git a/SpMV/support/params.h b/SpMV/support/params.h
deleted file mode 100644
index b4b696c..0000000
--- a/SpMV/support/params.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-#include "utils.h"
-
-static void usage() {
-    PRINT(  "\nUsage:  ./program [options]"
-            "\n"
-            "\nBenchmark-specific options:"
-            "\n    -f <F>    input matrix file name (default=data/bcsstk30.mtx)"
-            "\n"
-            "\nGeneral options:"
-            "\n    -v <V>    verbosity"
-            "\n    -h        help"
-            "\n\n");
-}
-
-typedef struct Params {
-  const char* fileName;
-  unsigned int verbosity;
-} Params;
-
-static struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.fileName      = "data/bcsstk30.mtx";
-    p.verbosity     = 1;
-    int opt;
-    while((opt = getopt(argc, argv, "f:v:h")) >= 0) {
-        switch(opt) {
-            case 'f': p.fileName    = optarg;       break;
-            case 'v': p.verbosity   = atoi(optarg); break;
-            case 'h': usage(); exit(0);
-            default:
-                      PRINT_ERROR("Unrecognized option!");
-                      usage();
-                      exit(0);
-        }
-    }
-
-    return p;
-}
-
-#endif
-
diff --git a/SpMV/support/timer.h b/SpMV/support/timer.h
deleted file mode 100644
index 66e9842..0000000
--- a/SpMV/support/timer.h
+++ /dev/null
@@ -1,27 +0,0 @@
-
-#ifndef _TIMER_H_
-#define _TIMER_H_
-
-#include <stdio.h>
-#include <sys/time.h>
-
-typedef struct Timer {
-    struct timeval startTime;
-    struct timeval endTime;
-} Timer;
-
-static void startTimer(Timer* timer) {
-    gettimeofday(&(timer->startTime), NULL);
-}
-
-static void stopTimer(Timer* timer) {
-    gettimeofday(&(timer->endTime), NULL);
-}
-
-static double getElapsedTime(Timer timer) {
-    return ((double) ((timer.endTime.tv_sec - timer.startTime.tv_sec)
-                   + (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
-}
-
-#endif
-