239 files changed, 8830 insertions, 7374 deletions
diff --git a/.gitignore b/.gitignore
index 9325fab..535bf9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,10 +3,13 @@ bin
 log-paper-strong-full.txt
 log-paper-strong-rank.txt
 log-paper-weak.txt
+*~
 *-O0.txt
 *-O2.txt
 *-explore.txt
+*.perf
 */bin
+*/repo.acp
 log-*.txt
 log
 
diff --git a/BFS/Makefile b/BFS/Makefile
index a4ea69d..a773b38 100644
--- a/BFS/Makefile
+++ b/BFS/Makefile
@@ -1,17 +1,37 @@
 NR_DPUS ?= 1
 NR_TASKLETS ?= 16
+WITH_ALLOC_OVERHEAD ?= 0
+WITH_LOAD_OVERHEAD ?= 0
+WITH_FREE_OVERHEAD ?= 0
 
-COMMON_INCLUDES := support
 HOST_SOURCES := $(wildcard host/*.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc}
 DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1)
 	QUIET =
 endif
 
@@ -20,11 +40,13 @@ all: bin/host_code bin/dpu_code
 bin:
 	${QUIET}mkdir -p bin
 
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+bin/dpu_code: ${DPU_SOURCES} include bin
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
 
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+bin/host_code: ${HOST_SOURCES} include bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
 
 clean:
 	${QUIET}rm -rf bin
diff --git a/BFS/baselines/cpu/Makefile b/BFS/baselines/cpu/Makefile
index 1f6ed3c..1efe457 100644
--- a/BFS/baselines/cpu/Makefile
+++ b/BFS/baselines/cpu/Makefile
@@ -1,8 +1,26 @@
-.PHONY: all
-all: bfs
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+numa_memcpy ?= 0
+LDFLAGS =
+CFLAGS =
+
+ifeq (${debug}, 1)
+	CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+	LDFLAGS += -lnuma
+endif
 
 bfs: app.c
-	gcc -Wall -Wextra -pedantic -march=native -O2 -o bfs -fopenmp app.c
+	gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -o bfs -fopenmp app.c ${LDFLAGS}
 
 bfs_O0: app.c
 	gcc -o bfs_O0 -fopenmp app.c
@@ -27,3 +45,5 @@ run_O2: bfs_O2
 .PHONY: clean
 clean:
 	rm -f bfs bfs_O0 bfs_O2
+
+.PHONY: all
diff --git a/BFS/baselines/cpu/app.c b/BFS/baselines/cpu/app.c
index caf4cbc..390b1f9 100644
--- a/BFS/baselines/cpu/app.c
+++ b/BFS/baselines/cpu/app.c
@@ -8,12 +8,30 @@
 
 #include <omp.h>
 
+#if NUMA
+#include <numaif.h>
+#include <numa.h>
+
+void* mp_pages[1];
+int mp_status[1];
+int mp_nodes[1];
+struct bitmask* bitmask_in;
+int numa_node_in = -1;
+int numa_node_cpu = -1;
+#endif
+
 #include "../../support/common.h"
 #include "../../support/graph.h"
 #include "../../support/params.h"
-#include "../../support/timer.h"
 #include "../../support/utils.h"
 
+#if WITH_BENCHMARK
+#include "../../support/timer.h"
+#else
+#define startTimer(...)
+#define stopTimer(...)
+#endif
+
 int main(int argc, char** argv) {
 
     // Process parameters
@@ -24,8 +42,9 @@ int main(int argc, char** argv) {
     struct COOGraph cooGraph = readCOOGraph(p.fileName);
     PRINT_INFO(p.verbosity >= 1, "    Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
 
-
+#if WITH_BENCHMARK
     Timer timer;
+#endif
     for(int rep = 0; rep < 100; rep++) {
 
         struct CSRGraph csrGraph = coo2csr(cooGraph);
@@ -43,6 +62,12 @@ int main(int argc, char** argv) {
         uint32_t* prevFrontier = buffer1;
         uint32_t* currFrontier = buffer2;
 
+#if NOP_SYNC
+        for(int rep = 0; rep < 200000; rep++) {
+            asm volatile("nop" ::);
+        }
+#endif
+
         // Calculating result on CPU
         startTimer(&timer, 0, 0);
         nodeLevel[srcNode] = 0;
@@ -86,6 +111,12 @@ int main(int argc, char** argv) {
         }
         stopTimer(&timer, 0);
 
+#if NOP_SYNC
+        for(int rep = 0; rep < 200000; rep++) {
+            asm volatile("nop" ::);
+        }
+#endif
+
         freeCSRGraph(csrGraph);
         free(buffer1);
         free(buffer2);
@@ -135,6 +166,7 @@ int main(int argc, char** argv) {
         }
         stopTimer(&timer, 1);
 
+#if WITH_BENCHMARK
         unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
@@ -158,8 +190,11 @@ int main(int argc, char** argv) {
             printf(" throughput_seq_MOpps=%f throughput_MOpps=%f",
                 csrGraph.numNodes / timer.time[1],
                 csrGraph.numNodes / timer.time[0]);
-            printAll(&timer, 1);
+            printf(" latency_us=%f latency_seq_us=%f\n",
+                timer.time[0],
+                timer.time[1]);
         }
+#endif // WITH_BENCHMARK
 
         freeCSRGraph(csrGraph);
         free(nodeLevel);
diff --git a/BFS/benchmark-scripts/ccmcc25-sim.sh b/BFS/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..bcbe284
--- /dev/null
+++ b/BFS/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -f ${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+sdk=2025.1.0	# SDK release: names the log file and selects the upmem_env.sh to source
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+source ~/lib/local/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks  BFS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+# BFS does not support repeated kernel invocations → repeat it here
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} \
+	::: i $(seq 0 4) \
+	::: data data/roadNet-CA.txt data/loc-gowalla_edges.txt \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+>> ${fn}.txt
diff --git a/BFS/benchmark-scripts/ccmcc25.sh b/BFS/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..0dcf4bb
--- /dev/null
+++ b/BFS/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -f ${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+	echo "prim-benchmarks  BFS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	# BFS does not support repeated kernel invocations → repeat it here
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any data={data} \
+		::: i $(seq 0 10) \
+		::: data data/roadNet-CA.txt data/loc-gowalla_edges.txt \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+	>> ${fn}.txt
+
+done
diff --git a/BFS/dpu/dpu-utils.h b/BFS/dpu/dpu-utils.h
index b02c073..dc986d2 100644
--- a/BFS/dpu/dpu-utils.h
+++ b/BFS/dpu/dpu-utils.h
@@ -6,39 +6,46 @@
 
 #define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m   "fmt"\n", ##__VA_ARGS__)
 
-static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
-    mram_read((__mram_ptr void const*)(ptr_m + idx*sizeof(uint64_t)), cache_w, 8);
-    return cache_w[0];
+static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w)
+{
+	mram_read((__mram_ptr void const *)(ptr_m + idx * sizeof(uint64_t)),
+		  cache_w, 8);
+	return cache_w[0];
 }
 
-static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
-    cache_w[0] = val;
-    mram_write(cache_w, (__mram_ptr void*)(ptr_m + idx*sizeof(uint64_t)), 8);
+static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx,
+		    uint64_t *cache_w)
+{
+	cache_w[0] = val;
+	mram_write(cache_w, (__mram_ptr void *)(ptr_m + idx * sizeof(uint64_t)),
+		   8);
 }
 
-static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
-    // Load 8B
-    uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t);
-    uint32_t offset = ((uint32_t)ptr_idx_m)%8;
-    uint32_t ptr_block_m = ptr_idx_m - offset;
-    mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8);
-    // Extract 4B
-    uint32_t* cache_32_w = (uint32_t*) cache_w;
-    return cache_32_w[offset/4];
+static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w)
+{
+	// Load 8B
+	uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t);
+	uint32_t offset = ((uint32_t) ptr_idx_m) % 8;
+	uint32_t ptr_block_m = ptr_idx_m - offset;
+	mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8);
+	// Extract 4B
+	uint32_t *cache_32_w = (uint32_t *) cache_w;
+	return cache_32_w[offset / 4];
 }
 
-static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
-    // Load 8B
-    uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t);
-    uint32_t offset = ((uint32_t)ptr_idx_m)%8;
-    uint32_t ptr_block_m = ptr_idx_m - offset;
-    mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8);
-    // Modify 4B
-    uint32_t* cache_32_w = (uint32_t*) cache_w;
-    cache_32_w[offset/4] = val;
-    // Write back 8B
-    mram_write(cache_w, (__mram_ptr void*)ptr_block_m, 8);
+static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx,
+		    uint64_t *cache_w)
+{
+	// Load 8B
+	uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t);
+	uint32_t offset = ((uint32_t) ptr_idx_m) % 8;
+	uint32_t ptr_block_m = ptr_idx_m - offset;
+	mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8);
+	// Modify 4B
+	uint32_t *cache_32_w = (uint32_t *) cache_w;
+	cache_32_w[offset / 4] = val;
+	// Write back 8B
+	mram_write(cache_w, (__mram_ptr void *)ptr_block_m, 8);
 }
 
 #endif
-
diff --git a/BFS/dpu/task.c b/BFS/dpu/task.c
index 43a2d0f..5275047 100644
--- a/BFS/dpu/task.c
+++ b/BFS/dpu/task.c
@@ -12,7 +12,7 @@
 #include <perfcounter.h>
 
 #include "dpu-utils.h"
-#include "../support/common.h"
+#include "common.h"
 
 BARRIER_INIT(my_barrier, NR_TASKLETS);
 
@@ -20,127 +20,155 @@ BARRIER_INIT(bfsBarrier, NR_TASKLETS);
 MUTEX_INIT(nextFrontierMutex);
 
 // main
-int main() {
-
-    if(me() == 0) {
-        mem_reset(); // Reset the heap
-    }
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    // Load parameters
-    uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
-    struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-    mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-
-    // Extract parameters
-    uint32_t numGlobalNodes = params_w->numNodes;
-    uint32_t startNodeIdx = params_w->dpuStartNodeIdx;
-    uint32_t numNodes = params_w->dpuNumNodes;
-    uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset;
-    uint32_t level = params_w->level;
-    uint32_t nodePtrs_m = params_w->dpuNodePtrs_m;
-    uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m;
-    uint32_t nodeLevel_m = params_w->dpuNodeLevel_m;
-    uint32_t visited_m = params_w->dpuVisited_m;
-    uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m;
-    uint32_t nextFrontier_m = params_w->dpuNextFrontier_m;
-
-    if(numNodes > 0) {
-
-        // Sanity check
-        if(me() == 0) {
-            if(numGlobalNodes%64 != 0) {
-                //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!");
-            }
-            if(startNodeIdx%64 != 0 || numNodes%64 != 0) {
-                //PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!");
-            }
-        }
-
-        // Allocate WRAM cache for each tasklet to use throughout
-        uint64_t* cache_w = mem_alloc(sizeof(uint64_t));
-
-        // Update current frontier and visited list based on the next frontier from the previous iteration
-        for(uint32_t nodeTileIdx = me(); nodeTileIdx < numGlobalNodes/64; nodeTileIdx += NR_TASKLETS) {
-
-            // Get the next frontier tile from MRAM
-            uint64_t nextFrontierTile = load8B(nextFrontier_m, nodeTileIdx, cache_w);
-
-            // Process next frontier tile if it is not empty 
-            if(nextFrontierTile) {
-
-                // Mark everything that was previously added to the next frontier as visited
-                uint64_t visitedTile = load8B(visited_m, nodeTileIdx, cache_w);
-                visitedTile |= nextFrontierTile;
-                store8B(visitedTile, visited_m, nodeTileIdx, cache_w);
-
-                // Clear the next frontier
-                store8B(0, nextFrontier_m, nodeTileIdx, cache_w);
-
-            }
-
-            // Extract the current frontier from the previous next frontier and update node levels
-            uint32_t startTileIdx = startNodeIdx/64;
-            uint32_t numTiles = numNodes/64;
-            if(startTileIdx <= nodeTileIdx && nodeTileIdx < startTileIdx + numTiles) {
-
-                // Update current frontier
-                store8B(nextFrontierTile, currentFrontier_m, nodeTileIdx - startTileIdx, cache_w);
-
-                // Update node levels
-                if(nextFrontierTile) {
-                    for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
-                        if(isSet(nextFrontierTile, node%64)) {
-                            store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks
-                        }
-                    }
-                }
-            }
-
-        }
-
-        // Wait until all tasklets have updated the current frontier
-        barrier_wait(&bfsBarrier);
-
-        // Identify tasklet's nodes
-        uint32_t numNodesPerTasklet = (numNodes + NR_TASKLETS - 1)/NR_TASKLETS;
-        uint32_t taskletNodesStart = me()*numNodesPerTasklet;
-        uint32_t taskletNumNodes;
-        if(taskletNodesStart > numNodes) {
-            taskletNumNodes = 0;
-        } else if(taskletNodesStart + numNodesPerTasklet > numNodes) {
-            taskletNumNodes = numNodes - taskletNodesStart;
-        } else {
-            taskletNumNodes = numNodesPerTasklet;
-        }
-
-        // Visit neighbors of the current frontier
-        mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex);
-        for(uint32_t node = taskletNodesStart; node < taskletNodesStart + taskletNumNodes; ++node) {
-            uint32_t nodeTileIdx = node/64;
-            uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile
-            if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier
-                // Visit its neighbors
-                uint32_t nodePtr = load4B(nodePtrs_m, node, cache_w) - nodePtrsOffset;
-                uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr
-                for(uint32_t i = nodePtr; i < nextNodePtr; ++i) {
-                    uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader
-                    uint32_t neighborTileIdx = neighbor/64;
-                    uint64_t visitedTile = load8B(visited_m, neighborTileIdx, cache_w);
-                    if(!isSet(visitedTile, neighbor%64)) { // Neighbor not previously visited
-                        // Add neighbor to next frontier
-                        mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention
-                        uint64_t nextFrontierTile = load8B(nextFrontier_m, neighborTileIdx, cache_w);
-                        setBit(nextFrontierTile, neighbor%64);
-                        store8B(nextFrontierTile, nextFrontier_m, neighborTileIdx, cache_w);
-                        mutex_unlock(mutexID);
-                    }
-                }
-            }
-        }
-
-    }
-
-    return 0;
+int main()
+{
+
+	if (me() == 0) {
+		mem_reset();	// Reset the heap
+	}
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	// Load parameters
+	uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+	struct DPUParams *params_w =
+	    (struct DPUParams *)
+	    mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+	mram_read((__mram_ptr void const *)params_m, params_w,
+		  ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+
+	// Extract parameters
+	uint32_t numGlobalNodes = params_w->numNodes;
+	uint32_t startNodeIdx = params_w->dpuStartNodeIdx;
+	uint32_t numNodes = params_w->dpuNumNodes;
+	uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset;
+	uint32_t level = params_w->level;
+	uint32_t nodePtrs_m = params_w->dpuNodePtrs_m;
+	uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m;
+	uint32_t nodeLevel_m = params_w->dpuNodeLevel_m;
+	uint32_t visited_m = params_w->dpuVisited_m;
+	uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m;
+	uint32_t nextFrontier_m = params_w->dpuNextFrontier_m;
+
+	if (numNodes > 0) {
+
+		// Sanity check
+		if (me() == 0) {
+			if (numGlobalNodes % 64 != 0) {
+				//PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!");
+			}
+			if (startNodeIdx % 64 != 0 || numNodes % 64 != 0) {
+				//PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!");
+			}
+		}
+		// Allocate WRAM cache for each tasklet to use throughout
+		uint64_t *cache_w = mem_alloc(sizeof(uint64_t));
+
+		// Update current frontier and visited list based on the next frontier from the previous iteration
+		for (uint32_t nodeTileIdx = me();
+		     nodeTileIdx < numGlobalNodes / 64;
+		     nodeTileIdx += NR_TASKLETS) {
+
+			// Get the next frontier tile from MRAM
+			uint64_t nextFrontierTile =
+			    load8B(nextFrontier_m, nodeTileIdx, cache_w);
+
+			// Process next frontier tile if it is not empty 
+			if (nextFrontierTile) {
+
+				// Mark everything that was previously added to the next frontier as visited
+				uint64_t visitedTile =
+				    load8B(visited_m, nodeTileIdx, cache_w);
+				visitedTile |= nextFrontierTile;
+				store8B(visitedTile, visited_m, nodeTileIdx,
+					cache_w);
+
+				// Clear the next frontier
+				store8B(0, nextFrontier_m, nodeTileIdx,
+					cache_w);
+
+			}
+			// Extract the current frontier from the previous next frontier and update node levels
+			uint32_t startTileIdx = startNodeIdx / 64;
+			uint32_t numTiles = numNodes / 64;
+			if (startTileIdx <= nodeTileIdx
+			    && nodeTileIdx < startTileIdx + numTiles) {
+
+				// Update current frontier
+				store8B(nextFrontierTile, currentFrontier_m,
+					nodeTileIdx - startTileIdx, cache_w);
+
+				// Update node levels
+				if (nextFrontierTile) {
+					for (uint32_t node = nodeTileIdx * 64;
+					     node < (nodeTileIdx + 1) * 64;
+					     ++node) {
+						if (isSet
+						    (nextFrontierTile,
+						     node % 64)) {
+							store4B(level, nodeLevel_m, node - startNodeIdx, cache_w);	// No false sharing so no need for locks
+						}
+					}
+				}
+			}
+
+		}
+
+		// Wait until all tasklets have updated the current frontier
+		barrier_wait(&bfsBarrier);
+
+		// Identify tasklet's nodes
+		uint32_t numNodesPerTasklet =
+		    (numNodes + NR_TASKLETS - 1) / NR_TASKLETS;
+		uint32_t taskletNodesStart = me() * numNodesPerTasklet;
+		uint32_t taskletNumNodes;
+		if (taskletNodesStart > numNodes) {
+			taskletNumNodes = 0;
+		} else if (taskletNodesStart + numNodesPerTasklet > numNodes) {
+			taskletNumNodes = numNodes - taskletNodesStart;
+		} else {
+			taskletNumNodes = numNodesPerTasklet;
+		}
+
+		// Visit neighbors of the current frontier
+		mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex);
+		for (uint32_t node = taskletNodesStart;
+		     node < taskletNodesStart + taskletNumNodes; ++node) {
+			uint32_t nodeTileIdx = node / 64;
+			uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w);	// TODO: Optimize: load tile then loop over nodes in the tile
+			if (isSet(currentFrontierTile, node % 64)) {	// If the node is in the current frontier
+				// Visit its neighbors
+				uint32_t nodePtr =
+				    load4B(nodePtrs_m, node,
+					   cache_w) - nodePtrsOffset;
+				uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset;	// TODO: Optimize: might be in the same 8B as nodePtr
+				for (uint32_t i = nodePtr; i < nextNodePtr; ++i) {
+					uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w);	// TODO: Optimize: sequential access to neighbors can use sequential reader
+					uint32_t neighborTileIdx =
+					    neighbor / 64;
+					uint64_t visitedTile =
+					    load8B(visited_m, neighborTileIdx,
+						   cache_w);
+					if (!isSet(visitedTile, neighbor % 64)) {	// Neighbor not previously visited
+						// Add neighbor to next frontier
+						mutex_lock(mutexID);	// TODO: Optimize: use more locks to reduce contention
+						uint64_t nextFrontierTile =
+						    load8B(nextFrontier_m,
+							   neighborTileIdx,
+							   cache_w);
+						setBit(nextFrontierTile,
+						       neighbor % 64);
+						store8B(nextFrontierTile,
+							nextFrontier_m,
+							neighborTileIdx,
+							cache_w);
+						mutex_unlock(mutexID);
+					}
+				}
+			}
+		}
+
+	}
+
+	return 0;
 }
diff --git a/BFS/host/app.c b/BFS/host/app.c
index 54b9cdc..4431193 100644
--- a/BFS/host/app.c
+++ b/BFS/host/app.c
@@ -3,9 +3,24 @@
 * BFS Host Application Source File
 *
 */
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
 
+#ifndef ENERGY
+#define ENERGY 0
+#endif
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #include <assert.h>
 #include <getopt.h>
 #include <stdio.h>
@@ -14,321 +29,436 @@
 #include <unistd.h>
 
 #include "mram-management.h"
-#include "../support/common.h"
-#include "../support/graph.h"
-#include "../support/params.h"
-#include "../support/timer.h"
-#include "../support/utils.h"
+#include "common.h"
+#include "graph.h"
+#include "params.h"
+#include "timer.h"
+#include "utils.h"
 
-#ifndef ENERGY
-#define ENERGY 0
+#define DPU_BINARY "./bin/dpu_code"
+
+// Main of the Host Application
+int main(int argc, char **argv)
+{
+
+	// Process parameters
+	struct Params p = input_params(argc, argv);
+
+	// Timer and profiling
+	Timer timer;
+#if ENERGY
+	struct dpu_probe_t probe;
+	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+	double tenergy = 0;
+#endif
+
+	// Allocate DPUs and load binary
+	struct dpu_set_t dpu_set, dpu;
+	uint32_t numDPUs, numRanks;
+
+#if WITH_ALLOC_OVERHEAD
+	startTimer(&timer, 0, 0);
+#endif
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+#if WITH_ALLOC_OVERHEAD
+	stopTimer(&timer, 0);
+#else
+	zeroTimer(&timer, 0);
+#endif
+
+#if WITH_LOAD_OVERHEAD
+	startTimer(&timer, 1, 0);	// slot 1 measures dpu_load()
+#endif
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+#if WITH_LOAD_OVERHEAD
+	stopTimer(&timer, 1);	// must stop the slot started above (was 0, the alloc slot)
+#else
+	zeroTimer(&timer, 1);
+#endif
+
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+	assert(NR_DPUS == numDPUs);
+	PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+	// Initialize BFS data structures
+	PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
+	struct COOGraph cooGraph = readCOOGraph(p.fileName);
+	PRINT_INFO(p.verbosity >= 1, "    Graph has %d nodes and %d edges",
+		   cooGraph.numNodes, cooGraph.numEdges);
+	struct CSRGraph csrGraph = coo2csr(cooGraph);
+	uint32_t numNodes = csrGraph.numNodes;
+	uint32_t *nodePtrs = csrGraph.nodePtrs;
+	uint32_t *neighborIdxs = csrGraph.neighborIdxs;
+	uint32_t *nodeLevel = (uint32_t*)calloc(numNodes, sizeof(uint32_t));	// Node's BFS level (initially all 0 meaning not reachable)
+	uint64_t *visited = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t));	// Bit vector with one bit per node
+	uint64_t *currentFrontier = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t));	// Bit vector with one bit per node
+	uint64_t *nextFrontier = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t));	// Bit vector with one bit per node
+	setBit(nextFrontier[0], 0);	// Initialize frontier to first node
+	uint32_t level = 1;
+
+	// Partition data structure across DPUs
+	uint32_t numNodesPerDPU =
+	    ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1) / numDPUs + 1);
+	PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU",
+		   numNodesPerDPU);
+	struct DPUParams dpuParams[numDPUs];
+	uint32_t dpuParams_m[numDPUs];
+	unsigned int dpuIdx = 0;
+	unsigned int t0ini = 0;
+	unsigned int t1ini = 0;
+	unsigned int t2ini = 0;
+	unsigned int t3ini = 0;
+	DPU_FOREACH(dpu_set, dpu) {
+
+		// Allocate parameters
+		struct mram_heap_allocator_t allocator;
+		init_allocator(&allocator);
+		dpuParams_m[dpuIdx] =
+		    mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+		// Find DPU's nodes
+		uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+		uint32_t dpuNumNodes;
+		if (dpuStartNodeIdx > numNodes) {
+			dpuNumNodes = 0;
+		} else if (dpuStartNodeIdx + numNodesPerDPU > numNodes) {
+			dpuNumNodes = numNodes - dpuStartNodeIdx;
+		} else {
+			dpuNumNodes = numNodesPerDPU;
+		}
+		dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
+		PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
+		PRINT_INFO(p.verbosity >= 2, "        Receives %u nodes",
+			   dpuNumNodes);
+
+		// Partition edges and copy data
+		if (dpuNumNodes > 0) {
+
+			// Find DPU's CSR graph partition
+			uint32_t *dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
+			uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
+			uint32_t *dpuNeighborIdxs_h =
+			    neighborIdxs + dpuNodePtrsOffset;
+			uint32_t dpuNumNeighbors =
+			    dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
+			uint32_t *dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
+
+			// Allocate MRAM
+			uint32_t dpuNodePtrs_m =
+			    mram_heap_alloc(&allocator,
+					    (dpuNumNodes +
+					     1) * sizeof(uint32_t));
+			uint32_t dpuNeighborIdxs_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumNeighbors * sizeof(uint32_t));
+			uint32_t dpuNodeLevel_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumNodes * sizeof(uint32_t));
+			uint32_t dpuVisited_m =
+			    mram_heap_alloc(&allocator,
+					    numNodes / 64 * sizeof(uint64_t));
+			uint32_t dpuCurrentFrontier_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumNodes / 64 *
+					    sizeof(uint64_t));
+			uint32_t dpuNextFrontier_m =
+			    mram_heap_alloc(&allocator,
+					    numNodes / 64 * sizeof(uint64_t));
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Total memory allocated is %d bytes",
+				   allocator.totalAllocated);
+
+			// Set up DPU parameters
+			dpuParams[dpuIdx].numNodes = numNodes;
+			dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
+			dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
+			dpuParams[dpuIdx].level = level;
+			dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
+			dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
+			dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
+			dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
+			dpuParams[dpuIdx].dpuCurrentFrontier_m =
+			    dpuCurrentFrontier_m;
+			dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
+
+			// Send data to DPU
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Copying data to DPU");
+			startTimer(&timer, 2, t0ini++);
+
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuNodePtrs_m, (uint8_t *) dpuNodePtrs_h,
+						ROUND_UP_TO_MULTIPLE_OF_8((dpuNumNodes + 1) * sizeof(uint32_t))));
+
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuNeighborIdxs_m, (uint8_t *) dpuNeighborIdxs_h,
+						ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNeighbors * sizeof(uint32_t))));
+
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuNodeLevel_m, (uint8_t *) dpuNodeLevel_h,
+						ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNodes * sizeof(uint32_t))));
+
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuVisited_m, (uint8_t *) visited,
+						ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuNextFrontier_m, (uint8_t *) nextFrontier,
+						ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+
+			// NOTE: No need to copy current frontier because it is written before being read
+			stopTimer(&timer, 2);
+			//loadTime += getElapsedTime(timer);
+
+		}
+		// Send parameters to DPU
+		PRINT_INFO(p.verbosity >= 2,
+			   "        Copying parameters to DPU");
+		startTimer(&timer, 2, t1ini++);
+		DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+					dpuParams_m[dpuIdx], (uint8_t *) & dpuParams[dpuIdx],
+					ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))));
+		stopTimer(&timer, 2);
+		//loadTime += getElapsedTime(timer);
+
+		++dpuIdx;
+
+	}
+
+	// Iterate until next frontier is empty
+	uint32_t nextFrontierEmpty = 0;
+	while (!nextFrontierEmpty) {
+
+		PRINT_INFO(p.verbosity >= 1,
+			   "Processing current frontier for level %u", level);
+
+#if ENERGY
+		DPU_ASSERT(dpu_probe_start(&probe));
 #endif
+		// Run all DPUs
+		PRINT_INFO(p.verbosity >= 1, "    Booting DPUs");
+		startTimer(&timer, 3, t2ini++);
+		DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+		stopTimer(&timer, 3);
+		//dpuTime += getElapsedTime(timer);
 #if ENERGY
-#include <dpu_probe.h>
+		DPU_ASSERT(dpu_probe_stop(&probe));
+		double energy;
+		DPU_ASSERT(dpu_probe_get
+			   (&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+		tenergy += energy;
 #endif
 
-#define DPU_BINARY "./bin/dpu_code"
+		// Copy back next frontier from all DPUs and compute their union as the current frontier
+		startTimer(&timer, 4, t3ini++);
+		dpuIdx = 0;
+		DPU_FOREACH(dpu_set, dpu) {
+			uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+			if (dpuNumNodes > 0) {
+				if (dpuIdx == 0) {
+					DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+								dpuParams[dpuIdx].dpuNextFrontier_m,
+								(uint8_t *) currentFrontier,
+								ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+				} else {
+					DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+								dpuParams[dpuIdx].dpuNextFrontier_m,
+								(uint8_t *) nextFrontier,
+								ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+					for (uint32_t i = 0; i < numNodes / 64;
+					     ++i) {
+						currentFrontier[i] |=
+						    nextFrontier[i];
+					}
+				}
+				++dpuIdx;
+			}
+		}
+
+		// Check if the next frontier is empty, and copy data to DPU if not empty
+		nextFrontierEmpty = 1;
+		for (uint32_t i = 0; i < numNodes / 64; ++i) {
+			if (currentFrontier[i]) {
+				nextFrontierEmpty = 0;
+				break;
+			}
+		}
+		if (!nextFrontierEmpty) {
+			++level;
+			dpuIdx = 0;
+			DPU_FOREACH(dpu_set, dpu) {
+				uint32_t dpuNumNodes =
+				    dpuParams[dpuIdx].dpuNumNodes;
+				if (dpuNumNodes > 0) {
+					// Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
+					DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+								dpuParams[dpuIdx].dpuNextFrontier_m,
+								(uint8_t *) currentFrontier,
+								ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+					// Copy new level to DPU
+					dpuParams[dpuIdx].level = level;
+					DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+								dpuParams_m[dpuIdx], (uint8_t *) &dpuParams[dpuIdx],
+								ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))));
+					++dpuIdx;
+				}
+			}
+		}
+		stopTimer(&timer, 4);
+		//hostTime += getElapsedTime(timer);
+
+	}
+
+	// Copy back node levels
+	PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+	startTimer(&timer, 5, 0);
+	dpuIdx = 0;
+	DPU_FOREACH(dpu_set, dpu) {
+		uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+		if (dpuNumNodes > 0) {
+			uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+			DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuParams[dpuIdx].dpuNodeLevel_m,
+						(uint8_t *) (nodeLevel + dpuStartNodeIdx),
+						ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNodes * sizeof(float))));
+		}
+		++dpuIdx;
+	}
+	stopTimer(&timer, 5);
+	//retrieveTime += getElapsedTime(timer);
+	//if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f    DPU Kernel Time (ms): %f    Inter-DPU Time (ms): %f    DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
+
+	// Calculating result on CPU
+	PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+	uint32_t *nodeLevelReference = (uint32_t*) calloc(numNodes, sizeof(uint32_t));	// Node's BFS level (initially all 0 meaning not reachable)
+	memset(nextFrontier, 0, numNodes / 64 * sizeof(uint64_t));
+	setBit(nextFrontier[0], 0);	// Initialize frontier to first node
+	nextFrontierEmpty = 0;
+	level = 1;
+	startTimer(&timer, 6, 0);
+	while (!nextFrontierEmpty) {
+		// Update current frontier and visited list based on the next frontier from the previous iteration
+		for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+		     ++nodeTileIdx) {
+			uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
+			currentFrontier[nodeTileIdx] = nextFrontierTile;
+			if (nextFrontierTile) {
+				visited[nodeTileIdx] |= nextFrontierTile;
+				nextFrontier[nodeTileIdx] = 0;
+				for (uint32_t node = nodeTileIdx * 64;
+				     node < (nodeTileIdx + 1) * 64; ++node) {
+					if (isSet(nextFrontierTile, node % 64)) {
+						nodeLevelReference[node] =
+						    level;
+					}
+				}
+			}
+		}
+		// Visit neighbors of the current frontier
+		nextFrontierEmpty = 1;
+		for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+		     ++nodeTileIdx) {
+			uint64_t currentFrontierTile =
+			    currentFrontier[nodeTileIdx];
+			if (currentFrontierTile) {
+				for (uint32_t node = nodeTileIdx * 64;
+				     node < (nodeTileIdx + 1) * 64; ++node) {
+					if (isSet(currentFrontierTile, node % 64)) {	// If the node is in the current frontier
+						// Visit its neighbors
+						uint32_t nodePtr =
+						    nodePtrs[node];
+						uint32_t nextNodePtr =
+						    nodePtrs[node + 1];
+						for (uint32_t i = nodePtr;
+						     i < nextNodePtr; ++i) {
+							uint32_t neighbor =
+							    neighborIdxs[i];
+							if (!isSet(visited[neighbor / 64], neighbor % 64)) {	// Neighbor not previously visited
+								// Add neighbor to next frontier
+								setBit
+								    (nextFrontier
+								     [neighbor /
+								      64],
+								     neighbor %
+								     64);
+								nextFrontierEmpty
+								    = 0;
+							}
+						}
+					}
+				}
+			}
+		}
+		++level;
+	}
+	stopTimer(&timer, 6);
+
+#if WITH_FREE_OVERHEAD
+	startTimer(&timer, 7);
+#endif
+	DPU_ASSERT(dpu_free(dpu_set));
+#if WITH_FREE_OVERHEAD
+	stopTimer(&timer, 7);
+#else
+	zeroTimer(&timer, 7);
+#endif
 
-// Main of the Host Application
-int main(int argc, char** argv) {
-
-    // Process parameters
-    struct Params p = input_params(argc, argv);
-
-    // Timer and profiling
-    Timer timer;
-    #if ENERGY
-    struct dpu_probe_t probe;
-    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
-    double tenergy=0;
-    #endif
-
-    // Allocate DPUs and load binary
-    struct dpu_set_t dpu_set, dpu;
-    uint32_t numDPUs;
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
-    PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
-    // Initialize BFS data structures
-    PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
-    struct COOGraph cooGraph = readCOOGraph(p.fileName);
-    PRINT_INFO(p.verbosity >= 1, "    Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
-    struct CSRGraph csrGraph = coo2csr(cooGraph);
-    uint32_t numNodes = csrGraph.numNodes;
-    uint32_t* nodePtrs = csrGraph.nodePtrs;
-    uint32_t* neighborIdxs = csrGraph.neighborIdxs;
-    uint32_t* nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
-    uint64_t* visited = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
-    uint64_t* currentFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
-    uint64_t* nextFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
-    setBit(nextFrontier[0], 0); // Initialize frontier to first node
-    uint32_t level = 1;
-
-    // Partition data structure across DPUs
-    uint32_t numNodesPerDPU = ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1)/numDPUs + 1);
-    PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU", numNodesPerDPU);
-    struct DPUParams dpuParams[numDPUs];
-    uint32_t dpuParams_m[numDPUs];
-    unsigned int dpuIdx = 0;
-    unsigned int t0ini = 0;
-    unsigned int t1ini = 0;
-    unsigned int t2ini = 0;
-    unsigned int t3ini = 0;
-    DPU_FOREACH (dpu_set, dpu) {
-
-        // Allocate parameters
-        struct mram_heap_allocator_t allocator;
-        init_allocator(&allocator);
-        dpuParams_m[dpuIdx] = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
-        // Find DPU's nodes
-        uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
-        uint32_t dpuNumNodes;
-        if(dpuStartNodeIdx > numNodes) {
-            dpuNumNodes = 0;
-        } else if(dpuStartNodeIdx + numNodesPerDPU > numNodes) {
-            dpuNumNodes = numNodes - dpuStartNodeIdx;
-        } else {
-            dpuNumNodes = numNodesPerDPU;
-        }
-        dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
-        PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
-        PRINT_INFO(p.verbosity >= 2, "        Receives %u nodes", dpuNumNodes);
-
-        // Partition edges and copy data
-        if(dpuNumNodes > 0) {
-
-            // Find DPU's CSR graph partition
-            uint32_t* dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
-            uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
-            uint32_t* dpuNeighborIdxs_h = neighborIdxs + dpuNodePtrsOffset;
-            uint32_t dpuNumNeighbors = dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
-            uint32_t* dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
-
-            // Allocate MRAM
-            uint32_t dpuNodePtrs_m = mram_heap_alloc(&allocator, (dpuNumNodes + 1)*sizeof(uint32_t));
-            uint32_t dpuNeighborIdxs_m = mram_heap_alloc(&allocator, dpuNumNeighbors*sizeof(uint32_t));
-            uint32_t dpuNodeLevel_m = mram_heap_alloc(&allocator, dpuNumNodes*sizeof(uint32_t));
-            uint32_t dpuVisited_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
-            uint32_t dpuCurrentFrontier_m = mram_heap_alloc(&allocator, dpuNumNodes/64*sizeof(uint64_t));
-            uint32_t dpuNextFrontier_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
-            PRINT_INFO(p.verbosity >= 2, "        Total memory allocated is %d bytes", allocator.totalAllocated);
-
-            // Set up DPU parameters
-            dpuParams[dpuIdx].numNodes = numNodes;
-            dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
-            dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
-            dpuParams[dpuIdx].level = level;
-            dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
-            dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
-            dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
-            dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
-            dpuParams[dpuIdx].dpuCurrentFrontier_m = dpuCurrentFrontier_m;
-            dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
-
-            // Send data to DPU
-            PRINT_INFO(p.verbosity >= 2, "        Copying data to DPU");
-            startTimer(&timer, 0, t0ini++);
-            copyToDPU(dpu, (uint8_t*)dpuNodePtrs_h, dpuNodePtrs_m, (dpuNumNodes + 1)*sizeof(uint32_t));
-            copyToDPU(dpu, (uint8_t*)dpuNeighborIdxs_h, dpuNeighborIdxs_m, dpuNumNeighbors*sizeof(uint32_t));
-            copyToDPU(dpu, (uint8_t*)dpuNodeLevel_h, dpuNodeLevel_m, dpuNumNodes*sizeof(uint32_t));
-            copyToDPU(dpu, (uint8_t*)visited, dpuVisited_m, numNodes/64*sizeof(uint64_t));
-            copyToDPU(dpu, (uint8_t*)nextFrontier, dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
-            // NOTE: No need to copy current frontier because it is written before being read
-            stopTimer(&timer, 0);
-            //loadTime += getElapsedTime(timer);
-
-        }
-
-        // Send parameters to DPU
-        PRINT_INFO(p.verbosity >= 2, "        Copying parameters to DPU");
-        startTimer(&timer, 1, t1ini++);
-        copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
-        stopTimer(&timer, 1);
-        //loadTime += getElapsedTime(timer);
-
-        ++dpuIdx;
-
-    }
-
-    // Iterate until next frontier is empty
-    uint32_t nextFrontierEmpty = 0;
-    while(!nextFrontierEmpty) {
-
-        PRINT_INFO(p.verbosity >= 1, "Processing current frontier for level %u", level);
-
-	#if ENERGY
-	DPU_ASSERT(dpu_probe_start(&probe));
-	#endif
-        // Run all DPUs
-        PRINT_INFO(p.verbosity >= 1, "    Booting DPUs");
-        startTimer(&timer, 2, t2ini++);
-        DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
-        stopTimer(&timer, 2);
-        //dpuTime += getElapsedTime(timer);
-	#if ENERGY
-    	DPU_ASSERT(dpu_probe_stop(&probe));
-    	double energy;
-    	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
-	tenergy += energy;
-	#endif
-
-
-
-        // Copy back next frontier from all DPUs and compute their union as the current frontier
-        startTimer(&timer, 3, t3ini++);
-        dpuIdx = 0;
-        DPU_FOREACH (dpu_set, dpu) {
-            uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
-            if(dpuNumNodes > 0) {
-                if(dpuIdx == 0) {
-                    copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)currentFrontier, numNodes/64*sizeof(uint64_t));
-                } else {
-                    copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)nextFrontier, numNodes/64*sizeof(uint64_t));
-                    for(uint32_t i = 0; i < numNodes/64; ++i) {
-                        currentFrontier[i] |= nextFrontier[i];
-                    }
-                }
-                ++dpuIdx;
-            }
-        }
-
-        // Check if the next frontier is empty, and copy data to DPU if not empty
-        nextFrontierEmpty = 1;
-        for(uint32_t i = 0; i < numNodes/64; ++i) {
-            if(currentFrontier[i]) {
-                nextFrontierEmpty = 0;
-                break;
-            }
-        }
-        if(!nextFrontierEmpty) {
-            ++level;
-            dpuIdx = 0;
-            DPU_FOREACH (dpu_set, dpu) {
-                uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
-                if(dpuNumNodes > 0) {
-                    // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
-                    copyToDPU(dpu, (uint8_t*)currentFrontier, dpuParams[dpuIdx].dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
-                    // Copy new level to DPU
-                    dpuParams[dpuIdx].level = level;
-                    copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
-                    ++dpuIdx;
-                }
-            }
-        }
-        stopTimer(&timer, 3);
-        //hostTime += getElapsedTime(timer);
-
-    }
-
-    // Copy back node levels
-    PRINT_INFO(p.verbosity >= 1, "Copying back the result");
-    startTimer(&timer, 4, 0);
-    dpuIdx = 0;
-    DPU_FOREACH (dpu_set, dpu) {
-        uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
-        if(dpuNumNodes > 0) {
-            uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
-            copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m, (uint8_t*)(nodeLevel + dpuStartNodeIdx), dpuNumNodes*sizeof(float));
-        }
-        ++dpuIdx;
-    }
-    stopTimer(&timer, 4);
-    //retrieveTime += getElapsedTime(timer);
-    //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f    DPU Kernel Time (ms): %f    Inter-DPU Time (ms): %f    DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
-
-    // Calculating result on CPU
-    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
-    uint32_t* nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
-    memset(nextFrontier, 0, numNodes/64*sizeof(uint64_t));
-    setBit(nextFrontier[0], 0); // Initialize frontier to first node
-    nextFrontierEmpty = 0;
-    level = 1;
-    while(!nextFrontierEmpty) {
-        // Update current frontier and visited list based on the next frontier from the previous iteration
-        for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
-            uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
-            currentFrontier[nodeTileIdx] = nextFrontierTile;
-            if(nextFrontierTile) {
-                visited[nodeTileIdx] |= nextFrontierTile;
-                nextFrontier[nodeTileIdx] = 0;
-                for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
-                    if(isSet(nextFrontierTile, node%64)) {
-                        nodeLevelReference[node] = level;
-                    }
-                }
-            }
-        }
-        // Visit neighbors of the current frontier
-        nextFrontierEmpty = 1;
-        for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
-            uint64_t currentFrontierTile = currentFrontier[nodeTileIdx];
-            if(currentFrontierTile) {
-                for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
-                    if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier
-                        // Visit its neighbors
-                        uint32_t nodePtr = nodePtrs[node];
-                        uint32_t nextNodePtr = nodePtrs[node + 1];
-                        for(uint32_t i = nodePtr; i < nextNodePtr; ++i) {
-                            uint32_t neighbor = neighborIdxs[i];
-                            if(!isSet(visited[neighbor/64], neighbor%64)) { // Neighbor not previously visited
-                                // Add neighbor to next frontier
-                                setBit(nextFrontier[neighbor/64], neighbor%64);
-                                nextFrontierEmpty = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        ++level;
-    }
-
-    // Verify the result
-    PRINT_INFO(p.verbosity >= 1, "Verifying the result");
-    int status = 1;
-    for(uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
-        if(nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
-            PRINT_ERROR("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", nodeIdx, nodeLevelReference[nodeIdx], nodeLevel[nodeIdx]);
-            status = 0;
-        }
-    }
-
-    if (status) {
-        printf("[::] BFS NMC | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d "
-            "| throughput_pim_MBps=%f throughput_MBps=%f",
-            numDPUs, NR_TASKLETS, "uint32_t", numNodes,
-            numNodes * sizeof(uint32_t) / (timer.time[2] + timer.time[3]),
-            numNodes * sizeof(uint32_t) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
-        printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
-            numNodes / (timer.time[2] + timer.time[3]),
-            numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
-        printAll(&timer, 4);
-    }
-
-    // Display DPU Logs
-    if(p.verbosity >= 2) {
-        PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
-        dpuIdx = 0;
-        DPU_FOREACH (dpu_set, dpu) {
-            PRINT("DPU %u:", dpuIdx);
-            DPU_ASSERT(dpu_log_read(dpu, stdout));
-            ++dpuIdx;
-        }
-    }
-
-    // Deallocate data structures
-    freeCOOGraph(cooGraph);
-    freeCSRGraph(csrGraph);
-    free(nodeLevel);
-    free(visited);
-    free(currentFrontier);
-    free(nextFrontier);
-    free(nodeLevelReference);
-
-    return 0;
+	// Verify the result
+	PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+	int status = 1;
+	for (uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
+		if (nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
+			PRINT_ERROR
+			    ("Mismatch at node %u (CPU result = level %u, DPU result = level %u)",
+			     nodeIdx, nodeLevelReference[nodeIdx],
+			     nodeLevel[nodeIdx]);
+			status = 0;
+		}
+	}
+
+	if (status) {
+		dfatool_printf
+		    ("[::] BFS-UMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d "
+		     "| throughput_pim_MBps=%f throughput_MBps=%f", numDPUs, numRanks,
+		     NR_TASKLETS, "uint32_t", numNodes,
+		     numNodes * sizeof(uint32_t) / (timer.time[2] +
+						    timer.time[3]),
+		     numNodes * sizeof(uint32_t) / (timer.time[0] +
+						    timer.time[1] +
+						    timer.time[2] +
+						    timer.time[3] +
+						    timer.time[4]));
+		dfatool_printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
+		       numNodes / (timer.time[2] + timer.time[3]),
+		       numNodes / (timer.time[0] + timer.time[1] +
+				   timer.time[2] + timer.time[3] +
+				   timer.time[4]));
+		dfatool_printf
+		    (" latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_cpu_us=%f latency_free_us=%f\n",
+		     timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+		     timer.time[4], timer.time[5], timer.time[6],
+		     timer.time[7]);
+	}
+	// Display DPU Logs
+	if (p.verbosity >= 2) {
+		PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+		dpuIdx = 0;
+		DPU_FOREACH(dpu_set, dpu) {
+			PRINT("DPU %u:", dpuIdx);
+			DPU_ASSERT(dpu_log_read(dpu, stdout));
+			++dpuIdx;
+		}
+	}
+	// Deallocate data structures
+	freeCOOGraph(cooGraph);
+	freeCSRGraph(csrGraph);
+	free(nodeLevel);
+	free(visited);
+	free(currentFrontier);
+	free(nextFrontier);
+	free(nodeLevelReference);
+
+	return 0;
 
 }
-
diff --git a/BFS/host/mram-management.h b/BFS/host/mram-management.h
index 627dfde..a953d6a 100644
--- a/BFS/host/mram-management.h
+++ b/BFS/host/mram-management.h
@@ -1,37 +1,29 @@
+#pragma once
 
-#ifndef _MRAM_MANAGEMENT_H_
-#define _MRAM_MANAGEMENT_H_
+#include "common.h"
+#include "utils.h"
 
-#include "../support/common.h"
-#include "../support/utils.h"
-
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20)	// A DPU's capacity is 64 MiB
 
 struct mram_heap_allocator_t {
-    uint32_t totalAllocated;
+	uint32_t totalAllocated;	// Bytes handed out so far; also the offset returned by the next allocation
 };
 
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
-    allocator->totalAllocated = 0;
-}
-
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
-    uint32_t ret = allocator->totalAllocated;
-    allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
-    if(allocator->totalAllocated > DPU_CAPACITY) {
-        PRINT_ERROR("        Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
-        exit(0);
-    }
-    return ret;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+	allocator->totalAllocated = 0;	// Empty heap: the next mram_heap_alloc() hands out offset 0
 }
 
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
-    DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator, uint32_t size)
+{
+	// Bump allocator: returns the current MRAM heap offset and reserves
+	// `size` bytes rounded up to a multiple of 8; terminates when the DPU is full.
+	uint32_t ret = allocator->totalAllocated;
+	allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
+	if (size > DPU_CAPACITY || allocator->totalAllocated > DPU_CAPACITY) {	// size test catches uint32_t wrap-around
+		PRINT_ERROR("        Total memory allocated is %u bytes which exceeds the DPU capacity (%d bytes)!",
+			    allocator->totalAllocated, DPU_CAPACITY);
+		exit(1);	// running out of MRAM is a failure, so do not report success
+	}
+	return ret;
 }
-
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
-    DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
-}
-
-#endif
-
diff --git a/BFS/include/common.h b/BFS/include/common.h
new file mode 100644
index 0000000..5f2aa0d
--- /dev/null
+++ b/BFS/include/common.h
@@ -0,0 +1,25 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#define ROUND_UP_TO_MULTIPLE_OF_2(x)    ((((x) + 1)/2)*2)
+#define ROUND_UP_TO_MULTIPLE_OF_8(x)    ((((x) + 7)/8)*8)
+#define ROUND_UP_TO_MULTIPLE_OF_64(x)   ((((x) + 63)/64)*64)
+
+#define setBit(val, idx) (val) |= (1ULL << (idx))	/* 1ULL: bit vectors are 64-bit tiles, idx spans 0..63 */
+#define isSet(val, idx)  ((val) & (1ULL << (idx)))	/* nonzero iff bit idx of val is set */
+
+struct DPUParams {
+	uint32_t dpuNumNodes;	/* The number of nodes assigned to this DPU */
+	uint32_t numNodes;	/* Total number of nodes in the graph  */
+	uint32_t dpuStartNodeIdx;	/* The index of the first node assigned to this DPU  */
+	uint32_t dpuNodePtrsOffset;	/* Offset of the node pointers */
+	uint32_t level;		/* The current BFS level */
+	uint32_t dpuNodePtrs_m;	/* MRAM heap offset of this DPU's node-pointer array */
+	uint32_t dpuNeighborIdxs_m;	/* MRAM heap offset of this DPU's neighbor-index array */
+	uint32_t dpuNodeLevel_m;	/* MRAM heap offset of the per-node BFS levels the host reads back */
+	uint32_t dpuVisited_m;	/* MRAM heap offset of the visited bit vector */
+	uint32_t dpuCurrentFrontier_m;	/* MRAM heap offset of the current-frontier bit vector */
+	uint32_t dpuNextFrontier_m;	/* MRAM heap offset of the next-frontier bit vector exchanged with the host each level */
+};
+
+#endif
diff --git a/BFS/include/dfatool_host.ah b/BFS/include/dfatool_host.ah
new file mode 100644
index 0000000..b2677e1
--- /dev/null
+++ b/BFS/include/dfatool_host.ah
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {	// Prints "[>>]", "[--]" and "[<<]" marker lines around the BFS host phases
+
+	unsigned long input_size;	// Node count of the CSR graph, captured after coo2csr() returns
+	unsigned int element_size;	// NOTE(review): not read in this aspect; presumably consumed by the base aspect -- verify
+
+	virtual int getKernel() { return 1; }	// NOTE(review): meaning of the value 1 is defined by DfatoolHostDPUTiming, not visible here
+
+	DfatoolHostTiming() {
+		element_size = sizeof(uint32_t);
+	}
+
+	advice call("% input_params(...)"): after() {	// after each call to input_params(): start marker
+		printf("[>>] BFS | n_dpus=%u\n", NR_DPUS);
+	}
+
+	advice call("% coo2csr(...)") : after() {	// after each call to coo2csr(): record and report the graph size
+		struct CSRGraph *g = tjp->result();	// pointer to the value coo2csr() returned
+		input_size = g->numNodes;
+		printf("[--] BFS | n_dpus=%u n_nodes=%lu\n", NR_DPUS, input_size);
+	}
+
+	advice execution("% main(...)") : after() {	// after main() has executed: end marker
+		printf("[<<] BFS | n_dpus=%u n_nodes=%lu\n", NR_DPUS, input_size);
+	}
+};
diff --git a/BFS/include/graph.h b/BFS/include/graph.h
new file mode 100644
index 0000000..2a19f67
--- /dev/null
+++ b/BFS/include/graph.h
@@ -0,0 +1,133 @@
+
+#ifndef _GRAPH_H_
+#define _GRAPH_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "common.h"
+#include "utils.h"
+
+struct COOGraph {
+	uint32_t numNodes;
+	uint32_t numEdges;
+	uint32_t *nodeIdxs;
+	uint32_t *neighborIdxs;
+};
+
+struct CSRGraph {
+	uint32_t numNodes;
+	uint32_t numEdges;
+	uint32_t *nodePtrs;
+	uint32_t *neighborIdxs;
+};
+
+static struct COOGraph readCOOGraph(const char *fileName)
+{
+	// Loads a graph in COO form from a text file laid out as
+	// "<rows> <cols> <edges>" followed by <edges> "<node> <neighbor>" pairs.
+	// Open, parse and allocation failures are fatal and are checked explicitly
+	// (not via assert) so the checks survive NDEBUG and also catch EOF.
+	struct COOGraph cooGraph;
+	uint32_t numNodes, numCols;
+	FILE *fp = fopen(fileName, "r");
+	if (fp == NULL || fscanf(fp, "%u %u", &numNodes, &numCols) != 2) {
+		PRINT_ERROR("Cannot read graph dimensions from %s", fileName);
+		exit(1);
+	}
+	cooGraph.numNodes = (numNodes > numCols) ? numNodes : numCols;
+	if (numNodes != numCols) {
+		PRINT_WARNING("    Adjacency matrix is not square. Padding matrix to be square.");
+	}
+	// Pad the node count to a multiple of 64
+	if (cooGraph.numNodes % 64 != 0) {
+		PRINT_WARNING("    Adjacency matrix dimension is %u which is not a multiple of 64 nodes.",
+			      cooGraph.numNodes);
+		cooGraph.numNodes += (64 - cooGraph.numNodes % 64);
+		PRINT_WARNING("        Padding to %u which is a multiple of 64 nodes.",
+			      cooGraph.numNodes);
+	}
+	if (fscanf(fp, "%u", &cooGraph.numEdges) != 1) {
+		PRINT_ERROR("Cannot read edge count from %s", fileName);
+		exit(1);
+	}
+	cooGraph.nodeIdxs = (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t));
+	cooGraph.neighborIdxs = (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t));
+	if (cooGraph.numEdges > 0 && (cooGraph.nodeIdxs == NULL || cooGraph.neighborIdxs == NULL)) {
+		PRINT_ERROR("Cannot allocate %u edges", cooGraph.numEdges);
+		exit(1);
+	}
+	// Read one "<node> <neighbor>" pair per edge
+	for (uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) {
+		if (fscanf(fp, "%u %u", &cooGraph.nodeIdxs[edgeIdx], &cooGraph.neighborIdxs[edgeIdx]) != 2) {
+			PRINT_ERROR("Cannot read edge %u from %s", edgeIdx, fileName);
+			exit(1);
+		}
+	}
+	fclose(fp);	// release the handle once all edges are in memory
+	return cooGraph;
+}
+
+static void freeCOOGraph(struct COOGraph cooGraph)
+{
+	free(cooGraph.neighborIdxs);	// edge targets
+	free(cooGraph.nodeIdxs);	// edge sources
+}
+
+static struct CSRGraph coo2csr(struct COOGraph cooGraph)
+{
+	// Builds the CSR form of a COO edge list: afterwards the neighbors of node n
+	// are neighborIdxs[nodePtrs[n]] .. neighborIdxs[nodePtrs[n + 1] - 1].
+	const uint32_t nodeCount = cooGraph.numNodes;
+	const uint32_t edgeCount = cooGraph.numEdges;
+	struct CSRGraph csrGraph;
+
+	csrGraph.numNodes = nodeCount;
+	csrGraph.numEdges = edgeCount;
+	// Zero-initialised offsets; the entry count is rounded up to an even number
+	csrGraph.nodePtrs =
+	    (uint32_t *) calloc(ROUND_UP_TO_MULTIPLE_OF_2(nodeCount + 1), sizeof(uint32_t));
+	// Neighbor storage; the byte size is rounded up to a multiple of 8
+	csrGraph.neighborIdxs =
+	    (uint32_t *) malloc(ROUND_UP_TO_MULTIPLE_OF_8(edgeCount * sizeof(uint32_t)));
+
+	// Pass 1: count the edges leaving each node
+	for (uint32_t edge = 0; edge < edgeCount; ++edge) {
+		++csrGraph.nodePtrs[cooGraph.nodeIdxs[edge]];
+	}
+
+	// Pass 2: an exclusive prefix sum turns the counts into start offsets
+	uint32_t runningTotal = 0;
+	for (uint32_t node = 0; node < nodeCount; ++node) {
+		const uint32_t degree = csrGraph.nodePtrs[node];
+		csrGraph.nodePtrs[node] = runningTotal;
+		runningTotal += degree;
+	}
+	csrGraph.nodePtrs[nodeCount] = runningTotal;
+
+	// Pass 3: scatter each neighbor into its node's slice; the start offset is
+	// used as a write cursor and finishes at the start of the following slice
+	for (uint32_t edge = 0; edge < edgeCount; ++edge) {
+		const uint32_t slot = csrGraph.nodePtrs[cooGraph.nodeIdxs[edge]]++;
+		csrGraph.neighborIdxs[slot] = cooGraph.neighborIdxs[edge];
+	}
+
+	// Pass 4: shift the advanced cursors up by one node to recover the offsets
+	uint32_t idx = nodeCount - 1;
+	while (idx > 0) {
+		csrGraph.nodePtrs[idx] = csrGraph.nodePtrs[idx - 1];
+		--idx;
+	}
+	csrGraph.nodePtrs[0] = 0;
+
+	return csrGraph;
+
+}
+
+static void freeCSRGraph(struct CSRGraph csrGraph)
+{
+	free(csrGraph.neighborIdxs);	// neighbor list
+	free(csrGraph.nodePtrs);	// per-node offsets into the neighbor list
+}
+
+#endif
diff --git a/BFS/include/params.h b/BFS/include/params.h
new file mode 100644
index 0000000..f9169bc
--- /dev/null
+++ b/BFS/include/params.h
@@ -0,0 +1,67 @@
+
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+#include "utils.h"
+
+static void usage()
+{
+	// Print the command-line help; PRINT appends the final newline itself
+	PRINT("\n" "Usage:  ./program [options]\n" "\n"
+	      "Benchmark-specific options:\n"
+	      "    -f <F>    input matrix file name (default=data/roadNet-CA.txt)\n"
+	      "\n" "General options:\n"
+	      "    -v <V>    verbosity\n"
+	      "    -h        help\n" "\n");
+}
+
+typedef struct Params {
+	const char *fileName;	/* Input graph file, set by -f (default data/roadNet-CA.txt) */
+	unsigned int verbosity;	/* Verbosity level, set by -v (default 0) */
+#if NUMA
+	struct bitmask *bitmask_in;	/* numa_parse_nodestring() of the -A argument; NULL when -A is absent */
+	int numa_node_cpu;	/* Value of -C; -1 when -C is absent */
+#endif
+} Params;
+
+static struct Params input_params(int argc, char **argv)
+{
+	// Parses -f <file>, -v <level> and -h (plus -A/-C when built with NUMA)
+	struct Params p;
+	p.fileName = "data/roadNet-CA.txt";
+	p.verbosity = 0;
+#if NUMA
+	p.bitmask_in = NULL;
+	p.numa_node_cpu = -1;
+#endif
+	int opt;
+	while ((opt = getopt(argc, argv, "f:v:hA:C:")) >= 0) {
+		switch (opt) {
+		case 'f':
+			p.fileName = optarg;
+			break;
+		case 'v':
+			p.verbosity = atoi(optarg);
+			break;
+#if NUMA
+		case 'A':
+			p.bitmask_in = numa_parse_nodestring(optarg);
+			break;
+		case 'C':
+			p.numa_node_cpu = atoi(optarg);
+			break;
+#endif
+		case 'h':
+			usage();
+			exit(0);
+		default:	// getopt's '?' as well as -A/-C in a build without NUMA
+			PRINT_ERROR("Unrecognized option!");
+			usage();
+			exit(1);	// a rejected command line must not report success
+		}
+	}
+	return p;
+}
+
+#endif
diff --git a/BFS/include/timer.h b/BFS/include/timer.h
new file mode 100644
index 0000000..e85490f
--- /dev/null
+++ b/BFS/include/timer.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#define N_TIMERS 8	/* defined only while timer_base.h is included; #undef'd again below */
+#define startTimer start	/* startTimer/stopTimer/zeroTimer calls expand to start/stop/zero, */
+#define stopTimer stop	/* presumably the names declared by timer_base.h -- verify */
+#define zeroTimer zero
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/BFS/support/utils.h b/BFS/include/utils.h
index ddb1e2c..ccd8fbd 100644
--- a/BFS/support/utils.h
+++ b/BFS/include/utils.h
@@ -8,4 +8,3 @@
 #define PRINT(fmt, ...)             printf(fmt "\n", ##__VA_ARGS__)
 
 #endif
-
diff --git a/BFS/run-paper-strong-full.sh b/BFS/run-paper-strong-full.sh
deleted file mode 100755
index 42806a2..0000000
--- a/BFS/run-paper-strong-full.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks BFS strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >2048 are not part of upstream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
-			# repetition is not part of upstream setup
-			for i in `seq 1 50`; do
-				timeout --foreground -k 1m 5m bin/host_code -f data/loc-gowalla_edges.txt || true
-			done
-		fi
-	done
-done
-) | tee log-paper-strong-full.txt
diff --git a/BFS/run-paper-strong-rank.sh b/BFS/run-paper-strong-rank.sh
deleted file mode 100755
index e01d18a..0000000
--- a/BFS/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks BFS strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >64 are not part of upstream
-for nr_dpus in 128 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
-			# repetition is not part of upstream setup
-			for i in `seq 1 50`; do
-				timeout --foreground -k 1m 5m bin/host_code -f data/loc-gowalla_edges.txt || true
-			done
-		fi
-	done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/BFS/run-paper-weak.sh b/BFS/run-paper-weak.sh
deleted file mode 100755
index 121758a..0000000
--- a/BFS/run-paper-weak.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks BFS weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# 256 and 512 are not part of upstream
-for nr_dpus in 256 512 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
-			# repetition is not part of upstream setup
-			for i in `seq 1 50`; do
-				# upstream code uses some kind of generated rMat graphs, but does not provide instructions for reproduction
-				timeout --foreground -k 1m 3m bin/host_code -f data/loc-gowalla_edges.txt || true
-			done
-		fi
-	done
-done |
-) tee log-paper-weak.txt
diff --git a/BFS/run.sh b/BFS/run.sh
deleted file mode 100755
index 8f5bfb8..0000000
--- a/BFS/run.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# -f: input file (i.e., input size)
-# bin/host_code -f data/loc-gowalla_edges.txt
-
-# input size depends on file -> strong scaling only
-
-echo "prim-benchmarks BFS (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 2 4 8 16 32 64 128 256 512; do
-	for nr_tasklets in 1 2 3 4 6 8 10 12 16 20 24; do
-		for f in loc-gowalla_edges roadNet-CA; do
-			echo
-			if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then
-				for i in `seq 1 20`; do
-					timeout --foreground -k 1m 30m bin/host_code -f data/${f}.txt || true
-				done
-			fi
-		done
-	done
-done
diff --git a/BFS/support/common.h b/BFS/support/common.h
deleted file mode 100644
index ced324c..0000000
--- a/BFS/support/common.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _COMMON_H_
-#define _COMMON_H_
-
-#define ROUND_UP_TO_MULTIPLE_OF_2(x)    ((((x) + 1)/2)*2)
-#define ROUND_UP_TO_MULTIPLE_OF_8(x)    ((((x) + 7)/8)*8)
-#define ROUND_UP_TO_MULTIPLE_OF_64(x)   ((((x) + 63)/64)*64)
-
-#define setBit(val, idx) (val) |= (1 << (idx))
-#define isSet(val, idx)  ((val) & (1 << (idx)))
-
-struct DPUParams {
-    uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */
-    uint32_t numNodes; /* Total number of nodes in the graph  */
-    uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU  */
-    uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */
-    uint32_t level; /* The current BFS level */
-    uint32_t dpuNodePtrs_m;
-    uint32_t dpuNeighborIdxs_m;
-    uint32_t dpuNodeLevel_m;
-    uint32_t dpuVisited_m;
-    uint32_t dpuCurrentFrontier_m;
-    uint32_t dpuNextFrontier_m;
-};
-
-#endif
-
diff --git a/BFS/support/graph.h b/BFS/support/graph.h
deleted file mode 100644
index f89ff5c..0000000
--- a/BFS/support/graph.h
+++ /dev/null
@@ -1,116 +0,0 @@
-
-#ifndef _GRAPH_H_
-#define _GRAPH_H_
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "common.h"
-#include "utils.h"
-
-struct COOGraph {
-    uint32_t numNodes;
-    uint32_t numEdges;
-    uint32_t* nodeIdxs;
-    uint32_t* neighborIdxs;
-};
-
-struct CSRGraph {
-    uint32_t numNodes;
-    uint32_t numEdges;
-    uint32_t* nodePtrs;
-    uint32_t* neighborIdxs;
-};
-
-static struct COOGraph readCOOGraph(const char* fileName) {
-
-    struct COOGraph cooGraph;
-
-    // Initialize fields
-    FILE* fp = fopen(fileName, "r");
-    uint32_t numNodes, numCols;
-    assert(fscanf(fp, "%u", &numNodes));
-    assert(fscanf(fp, "%u", &numCols));
-    if(numNodes == numCols) {
-        cooGraph.numNodes = numNodes;
-    } else {
-        PRINT_WARNING("    Adjacency matrix is not square. Padding matrix to be square.");
-        cooGraph.numNodes = (numNodes > numCols)? numNodes : numCols;
-    }
-    if(cooGraph.numNodes%64 != 0) {
-        PRINT_WARNING("    Adjacency matrix dimension is %u which is not a multiple of 64 nodes.", cooGraph.numNodes);
-        cooGraph.numNodes += (64 - cooGraph.numNodes%64);
-        PRINT_WARNING("        Padding to %u which is a multiple of 64 nodes.", cooGraph.numNodes);
-    }
-    assert(fscanf(fp, "%u", &cooGraph.numEdges));
-    cooGraph.nodeIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t));
-    cooGraph.neighborIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t));
-
-    // Read the neighborIdxs
-    for(uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) {
-        uint32_t nodeIdx;
-        assert(fscanf(fp, "%u", &nodeIdx));
-        cooGraph.nodeIdxs[edgeIdx] = nodeIdx;
-        uint32_t neighborIdx;
-        assert(fscanf(fp, "%u", &neighborIdx));
-        cooGraph.neighborIdxs[edgeIdx] = neighborIdx;
-    }
-
-    return cooGraph;
-
-}
-
-static void freeCOOGraph(struct COOGraph cooGraph) {
-    free(cooGraph.nodeIdxs);
-    free(cooGraph.neighborIdxs);
-}
-
-static struct CSRGraph coo2csr(struct COOGraph cooGraph) {
-
-    struct CSRGraph csrGraph;
-
-    // Initialize fields
-    csrGraph.numNodes = cooGraph.numNodes;
-    csrGraph.numEdges = cooGraph.numEdges;
-    csrGraph.nodePtrs = (uint32_t*) calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1), sizeof(uint32_t));
-    csrGraph.neighborIdxs = (uint32_t*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrGraph.numEdges*sizeof(uint32_t)));
-
-    // Histogram nodeIdxs
-    for(uint32_t i = 0; i < cooGraph.numEdges; ++i) {
-        uint32_t nodeIdx = cooGraph.nodeIdxs[i];
-        csrGraph.nodePtrs[nodeIdx]++;
-    }
-
-    // Prefix sum nodePtrs
-    uint32_t sumBeforeNextNode = 0;
-    for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) {
-        uint32_t sumBeforeNode = sumBeforeNextNode;
-        sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx];
-        csrGraph.nodePtrs[nodeIdx] = sumBeforeNode;
-    }
-    csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode;
-
-    // Bin the neighborIdxs
-    for(uint32_t i = 0; i < cooGraph.numEdges; ++i) {
-        uint32_t nodeIdx = cooGraph.nodeIdxs[i];
-        uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++;
-        csrGraph.neighborIdxs[neighborListIdx] = cooGraph.neighborIdxs[i];
-    }
-
-    // Restore nodePtrs
-    for(uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) {
-        csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1];
-    }
-    csrGraph.nodePtrs[0] = 0;
-
-    return csrGraph;
-
-}
-
-static void freeCSRGraph(struct CSRGraph csrGraph) {
-    free(csrGraph.nodePtrs);
-    free(csrGraph.neighborIdxs);
-}
-
-#endif
-
diff --git a/BFS/support/params.h b/BFS/support/params.h
deleted file mode 100644
index f4f12e7..0000000
--- a/BFS/support/params.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-#include "utils.h"
-
-static void usage() {
-    PRINT(  "\nUsage:  ./program [options]"
-            "\n"
-            "\nBenchmark-specific options:"
-            "\n    -f <F>    input matrix file name (default=data/roadNet-CA.txt)"
-            "\n"
-            "\nGeneral options:"
-            "\n    -v <V>    verbosity"
-            "\n    -h        help"
-            "\n\n");
-}
-
-typedef struct Params {
-  const char* fileName;
-  unsigned int verbosity;
-} Params;
-
-static struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.fileName      = "data/roadNet-CA.txt";
-    p.verbosity     = 0;
-    int opt;
-    while((opt = getopt(argc, argv, "f:v:h")) >= 0) {
-        switch(opt) {
-            case 'f': p.fileName    = optarg;       break;
-            case 'v': p.verbosity   = atoi(optarg); break;
-            case 'h': usage(); exit(0);
-            default:
-                      PRINT_ERROR("Unrecognized option!");
-                      usage();
-                      exit(0);
-        }
-    }
-
-    return p;
-}
-
-#endif
-
diff --git a/BFS/support/timer.h b/BFS/support/timer.h
deleted file mode 100644
index 80719cf..0000000
--- a/BFS/support/timer.h
+++ /dev/null
@@ -1,34 +0,0 @@
-
-#ifndef _TIMER_H_
-#define _TIMER_H_
-
-#include <stdio.h>
-#include <sys/time.h>
-
-typedef struct Timer {
-    struct timeval startTime[5];
-    struct timeval stopTime[5];
-    double         time[5];
-} Timer;
-
-static void startTimer(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-static void stopTimer(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-static void printAll(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
-
-#endif
diff --git a/BS/Makefile b/BS/Makefile
index f9c3002..f5f0c67 100644
--- a/BS/Makefile
+++ b/BS/Makefile
@@ -7,17 +7,34 @@ WITH_LOAD_OVERHEAD ?= 0
 WITH_FREE_OVERHEAD ?= 0
 WITH_DPUINFO ?= 0
 
-COMMON_INCLUDES := support
 HOST_SOURCES := $(wildcard host/*.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-COMMON_FLAGS := -Wall -Wextra  -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DINPUT_SIZE=${INPUT_SIZE} -DPROBLEM_SIZE=${PROBLEM_SIZE} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra  -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DINPUT_SIZE=${INPUT_SIZE} -DPROBLEM_SIZE=${PROBLEM_SIZE} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing}
 DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1)
 	QUIET =
 endif
 
@@ -26,10 +43,12 @@ all: bin/bs_host bin/bs_dpu
 bin:
 	${QUIET}mkdir -p bin
 
-bin/bs_host: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+# dfatool_host_dpu.ah is staged in include/ for this build only; it is removed again even when the compiler fails.
+bin/bs_host: ${HOST_SOURCES} include bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}; rc=$$?; rm -f include/dfatool_host_dpu.ah; exit $$rc
 
-bin/bs_dpu: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+bin/bs_dpu: ${DPU_SOURCES} include bin
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
 
 clean:
diff --git a/BS/baselines/cpu/Makefile b/BS/baselines/cpu/Makefile
index b67602f..4c30f65 100644
--- a/BS/baselines/cpu/Makefile
+++ b/BS/baselines/cpu/Makefile
@@ -1,16 +1,30 @@
-NUMA ?= 0
-NUMA_MEMCPY ?= 0
-FLAGS =
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+numa_memcpy ?= 0
+
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+	CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
 
-ifeq (${NUMA}, 1)
-	FLAGS += -lnuma
+ifeq (${numa}, 1)
+	LDFLAGS += -lnuma
 endif
 
 .PHONY: all
 all: bs_omp
 
 bs_omp: bs_omp.c
-	gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} bs_omp.c -o bs_omp -fopenmp ${FLAGS}
+	gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} bs_omp.c -o bs_omp -fopenmp ${LDFLAGS}
 
 bs_omp_O0: bs_omp.c
 	gcc bs_omp.c -o bs_omp_O0 -fopenmp
diff --git a/BS/baselines/cpu/bs_omp.c b/BS/baselines/cpu/bs_omp.c
index 874299b..5084c41 100644
--- a/BS/baselines/cpu/bs_omp.c
+++ b/BS/baselines/cpu/bs_omp.c
@@ -7,265 +7,286 @@
 #include <assert.h>
 #include <time.h>
 #include <stdint.h>
+
+#if WITH_BENCHMARK
 #include "timer.h"
+#else
+#define start(...)
+#define stop(...)
+#endif
 
 #if NUMA
 #include <numaif.h>
 #include <numa.h>
 
-void* mp_pages[1];
+void *mp_pages[1];
 int mp_status[1];
 int mp_nodes[1];
-struct bitmask* bitmask_in;
+struct bitmask *bitmask_in;
 int numa_node_in = -1;
 int numa_node_cpu = -1;
 #endif
 
-
 #if NUMA_MEMCPY
-struct bitmask* bitmask_cpu;
+struct bitmask *bitmask_cpu;
 int numa_node_cpu_memcpy = -1;
 int numa_node_local = -1;
 int numa_node_in_is_local = 0;
 #endif
 
-
 #define DTYPE uint64_t
 /*
 * @brief creates a "test file" by filling a bufferwith values
 */
-void create_test_file(DTYPE * input, uint64_t  nr_elements, DTYPE * querys, uint64_t n_querys) {
+void create_test_file(DTYPE *input, uint64_t nr_elements, DTYPE *querys,
+		      uint64_t n_querys)
+{
 
-  srand(time(NULL));
+	srand(time(NULL));
 
-  input[0] = 1;
-  for (uint64_t i = 1; i < nr_elements; i++) {
-        input[i] = input[i - 1] + (rand() % 10) + 1;
-  }
+	input[0] = 1;
+	for (uint64_t i = 1; i < nr_elements; i++) {
+		input[i] = input[i - 1] + (rand() % 10) + 1;
+	}
 
-  for(uint64_t i = 0; i < n_querys; i++)
-  {
-	querys[i] = input[rand() % (nr_elements - 2)];
-  }
+	for (uint64_t i = 0; i < n_querys; i++) {
+		querys[i] = input[rand() % (nr_elements - 2)];
+	}
 }
 
 /**
 * @brief compute output in the host
 */
-uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigned n_querys)
+uint64_t binarySearch(DTYPE *input, uint64_t input_size, DTYPE *querys,
+		      unsigned n_querys)
 {
 
 	uint64_t found = -1;
 	uint64_t q, r, l, m;
-	
-       #pragma omp parallel for private(q,r,l,m)
-     	for(q = 0; q < n_querys; q++)
-      	{
+	/* Every thread adds to 'found', so it must be an OpenMP reduction to avoid a data race. */
+#pragma omp parallel for private(q,r,l,m) reduction(+:found)
+	for (q = 0; q < n_querys; q++) {
 		l = 0;
 		r = input_size;
-		while (l <= r) 
-		{
-	    		m = l + (r - l) / 2;
-
-	    		// Check if x is present at mid
-	     		if (input[m] == querys[q])
-			{	
-		    		found += m;
+		while (l <= r) {
+			m = l + (r - l) / 2;
+
+			// Check if x is present at mid
+			if (input[m] == querys[q]) {
+				found += m;
 				break;
 			}
-	    		// If x greater, ignore left half
-	    		if (input[m] < querys[q])
-			    	l = m + 1;
+			// If x greater, ignore left half
+			if (input[m] < querys[q])
+				l = m + 1;
 
-	    		// If x is smaller, ignore right half
+			// If x is smaller, ignore right half
 			else
-		    		r = m - 1;
-		
+				r = m - 1;
+
 		}
-       	}
+	}
 
-      	return found;
+	return found;
 }
 
   /**
   * @brief Main of the Host Application.
   */
-  int main(int argc, char **argv) {
-    (void)argc;
-    Timer timer;
-    uint64_t input_size = atol(argv[1]);
-    uint64_t n_querys = atol(argv[2]);
+int main(int argc, char **argv)
+{
+	(void)argc;
+#if WITH_BENCHMARK
+	Timer timer;
+#endif
+	uint64_t input_size = atol(argv[1]);
+	uint64_t n_querys = atol(argv[2]);
 #if NUMA
-    bitmask_in = numa_parse_nodestring(argv[3]);
-    numa_node_cpu = atoi(argv[4]);
+	bitmask_in = numa_parse_nodestring(argv[3]);
+	numa_node_cpu = atoi(argv[4]);
 #endif
 #if NUMA_MEMCPY
-    bitmask_cpu = numa_parse_nodestring(argv[5]);
-    numa_node_cpu_memcpy = atoi(argv[6]);
+	bitmask_cpu = numa_parse_nodestring(argv[5]);
+	numa_node_cpu_memcpy = atoi(argv[6]);
 #endif
 
-    printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys);
+	printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys);
 
 #if NUMA
-    if (bitmask_in) {
-        numa_set_membind(bitmask_in);
-        numa_free_nodemask(bitmask_in);
-    }
-    DTYPE * input = numa_alloc((input_size) * sizeof(DTYPE));
-    DTYPE * querys = numa_alloc((n_querys) * sizeof(DTYPE));
+	if (bitmask_in) {
+		numa_set_membind(bitmask_in);
+		numa_free_nodemask(bitmask_in);
+	}
+	DTYPE *input = numa_alloc((input_size) * sizeof(DTYPE));
+	DTYPE *querys = numa_alloc((n_querys) * sizeof(DTYPE));
 #else
-    DTYPE * input = malloc((input_size) * sizeof(DTYPE));
-    DTYPE * querys = malloc((n_querys) * sizeof(DTYPE));
+	DTYPE *input = malloc((input_size) * sizeof(DTYPE));
+	DTYPE *querys = malloc((n_querys) * sizeof(DTYPE));
 #endif
 
 #if NUMA
 #if NUMA_MEMCPY
-    if (bitmask_cpu) {
-        numa_set_membind(bitmask_cpu);
-        numa_free_nodemask(bitmask_cpu);
-    }
+	if (bitmask_cpu) {
+		numa_set_membind(bitmask_cpu);
+		numa_free_nodemask(bitmask_cpu);
+	}
 #else
-    struct bitmask *bitmask_all = numa_allocate_nodemask();
-    numa_bitmask_setall(bitmask_all);
-    numa_set_membind(bitmask_all);
-    numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
+	struct bitmask *bitmask_all = numa_allocate_nodemask();
+	numa_bitmask_setall(bitmask_all);
+	numa_set_membind(bitmask_all);
+	numa_free_nodemask(bitmask_all);
+#endif				// NUMA_MEMCPY
 #endif
 
-    DTYPE result_host = -1;
+	DTYPE result_host = -1;
 
-    // Create an input file with arbitrary data.
-    create_test_file(input, input_size, querys, n_querys);
+	// Create an input file with arbitrary data.
+	create_test_file(input, input_size, querys, n_querys);
 
 #if NUMA
-    mp_pages[0] = input;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(A)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_in = mp_status[0];
-    }
-
-    if (numa_node_cpu != -1) {
-        if (numa_run_on_node(numa_node_cpu) == -1) {
-            perror("numa_run_on_node");
-            numa_node_cpu = -1;
-        }
-    }
+	mp_pages[0] = input;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(A)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_in = mp_status[0];
+	}
+
+	if (numa_node_cpu != -1) {
+		if (numa_run_on_node(numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
 #endif
 
 #if NUMA_MEMCPY
-    numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+	numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+				 || (numa_node_cpu + 8 == numa_node_in)) * 1;
 #endif
 
 #if NUMA_MEMCPY
-    DTYPE *input_local = input;
-    DTYPE *querys_local = querys;
-    start(&timer, 1, 0);
-    if (!numa_node_in_is_local) {
-        input_local = numa_alloc((input_size) * sizeof(DTYPE));
-        querys_local = numa_alloc((n_querys) * sizeof(DTYPE));
-    }
-    stop(&timer, 1);
-    if (!numa_node_in_is_local) {
-        if (numa_node_cpu_memcpy != -1) {
-            if (numa_run_on_node(numa_node_cpu_memcpy) == -1) {
-                perror("numa_run_on_node");
-                numa_node_cpu_memcpy = -1;
-            }
-        }
-    }
-    start(&timer, 2, 0);
-    if (!numa_node_in_is_local) {
-        memcpy(input_local, input, input_size * sizeof(DTYPE));
-        memcpy(querys_local, querys, n_querys * sizeof(DTYPE));
-    } else {
-        input_local = input;
-        querys_local = querys;
-    }
-    stop(&timer, 2);
-    if (numa_node_cpu != -1) {
-        if (numa_run_on_node(numa_node_cpu) == -1) {
-            perror("numa_run_on_node");
-            numa_node_cpu = -1;
-        }
-    }
-    mp_pages[0] = input_local;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(input_local)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_local = mp_status[0];
-    }
+	DTYPE *input_local = input;
+	DTYPE *querys_local = querys;
+	start(&timer, 1, 0);
+	if (!numa_node_in_is_local) {
+		input_local = numa_alloc((input_size) * sizeof(DTYPE));
+		querys_local = numa_alloc((n_querys) * sizeof(DTYPE));
+	}
+	stop(&timer, 1);
+	if (!numa_node_in_is_local) {
+		if (numa_node_cpu_memcpy != -1) {
+			if (numa_run_on_node(numa_node_cpu_memcpy) == -1) {
+				perror("numa_run_on_node");
+				numa_node_cpu_memcpy = -1;
+			}
+		}
+	}
+	start(&timer, 2, 0);
+	if (!numa_node_in_is_local) {
+		memcpy(input_local, input, input_size * sizeof(DTYPE));
+		memcpy(querys_local, querys, n_querys * sizeof(DTYPE));
+	} else {
+		input_local = input;
+		querys_local = querys;
+	}
+	stop(&timer, 2);
+	if (numa_node_cpu != -1) {
+		if (numa_run_on_node(numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
+	mp_pages[0] = input_local;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(input_local)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_local = mp_status[0];
+	}
+#endif
+
+#if NOP_SYNC
+	for (int rep = 0; rep < 200000; rep++) {
+		asm volatile ("nop"::);
+	}
 #endif
 
-    start(&timer, 0, 0);
+	start(&timer, 0, 0);
 #if NUMA_MEMCPY
-    result_host = binarySearch(input_local, input_size - 1, querys_local, n_querys);
+	result_host =
+	    binarySearch(input_local, input_size - 1, querys_local, n_querys);
 #else
-    result_host = binarySearch(input, input_size - 1, querys, n_querys);
+	result_host = binarySearch(input, input_size - 1, querys, n_querys);
+#endif
+	stop(&timer, 0);
+
+#if NOP_SYNC
+	for (int rep = 0; rep < 200000; rep++) {
+		asm volatile ("nop"::);
+	}
 #endif
-    stop(&timer, 0);
 
 #if NUMA_MEMCPY
-    start(&timer, 3, 0);
-    if (!numa_node_in_is_local) {
-        numa_free(input_local, input_size * sizeof(DTYPE));
-        numa_free(querys_local, n_querys * sizeof(DTYPE));
-    }
-    stop(&timer, 3);
+	start(&timer, 3, 0);
+	if (!numa_node_in_is_local) {
+		numa_free(input_local, input_size * sizeof(DTYPE));
+		numa_free(querys_local, n_querys * sizeof(DTYPE));
+	}
+	stop(&timer, 3);
 #endif
 
-    unsigned int nr_threads = 0;
+	int status = (result_host);
+#if WITH_BENCHMARK
+	unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-    nr_threads++;
+	nr_threads++;
 
-    int status = (result_host);
-    if (status) {
+	if (status) {
 #if NUMA_MEMCPY
-        printf("[::] BS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%lu"
-            " numa_node_in=%d numa_node_local=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d"
-            " | throughput_MBps=%f throughput_MOpps=%f"
-            " latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
-            nr_threads, "uint64_t", input_size,
-            numa_node_in, numa_node_local, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu),
-            n_querys * sizeof(DTYPE) / timer.time[0], n_querys / timer.time[0],
-            timer.time[0], timer.time[1], timer.time[2], timer.time[3],
-            timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+		printf
+		    ("[::] BS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%lu"
+		     " numa_node_in=%d numa_node_local=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d"
+		     " | throughput_MBps=%f throughput_MOpps=%f"
+		     " latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+		     nr_threads, "uint64_t", input_size, numa_node_in,
+		     numa_node_local, numa_node_cpu, numa_node_cpu_memcpy,
+		     numa_distance(numa_node_in, numa_node_cpu),
+		     n_querys * sizeof(DTYPE) / timer.time[0],
+		     n_querys / timer.time[0], timer.time[0], timer.time[1],
+		     timer.time[2], timer.time[3],
+		     timer.time[0] + timer.time[1] + timer.time[2] +
+		     timer.time[3]);
 #else
-        printf("[::] BS-CPU | n_threads=%d e_type=%s n_elements=%lu"
+		printf("[::] BS-CPU | n_threads=%d e_type=%s n_elements=%lu"
 #if NUMA
-            " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d"
+		       " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d"
 #endif
-            " | throughput_MBps=%f",
-            nr_threads, "uint64_t", input_size,
+		       " | throughput_MBps=%f",
+		       nr_threads, "uint64_t", input_size,
 #if NUMA
-            numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
+		       numa_node_in, numa_node_cpu, numa_distance(numa_node_in,
+								  numa_node_cpu),
 #endif
-            n_querys * sizeof(DTYPE) / timer.time[0]);
-        printf(" throughput_MOpps=%f latency_us=%f\n",
-            n_querys / timer.time[0], timer.time[0]);
+		       n_querys * sizeof(DTYPE) / timer.time[0]);
+		printf(" throughput_MOpps=%f latency_us=%f\n",
+		       n_querys / timer.time[0], timer.time[0]);
 #endif
-    } else {
-        printf("[ERROR]\n");
-    }
+	} else {
+		printf("[ERROR]\n");
+	}
+#endif				// WITH_BENCHMARK
 
 #if NUMA
-    numa_free(input, input_size * sizeof(DTYPE));
-    numa_free(querys, n_querys * sizeof(DTYPE));
+	numa_free(input, input_size * sizeof(DTYPE));
+	numa_free(querys, n_querys * sizeof(DTYPE));
 #else
-    free(input);
-    free(querys);
+	free(input);
+	free(querys);
 #endif
 
-
-    return status ? 0 : 1;
+	return status ? 0 : 1;
 }
-
diff --git a/BS/baselines/cpu/run-perf.sh b/BS/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..5b671e0
--- /dev/null
+++ b/BS/baselines/cpu/run-perf.sh
@@ -0,0 +1,13 @@
+#!/bin/zsh
+#
+# Rebuild the OpenMP binary-search baseline with NUMA support and record perf
+# counters for a one-thread and a four-thread run (written to t1.perf / t4.perf).
+# Expects to be started from the directory that holds bs_omp and its Makefile.
+
+# Stop if the build fails; otherwise perf would profile a stale or missing bs_omp.
+make -B numa=1 || exit 1
+
+# -e takes the first column of every non-comment line in perf-events.txt, comma-joined.
+# bs_omp arguments: vector size, number of queries, input NUMA node string, CPU NUMA node.
+OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./bs_omp $((2**29)) 16777216 4 4
+OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./bs_omp $((2**29)) 16777216 4 4
diff --git a/BS/benchmark-scripts/ccmcc25-sim.sh b/BS/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..05e7f87
--- /dev/null
+++ b/BS/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# BS parameter sweep (nr_dpus x nr_elements x nr_queries, 16 tasklets) using the
+# UPMEM 2025.1.0 environment script in "simulator" mode.
+# Results are appended to log/<hostname>/ccmcc25-sim.txt.
+
+mkdir -p log/$(hostname)
+
+# Build and run a single configuration. Called with name=value arguments
+# (nr_dpus, nr_tasklets, nr_elements, nr_queries), which `local "$@"` turns
+# into local variables.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		INPUT_SIZE=${nr_elements} PROBLEM_SIZE=${nr_queries} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/bs_host -w 0 -e 5 2>&1
+}
+
+# Exported so that the shells started by parallel can call the function.
+export -f run_benchmark_nmc
+
+fn=log/$(hostname)/ccmcc25-sim
+
+# Refuse to run without the SDK environment: the sweep would otherwise use
+# whatever toolchain is already on PATH and still log it as a simulator run.
+sdk_env_script=~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh
+if [ ! -r "${sdk_env_script}" ]; then
+	echo "cannot read ${sdk_env_script}" >&2
+	exit 1
+fi
+source "${sdk_env_script}" simulator
+
+echo "prim-benchmarks  BS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+# One job at a time; --joblog/--resume let an interrupted sweep continue where it stopped.
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 nr_elements={nr_elements} nr_queries={nr_queries} \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+	::: nr_elements $((2**18)) $((2**19)) $((2**20)) $((2**21)) $((2**22)) \
+	::: nr_queries 512 1024 2048 4096 \
+>> ${fn}.txt
diff --git a/BS/benchmark-scripts/ccmcc25.sh b/BS/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..186baf6
--- /dev/null
+++ b/BS/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# BS parameter sweep (nr_dpus x nr_elements x nr_queries, 16 tasklets), repeated
+# for several UPMEM SDK releases. Results for each release are appended to
+# log/<hostname>/ccmcc25-sdk<version>.txt.
+
+mkdir -p log/$(hostname)
+
+# Build and run a single configuration. Called with name=value arguments
+# (nr_dpus, nr_tasklets, numa_rank, nr_elements, nr_queries), which
+# `local "$@"` turns into local variables.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		INPUT_SIZE=${nr_elements} PROBLEM_SIZE=${nr_queries} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/bs_host -w 0 -e 50 2>&1
+}
+
+# Exported so that the shells started by parallel can call the function.
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	# Skip releases that are not installed. Without this check a failed
+	# `source` would leave the previous release's environment active and
+	# its measurements would be logged under this release's name.
+	sdk_env_script=/opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+	if [ ! -r "${sdk_env_script}" ]; then
+		echo "skipping SDK ${sdk}: cannot read ${sdk_env_script}" >&2
+		continue
+	fi
+	source "${sdk_env_script}"
+
+	echo "prim-benchmarks  BS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	# One job at a time; --joblog/--resume let an interrupted sweep continue where it stopped.
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any nr_elements={nr_elements} nr_queries={nr_queries} \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+		::: nr_elements $((2**20)) $((2**21)) $((2**22)) \
+		::: nr_queries 524288 1048576 2097152 \
+	>> ${fn}.txt
+
+done
diff --git a/BS/benchmark-scripts/milos-hbm-cxl.sh b/BS/benchmark-scripts/milos-hbm-cxl.sh
new file mode 100755
index 0000000..79d02c7
--- /dev/null
+++ b/BS/benchmark-scripts/milos-hbm-cxl.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#
+# NUMA sweep of the OpenMP binary-search baseline (bs_omp) over thread counts,
+# CPU nodes and memory nodes. Presumably meant to be started from the BS
+# directory (it changes into baselines/cpu and writes its log below that).
+
+# Everything below uses paths relative to baselines/cpu, so stop if the
+# directory change or the build fails instead of benchmarking a stale binary.
+cd baselines/cpu || exit 1
+make -B numa=1 || exit 1
+
+mkdir -p log/$(hostname)
+fn=log/$(hostname)/milos-hbm-cxl
+
+# 16777216 * sizeof(uint64) == 128 MiB
+num_queries_hbm=16777216
+
+# Run bs_omp once. Called with name=value arguments (i, nr_threads, ram, cpu,
+# input_size, num_queries), which `local "$@"` turns into local variables;
+# i is not used by the command and only distinguishes repetitions.
+run_benchmark() {
+	local "$@"
+	OMP_NUM_THREADS=${nr_threads} ./bs_omp ${input_size} ${num_queries} $ram $cpu 2>&1
+	return $?
+}
+
+# Exported so that the shells started by parallel can call the function.
+export -f run_benchmark
+
+(
+
+echo "single-node execution, HBM ref (1/2)" >&2
+
+# 4 GiB
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+	run_benchmark i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \
+	input_size=$(perl -E 'say 2 ** 29') num_queries=${num_queries_hbm} \
+	::: i $(seq 1 5) \
+	::: nr_threads 1 2 4 8 12 16 \
+	::: cpu $(seq 0 7) \
+	::: ram $(seq 0 16)
+
+echo "multi-node execution, HBM ref (2/2)" >&2
+
+# 8 GiB
+parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
+	run_benchmark i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \
+	input_size=$(perl -E 'say 2 ** 30') num_queries=${num_queries_hbm} \
+	::: i $(seq 1 40) \
+	::: nr_threads 32 48 64 96 128 \
+	::: cpu -1 \
+	::: ram $(seq 0 16)
+
+) >> ${fn}.txt
diff --git a/BS/dimes-hetsim-hbm.sh b/BS/dimes-hetsim-hbm.sh
index 4e1500d..4a775ae 100755
--- a/BS/dimes-hetsim-hbm.sh
+++ b/BS/dimes-hetsim-hbm.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 cd baselines/cpu
-make -B NUMA=1
+make -B numa=1
 
 mkdir -p log/$(hostname)
 fn=log/$(hostname)/dimes-hetsim-hbm
diff --git a/BS/dimes-hetsim-nmc.sh b/BS/dimes-hetsim-nmc.sh
index 195334b..fa697bf 100755
--- a/BS/dimes-hetsim-nmc.sh
+++ b/BS/dimes-hetsim-nmc.sh
@@ -3,6 +3,8 @@
 mkdir -p log/$(hostname) baselines/cpu/log/$(hostname)
 fn=log/$(hostname)/dimes-hetsim-nmc
 
+source /opt/upmem/upmem-2024.1.0-Linux-x86_64/upmem_env.sh
+
 # upstream DPU version uses 2048576 * uint64 ≈ 16 MiB (DPU max: 64 MiB)
 # upstream DPU version uses 2 queries
 input_size_upstream=2048576
@@ -11,6 +13,8 @@ num_queries_upstream=2
 input_size_dpu=$(perl -E 'say 2 ** 22')
 num_queries_dpu=1048576
 
+# Make sure that num_queries > input_size!
+
 run_benchmark_nmc() {
 	local "$@"
 	set -e
@@ -69,7 +73,7 @@ cd baselines/cpu
 
 (
 
-make -B NUMA=1 NUMA_MEMCPY=1
+make -B numa=1 numa_memcpy=1
 
 echo "CPU single-node upstream-ref with memcpy, copy node == input node (1/6)" >&2
 
@@ -97,7 +101,7 @@ parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
 	:::+      cpu 0 1 \
 	::: nr_threads 1 2 4 8 12 16
 
-make -B NUMA=1
+make -B numa=1
 
 echo "CPU single-node upstream-ref (3/6)" >&2
 
diff --git a/BS/dpu/task.c b/BS/dpu/task.c
index acf66f2..5881dd1 100644
--- a/BS/dpu/task.c
+++ b/BS/dpu/task.c
@@ -17,140 +17,168 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 __host dpu_results_t DPU_RESULTS[NR_TASKLETS];
 
 // Search
-static DTYPE search(DTYPE *bufferA, DTYPE searching_for, size_t search_size) {
-  DTYPE found = -2;
-  if(bufferA[0] <= searching_for)
-  {
-    found = -1;
-    for (uint32_t i = 0; i < search_size / sizeof(DTYPE); i++){
-      if(bufferA[i] == searching_for)
-      {
-        found = i;
-        break;
-      }
-    }
-  }
-  return found;
+static DTYPE search(DTYPE *bufferA, DTYPE searching_for, size_t search_size)
+{
+	// Linear scan of one block. Returns -2 if the first element is
+	// already greater than the target, the element index on a hit,
+	// and -1 if every element was compared without a hit.
+	if (bufferA[0] > searching_for)
+		return -2;
+	const size_t n_elems = search_size / sizeof(DTYPE);
+	for (uint32_t idx = 0; idx < n_elems; idx++) {
+		if (bufferA[idx] == searching_for)
+			return idx;
+	}
+	return -1;
 }
 
 BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 extern int main_kernel1(void);
 
-int(*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
 
-int main(void){
-  // Kernel
-  return kernels[DPU_INPUT_ARGUMENTS.kernel]();
+int main(void)
+{
+	// Kernel
+	return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
 }
 
 // main_kernel1
-int main_kernel1() {
-  unsigned int tasklet_id = me();
-  #if PRINT
-  printf("tasklet_id = %u\n", tasklet_id);
-  #endif
-  if(tasklet_id == 0){
-    mem_reset(); // Reset the heap
-  }
-  // Barrier
-  barrier_wait(&my_barrier);
-
-  DTYPE searching_for, found;
-  uint64_t input_size = DPU_INPUT_ARGUMENTS.input_size;
-
-  // Address of the current processing block in MRAM
-  uint32_t start_mram_block_addr_A       = (uint32_t) DPU_MRAM_HEAP_POINTER;
-  uint32_t start_mram_block_addr_aux     = start_mram_block_addr_A;
-  uint32_t end_mram_block_addr_A         = start_mram_block_addr_A + sizeof(DTYPE) * input_size;
-  uint32_t current_mram_block_addr_query = end_mram_block_addr_A + tasklet_id * (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS) * sizeof(DTYPE);
-
-  // Initialize a local cache to store the MRAM block
-  DTYPE *cache_A     = (DTYPE *) mem_alloc(BLOCK_SIZE);
-  DTYPE *cache_aux_A = (DTYPE *) mem_alloc(BLOCK_SIZE);
-  DTYPE *cache_aux_B = (DTYPE *) mem_alloc(BLOCK_SIZE);
-
-  dpu_results_t *result = &DPU_RESULTS[tasklet_id];
-
-  for(uint64_t targets = 0; targets < (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS); targets++)
-  {
-    found = -1;
-
-    mram_read((__mram_ptr void const *) current_mram_block_addr_query, &searching_for, 8);
-    current_mram_block_addr_query += 8;
-
-    // Initialize input vector boundaries
-    start_mram_block_addr_A    = (uint32_t) DPU_MRAM_HEAP_POINTER;
-    start_mram_block_addr_aux  = start_mram_block_addr_A;
-    end_mram_block_addr_A      = start_mram_block_addr_A + sizeof(DTYPE) * input_size;
-
-    uint32_t current_mram_block_addr_A = start_mram_block_addr_A;
-
-    // Bring first and last values to WRAM
-    mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_aux_A, BLOCK_SIZE);
-    mram_read((__mram_ptr void const *) (end_mram_block_addr_A - BLOCK_SIZE * sizeof(DTYPE)),   cache_aux_B, BLOCK_SIZE);
-
-    while(1)
-    {
-      // Locate the address of the mid mram block
-      current_mram_block_addr_A = (start_mram_block_addr_A + end_mram_block_addr_A) / 2;
-      current_mram_block_addr_A &= WORD_MASK;
-      
-      // Boundary check
-      if(current_mram_block_addr_A < (start_mram_block_addr_A + BLOCK_SIZE))
-      {
-	// Search inside (start_mram_block_addr_A, start_mram_block_addr_A + BLOCK_SIZE)
-        mram_read((__mram_ptr void const *) start_mram_block_addr_A, cache_A, BLOCK_SIZE);
-        found = search(cache_A, searching_for, BLOCK_SIZE);
-
-        if(found > -1)
-        {
-          result->found = found + (start_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE);
-        }
-	// Search inside (start_mram_block_addr_A + BLOCK_SIZE, end_mram_block_addr_A)
-	else
-	{
-	  size_t remain_bytes_to_search = end_mram_block_addr_A - (start_mram_block_addr_A + BLOCK_SIZE);
-          mram_read((__mram_ptr void const *) start_mram_block_addr_A + BLOCK_SIZE, cache_A, remain_bytes_to_search);
-          found = search(cache_A, searching_for, remain_bytes_to_search);
-	  
-	  if(found > -1)
-          {
-            result->found = found + (start_mram_block_addr_A + BLOCK_SIZE - start_mram_block_addr_aux) / sizeof(DTYPE);
-          }
-	  else
-	  {
-	    printf("%lld NOT found\n", searching_for);
-	  }
+int main_kernel1()
+{
+	unsigned int tasklet_id = me();
+#if PRINT
+	printf("tasklet_id = %u\n", tasklet_id);
+#endif
+	if (tasklet_id == 0) {
+		mem_reset();	// Reset the heap
+	}
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	DTYPE searching_for, found;
+	uint64_t input_size = DPU_INPUT_ARGUMENTS.input_size;
+
+	// Address of the current processing block in MRAM
+	uint32_t start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
+	uint32_t start_mram_block_addr_aux = start_mram_block_addr_A;
+	uint32_t end_mram_block_addr_A =
+	    start_mram_block_addr_A + sizeof(DTYPE) * input_size;
+	uint32_t current_mram_block_addr_query =
+	    end_mram_block_addr_A +
+	    tasklet_id * (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS) *
+	    sizeof(DTYPE);
+
+	// Initialize a local cache to store the MRAM block
+	DTYPE *cache_A = (DTYPE *) mem_alloc(BLOCK_SIZE);
+	DTYPE *cache_aux_A = (DTYPE *) mem_alloc(BLOCK_SIZE);
+	DTYPE *cache_aux_B = (DTYPE *) mem_alloc(BLOCK_SIZE);
+
+	dpu_results_t *result = &DPU_RESULTS[tasklet_id];
+
+	for (uint64_t targets = 0;
+	     targets < (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS);
+	     targets++) {
+		found = -1;
+
+		mram_read((__mram_ptr void const *)
+			  current_mram_block_addr_query, &searching_for, 8);
+		current_mram_block_addr_query += 8;
+
+		// Initialize input vector boundaries
+		start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
+		start_mram_block_addr_aux = start_mram_block_addr_A;
+		end_mram_block_addr_A =
+		    start_mram_block_addr_A + sizeof(DTYPE) * input_size;
+
+		uint32_t current_mram_block_addr_A = start_mram_block_addr_A;
+
+		// Bring first and last values to WRAM
+		mram_read((__mram_ptr void const *)current_mram_block_addr_A,
+			  cache_aux_A, BLOCK_SIZE);
+		mram_read((__mram_ptr void const *)(end_mram_block_addr_A -
+						    BLOCK_SIZE * sizeof(DTYPE)),
+			  cache_aux_B, BLOCK_SIZE);
+
+		while (1) {
+			// Locate the address of the mid mram block
+			current_mram_block_addr_A =
+			    (start_mram_block_addr_A +
+			     end_mram_block_addr_A) / 2;
+			current_mram_block_addr_A &= WORD_MASK;
+
+			// Boundary check
+			if (current_mram_block_addr_A <
+			    (start_mram_block_addr_A + BLOCK_SIZE)) {
+				// Search inside (start_mram_block_addr_A, start_mram_block_addr_A + BLOCK_SIZE)
+				mram_read((__mram_ptr void const *)
+					  start_mram_block_addr_A, cache_A,
+					  BLOCK_SIZE);
+				found =
+				    search(cache_A, searching_for, BLOCK_SIZE);
+
+				if (found > -1) {
+					result->found =
+					    found + (start_mram_block_addr_A -
+						     start_mram_block_addr_aux)
+					    / sizeof(DTYPE);
+				}
+				// Search inside (start_mram_block_addr_A + BLOCK_SIZE, end_mram_block_addr_A)
+				else {
+					size_t remain_bytes_to_search =
+					    end_mram_block_addr_A -
+					    (start_mram_block_addr_A +
+					     BLOCK_SIZE);
+					mram_read((__mram_ptr void const *)
+						  start_mram_block_addr_A +
+						  BLOCK_SIZE, cache_A,
+						  remain_bytes_to_search);
+					found =
+					    search(cache_A, searching_for,
+						   remain_bytes_to_search);
+
+					if (found > -1) {
+						result->found =
+						    found +
+						    (start_mram_block_addr_A +
+						     BLOCK_SIZE -
+						     start_mram_block_addr_aux)
+						    / sizeof(DTYPE);
+					} else {
+						printf("%lld NOT found\n",
+						       searching_for);
+					}
+				}
+				break;
+			}
+			// Load cache with current MRAM block
+			mram_read((__mram_ptr void const *)
+				  current_mram_block_addr_A, cache_A,
+				  BLOCK_SIZE);
+
+			// Search inside block
+			found = search(cache_A, searching_for, BLOCK_SIZE);
+
+			// If found > -1, we found the searching_for query
+			if (found > -1) {
+				result->found =
+				    found + (current_mram_block_addr_A -
+					     start_mram_block_addr_aux) /
+				    sizeof(DTYPE);
+				break;
+			}
+			// If found == -2, we need to discard right part of the input vector
+			if (found == -2) {
+				end_mram_block_addr_A =
+				    current_mram_block_addr_A;
+			}
+			// If found == -1, we need to discard left part of the input vector
+			else if (found == -1) {
+				start_mram_block_addr_A =
+				    current_mram_block_addr_A;
+			}
+		}
 	}
-	break;
-      }
-      
-      // Load cache with current MRAM block
-      mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_A, BLOCK_SIZE);
-
-      // Search inside block
-      found = search(cache_A, searching_for, BLOCK_SIZE);
-
-      // If found > -1, we found the searching_for query
-      if(found > -1)
-      {
-        result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE);
-        break;
-      }
-
-      // If found == -2, we need to discard right part of the input vector
-      if(found == -2)
-      {
-        end_mram_block_addr_A     = current_mram_block_addr_A;
-      }
-
-      // If found == -1, we need to discard left part of the input vector
-      else if (found == -1)
-      {
-        start_mram_block_addr_A   = current_mram_block_addr_A;
-      }
-    }
-  }
-  return 0;
+	return 0;
 }
diff --git a/BS/host/app.c b/BS/host/app.c
index 10d76f1..90d016f 100644
--- a/BS/host/app.c
+++ b/BS/host/app.c
@@ -7,20 +7,28 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
-#include <dpu.h>
-#include <dpu_log.h>
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
 #include <time.h>
 
-#if ENERGY
-#include <dpu_probe.h>
+#if ASPECTC
+extern "C" {
 #endif
 
+#include <dpu.h>
+#include <dpu_log.h>
 #include <dpu_management.h>
 #include <dpu_target_macros.h>
 
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
@@ -31,7 +39,9 @@
 #define DPU_BINARY "./bin/bs_dpu"
 
 // Create input arrays
-void create_test_file(DTYPE * input, DTYPE * querys, uint64_t  nr_elements, uint64_t nr_querys) {
+void create_test_file(DTYPE *input, DTYPE *querys, uint64_t nr_elements,
+		      uint64_t nr_querys)
+{
 
 	input[0] = 1;
 	for (uint64_t i = 1; i < nr_elements; i++) {
@@ -43,12 +53,12 @@ void create_test_file(DTYPE * input, DTYPE * querys, uint64_t  nr_elements, uint
 }
 
 // Compute output in the host
-int64_t binarySearch(DTYPE * input, DTYPE * querys, DTYPE input_size, uint64_t num_querys)
+int64_t binarySearch(DTYPE *input, DTYPE *querys, DTYPE input_size,
+		     uint64_t num_querys)
 {
 	uint64_t result = -1;
 	DTYPE r;
-	for(uint64_t q = 0; q < num_querys; q++)
-	{
+	for (uint64_t q = 0; q < num_querys; q++) {
 		DTYPE l = 0;
 		r = input_size;
 		while (l <= r) {
@@ -57,92 +67,96 @@ int64_t binarySearch(DTYPE * input, DTYPE * querys, DTYPE input_size, uint64_t n
 			// XXX shouldn't this short-circuit?
 			// Check if x is present at mid
 			if (input[m] == querys[q])
-			result = m;
+				result = m;
 
 			// If x greater, ignore left half
 			if (input[m] < querys[q])
-			l = m + 1;
+				l = m + 1;
 
 			// If x is smaller, ignore right half
 			else
-			r = m - 1;
+				r = m - 1;
 		}
 	}
 	return result;
 }
 
-
 // Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
 	struct Params p = input_params(argc, argv);
 	struct dpu_set_t dpu_set, dpu;
 	uint32_t nr_of_dpus;
-    uint32_t nr_of_ranks;
+	uint32_t nr_of_ranks;
 	uint64_t input_size = INPUT_SIZE;
 	uint64_t num_querys = p.num_querys;
 	DTYPE result_host = -1;
-	DTYPE result_dpu  = -1;
+	DTYPE result_dpu = -1;
 
-    // Timer declaration
-    Timer timer;
+	// Timer declaration
+	Timer timer;
 
-    int numa_node_rank = -2;
+	int numa_node_rank = -2;
 
-    // Allocate DPUs and load binary
+	// Allocate DPUs and load binary
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    timer.time[0] = 0; // alloc
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+	zero(&timer, 0); // alloc
 #endif
 #if !WITH_LOAD_OVERHEAD
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-    assert(nr_of_dpus == NR_DPUS);
-    timer.time[1] = 0; // load
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+	assert(nr_of_dpus == NR_DPUS);
+	zero(&timer, 1); // load
 #endif
 #if !WITH_FREE_OVERHEAD
-    timer.time[6] = 0; // free
+	zero(&timer, 6); // free
 #endif
 
-	#if ENERGY
+#if ENERGY
 	struct dpu_probe_t probe;
 	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
-	#endif
+#endif
 
 	// Query number adjustement for proper partitioning
-	if(num_querys % (NR_DPUS * NR_TASKLETS))
-	num_querys = num_querys + (NR_DPUS * NR_TASKLETS - num_querys % (NR_DPUS * NR_TASKLETS));
+	if (num_querys % (NR_DPUS * NR_TASKLETS))
+		num_querys =
+		    num_querys + (NR_DPUS * NR_TASKLETS -
+				  num_querys % (NR_DPUS * NR_TASKLETS));
 
-	assert(num_querys % (NR_DPUS * NR_TASKLETS) == 0 && "Input dimension");    // Allocate input and querys vectors
+	assert(num_querys % (NR_DPUS * NR_TASKLETS) == 0 && "Input dimension");	// Allocate input and querys vectors
 
-	DTYPE * input  = malloc((input_size) * sizeof(DTYPE));
-	DTYPE * querys = malloc((num_querys) * sizeof(DTYPE));
+	DTYPE *input = (DTYPE*)malloc((input_size) * sizeof(DTYPE));
+	DTYPE *querys = (DTYPE*)malloc((num_querys) * sizeof(DTYPE));
 
 	// Create an input file with arbitrary data
 	create_test_file(input, querys, input_size, num_querys);
 
 	// Create kernel arguments
-	uint64_t slice_per_dpu          = num_querys / NR_DPUS;
-	dpu_arguments_t input_arguments = {input_size, slice_per_dpu, 0};
+	uint64_t slice_per_dpu = num_querys / NR_DPUS;
+	dpu_arguments_t input_arguments = { input_size, slice_per_dpu, (enum kernel)0 };
 
 	for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 		// Perform input transfers
 		uint64_t i = 0;
 
 #if WITH_ALLOC_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 0, 0);
 		}
 		DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 0);
 		}
 #endif
 #if WITH_DPUINFO
 		printf("DPUs:");
-		DPU_FOREACH (dpu_set, dpu) {
-			int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
+		DPU_FOREACH(dpu_set, dpu) {
+			int rank =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
 			int slice = dpu_get_slice_id(dpu_from_set(dpu));
 			int member = dpu_get_member_id(dpu_from_set(dpu));
 			printf(" %d(%d.%d)", rank, slice, member);
@@ -150,11 +164,11 @@ int main(int argc, char **argv) {
 		printf("\n");
 #endif
 #if WITH_LOAD_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 1, 0);
 		}
 		DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 1);
 		}
 		DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
@@ -164,27 +178,35 @@ int main(int argc, char **argv) {
 
 		// int prev_rank_id = -1;
 		int rank_id = -1;
-		DPU_FOREACH (dpu_set, dpu) {
-			rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
-			if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
+		DPU_FOREACH(dpu_set, dpu) {
+			rank_id =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
+			if ((numa_node_rank != -2)
+			    && numa_node_rank !=
+			    dpu_get_rank_numa_node(dpu_get_rank
+						   (dpu_from_set(dpu)))) {
 				numa_node_rank = -1;
 			} else {
-				numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
+				numa_node_rank =
+				    dpu_get_rank_numa_node(dpu_get_rank
+							   (dpu_from_set(dpu)));
 			}
 			/*
-			if (rank_id != prev_rank_id) {
-				printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
-				prev_rank_id = rank_id;
-			}
-			*/
+			   if (rank_id != prev_rank_id) {
+			   printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+			   prev_rank_id = rank_id;
+			   }
+			 */
 		}
 
 		// Compute host solution
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 2, 0);
 		}
-		result_host = binarySearch(input, querys, input_size - 1, num_querys);
-		if(rep >= p.n_warmup) {
+		result_host =
+		    binarySearch(input, querys, input_size - 1, num_querys);
+		if (rep >= p.n_warmup) {
 			stop(&timer, 2);
 		}
 
@@ -192,103 +214,110 @@ int main(int argc, char **argv) {
 			start(&timer, 3, 0);
 		}
 
-		DPU_FOREACH(dpu_set, dpu, i)
-		{
+		DPU_FOREACH(dpu_set, dpu, i) {
 			DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments));
 		}
 
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+			    sizeof(input_arguments), DPU_XFER_DEFAULT));
 
 		i = 0;
 
-		DPU_FOREACH(dpu_set, dpu, i)
-		{
+		DPU_FOREACH(dpu_set, dpu, i) {
 			DPU_ASSERT(dpu_prepare_xfer(dpu, input));
 		}
 
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size * sizeof(DTYPE), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, 0,
+			    input_size * sizeof(DTYPE), DPU_XFER_DEFAULT));
 
 		i = 0;
 
-		DPU_FOREACH(dpu_set, dpu, i)
-		{
-			DPU_ASSERT(dpu_prepare_xfer(dpu, querys + slice_per_dpu * i));
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, querys + slice_per_dpu * i));
 		}
 
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size * sizeof(DTYPE), slice_per_dpu * sizeof(DTYPE), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    input_size * sizeof(DTYPE),
+			    slice_per_dpu * sizeof(DTYPE), DPU_XFER_DEFAULT));
 
 		if (rep >= p.n_warmup) {
 			stop(&timer, 3);
 		}
-
 		// Run kernel on DPUs
-		if (rep >= p.n_warmup)
-		{
+		if (rep >= p.n_warmup) {
 			start(&timer, 4, 0);
-			#if ENERGY
+#if ENERGY
 			DPU_ASSERT(dpu_probe_start(&probe));
-			#endif
+#endif
 		}
 
 		DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
 
-		if (rep >= p.n_warmup)
-		{
+		if (rep >= p.n_warmup) {
 			stop(&timer, 4);
-			#if ENERGY
+#if ENERGY
 			DPU_ASSERT(dpu_probe_stop(&probe));
-			#endif
+#endif
 		}
 		// Print logs if required
-		#if PRINT
+#if PRINT
 		unsigned int each_dpu = 0;
 		printf("Display DPU Logs\n");
-		DPU_FOREACH(dpu_set, dpu)
-		{
+		DPU_FOREACH(dpu_set, dpu) {
 			printf("DPU#%d:\n", each_dpu);
 			DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
 			each_dpu++;
 		}
-		#endif
+#endif
 
 		// Retrieve results
-		dpu_results_t* results_retrieve[NR_DPUS];
+		dpu_results_t *results_retrieve[NR_DPUS];
 		if (rep >= p.n_warmup) {
 			start(&timer, 5, 0);
 		}
 		i = 0;
-		DPU_FOREACH(dpu_set, dpu, i)
-		{
-			results_retrieve[i] = (dpu_results_t*)malloc(NR_TASKLETS * sizeof(dpu_results_t));
+		DPU_FOREACH(dpu_set, dpu, i) {
+			results_retrieve[i] =
+			    (dpu_results_t *) malloc(NR_TASKLETS *
+						     sizeof(dpu_results_t));
 			DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i]));
 		}
 
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_results_t), DPU_XFER_DEFAULT));
-
-		DPU_FOREACH(dpu_set, dpu, i)
-		{
-			for(unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++)
-			{
-				if(results_retrieve[i][each_tasklet].found > result_dpu)
-				{
-					result_dpu = results_retrieve[i][each_tasklet].found;
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0,
+			    NR_TASKLETS * sizeof(dpu_results_t),
+			    DPU_XFER_DEFAULT));
+
+		DPU_FOREACH(dpu_set, dpu, i) {
+			for (unsigned int each_tasklet = 0;
+			     each_tasklet < NR_TASKLETS; each_tasklet++) {
+				if (results_retrieve[i][each_tasklet].found >
+				    result_dpu) {
+					result_dpu =
+					    results_retrieve[i][each_tasklet].
+					    found;
 				}
 			}
 			free(results_retrieve[i]);
 		}
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 5);
 		}
-
 #if WITH_ALLOC_OVERHEAD
 #if WITH_FREE_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 6, 0);
 		}
 #endif
 		DPU_ASSERT(dpu_free(dpu_set));
 #if WITH_FREE_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 6);
 		}
 #endif
@@ -296,58 +325,91 @@ int main(int argc, char **argv) {
 
 		int status = (result_dpu == result_host);
 		if (status) {
-			printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n");
+			printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+			       "] results are equal\n");
 			if (rep >= p.n_warmup) {
-				printf("[::] BS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu",
-					NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(DTYPE), BLOCK_SIZE, input_size);
-				printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
-					WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
-				printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
-					timer.time[0],
-					timer.time[1],
-					timer.time[2],
-					timer.time[3],
-					timer.time[4],
-					timer.time[5],
-					timer.time[6]);
-				printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-					num_querys * sizeof(DTYPE) / timer.time[2],
-					num_querys * sizeof(DTYPE) / (timer.time[4]),
-					num_querys * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-				printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-					num_querys * sizeof(DTYPE) / (timer.time[3] + timer.time[4] + timer.time[5]),
-					num_querys * sizeof(DTYPE) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-					num_querys * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-				printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-					num_querys / timer.time[2],
-					num_querys / (timer.time[4]),
-					num_querys / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-				printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
-					num_querys / (timer.time[3] + timer.time[4] + timer.time[5]),
-					num_querys / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-					num_querys / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
+				dfatool_printf
+				    ("[::] BS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu",
+				     NR_DPUS, nr_of_ranks, NR_TASKLETS,
+				     XSTR(DTYPE), BLOCK_SIZE, input_size);
+				dfatool_printf
+				    (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+				     WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+				     WITH_FREE_OVERHEAD, numa_node_rank);
+				dfatool_printf
+				    ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+				     timer.time[0], timer.time[1],
+				     timer.time[2], timer.time[3],
+				     timer.time[4], timer.time[5],
+				     timer.time[6]);
+				dfatool_printf
+				    (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+				     num_querys * sizeof(DTYPE) / timer.time[2],
+				     num_querys * sizeof(DTYPE) /
+				     (timer.time[4]),
+				     num_querys * sizeof(DTYPE) /
+				     (timer.time[0] + timer.time[1] +
+				      timer.time[3] + timer.time[4] +
+				      timer.time[5] + timer.time[6]));
+				dfatool_printf
+				    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+				     num_querys * sizeof(DTYPE) /
+				     (timer.time[3] + timer.time[4] +
+				      timer.time[5]),
+				     num_querys * sizeof(DTYPE) /
+				     (timer.time[1] + timer.time[3] +
+				      timer.time[4] + timer.time[5]),
+				     num_querys * sizeof(DTYPE) /
+				     (timer.time[0] + timer.time[1] +
+				      timer.time[3] + timer.time[4] +
+				      timer.time[5]));
+				dfatool_printf
+				    (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+				     num_querys / timer.time[2],
+				     num_querys / (timer.time[4]),
+				     num_querys / (timer.time[0] +
+						   timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5] +
+						   timer.time[6]));
+				dfatool_printf
+				    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+				     num_querys / (timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]),
+				     num_querys / (timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]),
+				     num_querys / (timer.time[0] +
+						   timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]));
 			}
 		} else {
-			printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] results differ!\n");
+			printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+			       "] results differ!\n");
 		}
 	}
 	// Print timing results
 	/*
-	printf("CPU Version Time (ms): ");
-	print(&timer, 0, p.n_reps);
-	printf("CPU-DPU Time (ms): ");
-	print(&timer, 1, p.n_reps);
-	printf("DPU Kernel Time (ms): ");
-	print(&timer, 2, p.n_reps);
-	printf("DPU-CPU Time (ms): ");
-	print(&timer, 3, p.n_reps);
-	*/
-
-	#if ENERGY
+	   printf("CPU Version Time (ms): ");
+	   print(&timer, 0, p.n_reps);
+	   printf("CPU-DPU Time (ms): ");
+	   print(&timer, 1, p.n_reps);
+	   printf("DPU Kernel Time (ms): ");
+	   print(&timer, 2, p.n_reps);
+	   printf("DPU-CPU Time (ms): ");
+	   print(&timer, 3, p.n_reps);
+	 */
+
+#if ENERGY
 	double energy;
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
 	printf("DPU Energy (J): %f\t", energy * num_iterations);
-	#endif
+#endif
 
 	free(input);
 #if !WITH_ALLOC_OVERHEAD
diff --git a/BS/support/common.h b/BS/include/common.h
index dbd050c..d0b2865 100755..100644
--- a/BS/support/common.h
+++ b/BS/include/common.h
@@ -27,18 +27,20 @@
 #define INPUT_SIZE 2048576
 #endif
 
+enum kernel {
+	kernel1 = 0,
+	nr_kernels = 1,
+};
+
 typedef struct {
 	uint64_t input_size;
 	uint64_t slice_per_dpu;
-	enum kernels {
-		kernel1 = 0,
-		nr_kernels = 1,
-	} kernel;
+	enum kernel kernel;
 } dpu_arguments_t;
 
 // Structures used by both the host and the dpu to communicate information
 typedef struct {
-    DTYPE found;
+	DTYPE found;
 } dpu_results_t;
 
 #ifndef ENERGY
diff --git a/BS/include/dfatool_host.ah b/BS/include/dfatool_host.ah
new file mode 100644
index 0000000..19019a5
--- /dev/null
+++ b/BS/include/dfatool_host.ah
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+/*
+ * BS-specific timing aspect: records the benchmark's problem size and
+ * prints a marker line after argument parsing, after the host-side
+ * reference search, and at the end of main().
+ */
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+
+	unsigned long n_elements, n_queries;
+	unsigned int element_size;
+
+	virtual int getKernel() { return 1; }
+
+	DfatoolHostTiming() {
+		// Input elements and queries are DTYPE values (see the
+		// sizeof(DTYPE) transfer sizes in host/app.c), not uint32_t.
+		element_size = sizeof(DTYPE);
+	}
+
+	// Capture the problem size once the command line has been parsed.
+	advice call("% input_params(...)"): after() {
+		Params* p = tjp->result();
+		n_elements = INPUT_SIZE;
+		n_queries = p->num_querys;
+		printf("[>>] BS | n_dpus=%u n_elements=%lu n_queries=%lu\n", NR_DPUS, n_elements, n_queries);
+	}
+
+	// Marker emitted after each host-side reference search.
+	advice call("% binarySearch(...)") : after() {
+		printf("[--] BS | n_dpus=%u n_elements=%lu n_queries=%lu\n", NR_DPUS, n_elements, n_queries);
+	}
+
+	// Marker emitted once main() has finished executing.
+	advice execution("% main(...)") : after() {
+		printf("[<<] BS | n_dpus=%u n_elements=%lu n_queries=%lu\n", NR_DPUS, n_elements, n_queries);
+	}
+};
diff --git a/BS/include/params.h b/BS/include/params.h
new file mode 100644
index 0000000..f970eda
--- /dev/null
+++ b/BS/include/params.h
@@ -0,0 +1,69 @@
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+
+/* Command-line parameters of the host application. */
+typedef struct Params {
+	long num_querys;	/* -i: number of queries */
+	unsigned n_warmup;	/* -w: untimed warmup iterations */
+	unsigned n_reps;	/* -e: timed repetition iterations */
+} Params;
+
+/* Print the command-line help text to stderr. */
+void usage()
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -i <I>    problem size (default=%d queries)" "\n", PROBLEM_SIZE);
+}
+
+/*
+ * Parse the command line into a Params struct.
+ *
+ * Defaults: num_querys = PROBLEM_SIZE, n_warmup = 1, n_reps = 3.
+ * -h prints the usage text and exits; an unrecognized option prints an
+ * error plus the usage text and exits as well (both with status 0).
+ */
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.num_querys = PROBLEM_SIZE;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+
+	int opt;
+	/* -h is a plain flag: no ':' after it, so it takes no argument. */
+	while ((opt = getopt(argc, argv, "hi:w:e:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'i':
+			p.num_querys = atol(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);
+		}
+	}
+	assert(NR_DPUS > 0 && "Invalid # of dpus!");
+
+	return p;
+}
+#endif
diff --git a/BS/include/timer.h b/BS/include/timer.h
new file mode 100644
index 0000000..7b80823
--- /dev/null
+++ b/BS/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#define N_TIMERS 7
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/BS/run-fgbs24a.sh b/BS/run-fgbs24a.sh
deleted file mode 100755
index 06f8766..0000000
--- a/BS/run-fgbs24a.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-set -e
-
-mkdir -p $(hostname)
-
-ts=$(date +%Y%m%d)
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks BS (dfatool fgbs24a edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 2304 2048 2543; do
-	for nr_tasklets in 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
-			timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i 16777216 || true
-		fi
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1; then
-			timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i 16777216 || true
-		fi
-	done
-done
-echo "Completed at $(date)"
-) | tee "$(hostname)/${ts}-fgbs24a.txt"
diff --git a/BS/run-paper-strong-full.sh b/BS/run-paper-strong-full.sh
deleted file mode 100755
index a6129aa..0000000
--- a/BS/run-paper-strong-full.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks BS strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >2048 are not part of uptsream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i 16777216 || true
-		fi
-	done
-done
-) | tee log-paper-strong-full.txt
diff --git a/BS/run-paper-strong-rank.sh b/BS/run-paper-strong-rank.sh
deleted file mode 100755
index c2d4f36..0000000
--- a/BS/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks BS strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i 262144 || true
-		fi
-	done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/BS/run-paper-weak.sh b/BS/run-paper-weak.sh
deleted file mode 100755
index a27c547..0000000
--- a/BS/run-paper-weak.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-# ... so the weak rank script might be bogus
-
-(
-
-echo "prim-benchmarks BS weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		# original Makefile sets PROBLEM_SIZE=2, for some reason.
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1 PROBLEM_SIZE=2; then
-			i=$(( nr_dpus * 262144 ))
-			timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i $i || true
-		fi
-	done
-done
-) | tee log-paper-weak.txt
diff --git a/BS/run.sh b/BS/run.sh
deleted file mode 100755
index 0c67c93..0000000
--- a/BS/run.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks BS (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for i in 262144 16777216; do
-	for nr_dpus in 1 4 8 16 32 64 128 256 512 768 1024 1536 2048 2304 2542; do
-		for nr_tasklets in 8 12 16; do
-			echo
-			if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
-				timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i $i || true
-			fi
-		done
-	done
-done
-echo "Completed at $(date)"
-) | tee "log-$(hostname)-ndpus.txt"
diff --git a/BS/support/params.h b/BS/support/params.h
deleted file mode 100644
index 02bd750..0000000
--- a/BS/support/params.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-
-typedef struct Params {
-  long  num_querys;
-  unsigned   n_warmup;
-  unsigned   n_reps;
-}Params;
-
-void usage() {
-  fprintf(stderr,
-    "\nUsage:  ./program [options]"
-    "\n"
-    "\nGeneral options:"
-    "\n    -h        help"
-    "\n    -w <W>    # of untimed warmup iterations (default=1)"
-    "\n    -e <E>    # of timed repetition iterations (default=3)"
-    "\n"
-    "\nBenchmark-specific options:"
-    "\n    -i <I>    problem size (default=2 queries)"
-    "\n");
-  }
-
-  struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.num_querys    = PROBLEM_SIZE;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-
-    int opt;
-    while((opt = getopt(argc, argv, "h:i:w:e:")) >= 0) {
-      switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'i': p.num_querys    = atol(optarg); break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break; 
-	default:
-        	fprintf(stderr, "\nUnrecognized option!\n");
-        	usage();
-        	exit(0);
-      }
-    }
-    assert(NR_DPUS > 0 && "Invalid # of dpus!");
-
-    return p;
-  }
-  #endif
diff --git a/BS/support/timer.h b/BS/support/timer.h
deleted file mode 100755
index ff1ae1b..0000000
--- a/BS/support/timer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[7];
-    struct timeval stopTime[7];
-    double         time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
diff --git a/COUNT/baselines/cpu/Makefile b/COUNT/baselines/cpu/Makefile
index 4608944..ede0498 100644
--- a/COUNT/baselines/cpu/Makefile
+++ b/COUNT/baselines/cpu/Makefile
@@ -1,8 +1,23 @@
-NUMA ?= 0
-FLAGS =
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+numa_memcpy ?= 0
 
-ifeq (${NUMA}, 1)
-	FLAGS += -lnuma
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+	CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+	LDFLAGS += -lnuma
 endif
 
 .PHONY: all
@@ -11,7 +26,7 @@ all: count
 TYPE ?= uint64_t
 
 count: app_baseline.c
-	gcc -Wall -Wextra -pedantic -march=native -O2 -o count -fopenmp -DT=${TYPE} -DNUMA=${NUMA} app_baseline.c ${FLAGS}
+	gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -o count -DT=${TYPE} -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} app_baseline.c -fopenmp ${LDFLAGS}
 
 .PHONY: run
 run: count
@@ -19,4 +34,4 @@ run: count
 
 .PHONY: clean
 clean:
-	rm -f count count_O0 count_O2
+	rm -f count
diff --git a/COUNT/baselines/cpu/app_baseline.c b/COUNT/baselines/cpu/app_baseline.c
index d52257a..13e3f51 100644
--- a/COUNT/baselines/cpu/app_baseline.c
+++ b/COUNT/baselines/cpu/app_baseline.c
@@ -12,13 +12,19 @@
 #include <assert.h>
 #include <stdint.h>
 #include <omp.h>
+
+#if WITH_BENCHMARK
 #include "../../support/timer.h"
+#else
+#define start(...)
+#define stop(...)
+#endif
 
 #if NUMA
 #include <numaif.h>
 #include <numa.h>
 
-void* mp_pages[1];
+void *mp_pages[1];
 int mp_status[1];
 int mp_nodes[1];
 int numa_node_in = -1;
@@ -37,71 +43,70 @@ volatile int total_count;
 
 // Params ---------------------------------------------------------------------
 typedef struct Params {
-    char* dpu_type;
-    int   input_size;
-    int   n_warmup;
-    int   n_reps;
-    int   n_threads;
+	char *dpu_type;
+	int input_size;
+	int n_warmup;
+	int n_reps;
+	int n_threads;
 #if NUMA
-    struct bitmask* bitmask_in;
-    struct bitmask* bitmask_out;
-    int numa_node_cpu;
+	struct bitmask *bitmask_in;
+	struct bitmask *bitmask_out;
+	int numa_node_cpu;
 #endif
-}Params;
+} Params;
 
 struct Params p;
 
 static T *A;
 
-bool pred(const T x){
-  return (x % 2) == 0;
+bool pred(const T x)
+{
+	return (x % 2) == 0;
 }
 
-
-void create_test_file(unsigned int nr_elements) {
-    //srand(0);
+void create_test_file(unsigned int nr_elements)
+{
+	//srand(0);
 
 #if NUMA
-    if (p.bitmask_in) {
-        numa_set_membind(p.bitmask_in);
-        numa_free_nodemask(p.bitmask_in);
-    }
-    A = (T*) numa_alloc(nr_elements * sizeof(T));
+	if (p.bitmask_in) {
+		numa_set_membind(p.bitmask_in);
+		numa_free_nodemask(p.bitmask_in);
+	}
+	A = (T *) numa_alloc(nr_elements * sizeof(T));
 #else
-    A = (T*) malloc(nr_elements * sizeof(T));
+	A = (T *) malloc(nr_elements * sizeof(T));
 #endif
 
 #if NUMA
-    struct bitmask *bitmask_all = numa_allocate_nodemask();
-    numa_bitmask_setall(bitmask_all);
-    numa_set_membind(bitmask_all);
-    numa_free_nodemask(bitmask_all);
+	struct bitmask *bitmask_all = numa_allocate_nodemask();
+	numa_bitmask_setall(bitmask_all);
+	numa_set_membind(bitmask_all);
+	numa_free_nodemask(bitmask_all);
 #endif
 
-    for (unsigned int i = 0; i < nr_elements; i++) {
-        //A[i] = (unsigned int) (rand());
-        A[i] = i+1;
-    }
+	for (unsigned int i = 0; i < nr_elements; i++) {
+		//A[i] = (unsigned int) (rand());
+		A[i] = i + 1;
+	}
 
 #if NUMA
-    mp_pages[0] = A;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(A)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_in = mp_status[0];
-    }
-
-    numa_node_cpu = p.numa_node_cpu;
-    if (numa_node_cpu != -1) {
-        if (numa_run_on_node(numa_node_cpu) == -1) {
-            perror("numa_run_on_node");
-            numa_node_cpu = -1;
-        }
-    }
+	mp_pages[0] = A;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(A)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_in = mp_status[0];
+	}
+
+	numa_node_cpu = p.numa_node_cpu;
+	if (numa_node_cpu != -1) {
+		if (numa_run_on_node(numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
 #endif
 
 }
@@ -109,116 +114,152 @@ void create_test_file(unsigned int nr_elements) {
 /**
 * @brief compute output in the host
 */
-static int count_host(int size, int t) {
-    int count = 0;
-
-    omp_set_num_threads(t);
-    #pragma omp parallel for reduction(+:count)
-    for(int my = 0; my < size; my++) {
-        if(!pred(A[my])) {
-            count++;
-        }
-    }
-    return count;
+static int count_host(int size, int t)
+{
+	int count = 0;
+
+	omp_set_num_threads(t);
+#pragma omp parallel for reduction(+:count)
+	for (int my = 0; my < size; my++) {
+		if (!pred(A[my])) {
+			count++;
+		}
+	}
+	return count;
 }
 
-void usage() {
-    fprintf(stderr,
-        "\nUsage:  ./program [options]"
-        "\n"
-        "\nGeneral options:"
-        "\n    -h        help"
-        "\n    -d <D>    DPU type (default=fsim)"
-        "\n    -t <T>    # of threads (default=8)"
-        "\n    -w <W>    # of untimed warmup iterations (default=2)"
-        "\n    -e <E>    # of timed repetition iterations (default=5)"
-        "\n"
-        "\nBenchmark-specific options:"
-        "\n    -i <I>    input size (default=8M elements)"
-        "\n");
+void usage()
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -d <D>    DPU type (default=fsim)"
+		"\n    -t <T>    # of threads (default=8)"
+		"\n    -w <W>    # of untimed warmup iterations (default=2)"
+		"\n    -e <E>    # of timed repetition iterations (default=5)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -i <I>    input size (default=8M elements)" "\n");
 }
 
-void input_params(int argc, char **argv) {
-    p.input_size    = 16 << 20;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-    p.n_threads     = 5;
+void input_params(int argc, char **argv)
+{
+	p.input_size = 16 << 20;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+	p.n_threads = 5;
 #if NUMA
-    p.bitmask_in     = NULL;
-    p.bitmask_out    = NULL;
-    p.numa_node_cpu = -1;
+	p.bitmask_in = NULL;
+	p.bitmask_out = NULL;
+	p.numa_node_cpu = -1;
 #endif
 
-    int opt;
-    while((opt = getopt(argc, argv, "hi:w:e:t:a:b:c:")) >= 0) {
-        switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'i': p.input_size    = atoi(optarg); break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break;
-        case 't': p.n_threads     = atoi(optarg); break;
+	int opt;
+	while ((opt = getopt(argc, argv, "hi:w:e:t:a:b:c:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'i':
+			p.input_size = atoi(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 't':
+			p.n_threads = atoi(optarg);
+			break;
 #if NUMA
-        case 'a': p.bitmask_in    = numa_parse_nodestring(optarg); break;
-        case 'b': p.bitmask_out   = numa_parse_nodestring(optarg); break;
-        case 'c': p.numa_node_cpu = atoi(optarg); break;
+		case 'a':
+			p.bitmask_in = numa_parse_nodestring(optarg);
+			break;
+		case 'b':
+			p.bitmask_out = numa_parse_nodestring(optarg);
+			break;
+		case 'c':
+			p.numa_node_cpu = atoi(optarg);
+			break;
 #endif
-        default:
-            fprintf(stderr, "\nUnrecognized option!\n");
-            usage();
-            exit(0);
-        }
-    }
-    assert(p.n_threads > 0 && "Invalid # of ranks!");
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);
+		}
+	}
+	assert(p.n_threads > 0 && "Invalid # of ranks!");
 }
 
 /**
 * @brief Main of the Host Application.
 */
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
-    input_params(argc, argv);
+	input_params(argc, argv);
 
-    const unsigned int file_size = p.input_size;
+	const unsigned int file_size = p.input_size;
 
-    // Create an input file with arbitrary data.
-    create_test_file(file_size);
+	// Create an input file with arbitrary data.
+	create_test_file(file_size);
 
-    Timer timer;
+#if WITH_BENCHMARK
+	Timer timer;
+#endif
+
+#if NOP_SYNC
+	for (int rep = 0; rep < 200000; rep++) {
+		asm volatile ("nop"::);
+	}
+#endif
 
-    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
-        start(&timer, 0, 0);
-        total_count = count_host(file_size, p.n_threads);
-        stop(&timer, 0);
+	for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+		start(&timer, 0, 0);
+		total_count = count_host(file_size, p.n_threads);
+		stop(&timer, 0);
 
-        unsigned int nr_threads = 0;
+#if WITH_BENCHMARK
+		unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-        nr_threads++;
+		nr_threads++;
 
-        if (rep >= p.n_warmup) {
-            printf("[::] COUNT-CPU | n_threads=%d e_type=%s n_elements=%d"
+		if (rep >= p.n_warmup) {
+			printf
+			    ("[::] COUNT-CPU | n_threads=%d e_type=%s n_elements=%d"
 #if NUMA
-                " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+			     " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
 #endif
-                " | throughput_MBps=%f",
-                nr_threads, XSTR(T), file_size,
+			     " | throughput_MBps=%f",
+			     nr_threads, XSTR(T), file_size,
 #if NUMA
-                numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+			     numa_node_in, numa_node_out, numa_node_cpu,
+			     numa_distance(numa_node_in, numa_node_cpu),
+			     numa_distance(numa_node_cpu, numa_node_out),
+#endif
+			     file_size * 2 * sizeof(T) / timer.time[0]);
+			printf(" throughput_MOpps=%f",
+			       file_size / timer.time[0]);
+			printall(&timer, 0);
+		}
+#endif				// WITH_BENCHMARK
+	}
+
+#if NOP_SYNC
+	for (int rep = 0; rep < 200000; rep++) {
+		asm volatile ("nop"::);
+	}
 #endif
-                file_size * 2 * sizeof(T) / timer.time[0]);
-            printf(" throughput_MOpps=%f",
-                file_size / timer.time[0]);
-            printall(&timer, 0);
-        }
-    }
 
 #if NUMA
-    numa_free(A, file_size * sizeof(T));
+	numa_free(A, file_size * sizeof(T));
 #else
-    free(A);
+	free(A);
 #endif
-    return 0;
+	return 0;
 }
diff --git a/COUNT/dpu/task.c b/COUNT/dpu/task.c
index b2ed79b..8ba6aaf 100644
--- a/COUNT/dpu/task.c
+++ b/COUNT/dpu/task.c
@@ -21,33 +21,36 @@ uint32_t message[NR_TASKLETS];
 uint32_t message_partial_count;
 
 // COUNT in each tasklet
-static unsigned int count(T *input){
-    unsigned int cnt = 0;
-    #pragma unroll
-    for(unsigned int j = 0; j < REGS; j++) {
-        if(!pred(input[j])) {
-            cnt++;
-        }
-    }
-    return cnt;
+static unsigned int count(T *input)
+{
+	unsigned int cnt = 0;
+#pragma unroll
+	for (unsigned int j = 0; j < REGS; j++) {
+		if (!pred(input[j])) {
+			cnt++;
+		}
+	}
+	return cnt;
 }
 
 // Handshake with adjacent tasklets
-static unsigned int handshake_sync(unsigned int l_count, unsigned int tasklet_id){
-    unsigned int p_count;
-    // Wait and read message
-    if(tasklet_id != 0){
-        handshake_wait_for(tasklet_id - 1);
-        p_count = message[tasklet_id];
-    } else {
-        p_count = 0;
-    }
-    // Write message and notify
-    if(tasklet_id < NR_TASKLETS - 1){
-        message[tasklet_id + 1] = p_count + l_count;
-        handshake_notify();
-    }
-    return p_count;
+static unsigned int handshake_sync(unsigned int l_count,
+				   unsigned int tasklet_id)
+{
+	unsigned int p_count;
+	// Wait and read message
+	if (tasklet_id != 0) {
+		handshake_wait_for(tasklet_id - 1);
+		p_count = message[tasklet_id];
+	} else {
+		p_count = 0;
+	}
+	// Write message and notify
+	if (tasklet_id < NR_TASKLETS - 1) {
+		message[tasklet_id + 1] = p_count + l_count;
+		handshake_notify();
+	}
+	return p_count;
 }
 
 // Barrier
@@ -55,63 +58,70 @@ BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 extern int main_kernel1(void);
 
-int (*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
 
-int main(void) { 
-    // Kernel
-    return kernels[DPU_INPUT_ARGUMENTS.kernel](); 
+int main(void)
+{
+	// Kernel
+	return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
 }
 
 // main_kernel1
-int main_kernel1() {
-    unsigned int tasklet_id = me();
+int main_kernel1()
+{
+	unsigned int tasklet_id = me();
 #if PRINT
-    printf("tasklet_id = %u\n", tasklet_id);
+	printf("tasklet_id = %u\n", tasklet_id);
 #endif
-    if (tasklet_id == 0){ // Initialize once the cycle counter
-        mem_reset(); // Reset the heap
-    }
-    // Barrier
-    barrier_wait(&my_barrier);
+	if (tasklet_id == 0) {	// Done once, by tasklet 0 only
+		mem_reset();	// Reset the heap
+	}
+	// Barrier
+	barrier_wait(&my_barrier);
 
-    dpu_results_t *result = &DPU_RESULTS[tasklet_id];
+	dpu_results_t *result = &DPU_RESULTS[tasklet_id];
 
-    uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
+	uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
 
-    // Address of the current processing block in MRAM
-    uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
-    uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
+	// Address of the current processing block in MRAM
+	uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
+	uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
 
-    // Initialize a local cache to store the MRAM block
-    T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
+	// Initialize a local cache to store the MRAM block
+	T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
 
-    // Initialize shared variable
-    if(tasklet_id == NR_TASKLETS - 1)
-        message_partial_count = 0;
-    // Barrier
-    barrier_wait(&my_barrier);
+	// Initialize shared variable
+	if (tasklet_id == NR_TASKLETS - 1)
+		message_partial_count = 0;
+	// Barrier
+	barrier_wait(&my_barrier);
 
-    for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){
+	for (unsigned int byte_index = base_tasklet;
+	     byte_index < input_size_dpu_bytes;
+	     byte_index += BLOCK_SIZE * NR_TASKLETS) {
 
-        // Load cache with current MRAM block
-        mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, BLOCK_SIZE);
+		// Load cache with current MRAM block
+		mram_read((__mram_ptr void const *)(mram_base_addr_A +
+						    byte_index), cache_A,
+			  BLOCK_SIZE);
 
-        // COUNT in each tasklet
-        uint32_t l_count = count(cache_A);
+		// COUNT in each tasklet
+		uint32_t l_count = count(cache_A);
 
-        // Sync with adjacent tasklets
-        uint32_t p_count = handshake_sync(l_count, tasklet_id);
+		// Sync with adjacent tasklets
+		uint32_t p_count = handshake_sync(l_count, tasklet_id);
 
-        // Barrier
-        barrier_wait(&my_barrier);
+		// Barrier
+		barrier_wait(&my_barrier);
 
-        // Total count in this DPU
-        if(tasklet_id == NR_TASKLETS - 1){
-            result->t_count = message_partial_count + p_count + l_count;
-            message_partial_count = result->t_count;
-        }
+		// Total count in this DPU
+		if (tasklet_id == NR_TASKLETS - 1) {
+			result->t_count =
+			    message_partial_count + p_count + l_count;
+			message_partial_count = result->t_count;
+		}
 
-    }
+	}
 
-    return 0;
+	return 0;
 }
diff --git a/COUNT/host/app.c b/COUNT/host/app.c
index 7708f6d..dad674f 100644
--- a/COUNT/host/app.c
+++ b/COUNT/host/app.c
@@ -33,287 +33,350 @@
 #include <dpu_target_macros.h>
 
 // Pointer declaration
-static T* A;
+static T *A;
 
 // Create input arrays
-static void read_input(T* A, unsigned int nr_elements, unsigned int nr_elements_round) {
-    //srand(0);
-    printf("nr_elements\t%u\t", nr_elements);
-    for (unsigned int i = 0; i < nr_elements; i++) {
-        //A[i] = (T) (rand());
-        A[i] = i + 1;
-    }
-    for (unsigned int i = nr_elements; i < nr_elements_round; i++) { // Complete with removable elements
-        A[i] = 0;
-    }
+static void read_input(T *A, unsigned int nr_elements,
+		       unsigned int nr_elements_round)
+{
+	//srand(0);
+	printf("nr_elements\t%u\t", nr_elements);
+	for (unsigned int i = 0; i < nr_elements; i++) {
+		//A[i] = (T) (rand());
+		A[i] = i + 1;
+	}
+	for (unsigned int i = nr_elements; i < nr_elements_round; i++) {	// Complete with removable elements
+		A[i] = 0;
+	}
 }
 
 // Compute output in the host
-static unsigned int count_host(T* A, unsigned int nr_elements) {
-    unsigned int count = 0;
-    for (unsigned int i = 0; i < nr_elements; i++) {
-        if(!pred(A[i])) {
-            count++;
-        }
-    }
-    return count;
+static unsigned int count_host(T *A, unsigned int nr_elements)
+{
+	unsigned int count = 0;
+	for (unsigned int i = 0; i < nr_elements; i++) {
+		if (!pred(A[i])) {
+			count++;
+		}
+	}
+	return count;
 }
 
 // Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
-    struct Params p = input_params(argc, argv);
+	struct Params p = input_params(argc, argv);
 
-    struct dpu_set_t dpu_set, dpu;
-    uint32_t nr_of_dpus;
-    uint32_t nr_of_ranks;
+	struct dpu_set_t dpu_set, dpu;
+	uint32_t nr_of_dpus;
+	uint32_t nr_of_ranks;
 
-    // Timer declaration
-    Timer timer;
+	// Timer declaration
+	Timer timer;
 
-    int numa_node_rank = -2;
+	int numa_node_rank = -2;
 
-    // Allocate DPUs and load binary
+	// Allocate DPUs and load binary
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    timer.time[0] = 0; // alloc
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+	timer.time[TMR_ALLOC] = 0;	// alloc
 #endif
 #if !WITH_LOAD_OVERHEAD
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-    assert(nr_of_dpus == NR_DPUS);
-    timer.time[1] = 0; // load
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+	assert(nr_of_dpus == NR_DPUS);
+	timer.time[TMR_LOAD] = 0;	// load
 #endif
 #if !WITH_FREE_OVERHEAD
-    timer.time[6] = 0; // free
+	timer.time[TMR_FREE] = 0;	// free
 #endif
 
 #if ENERGY
-    struct dpu_probe_t probe;
-    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+	struct dpu_probe_t probe;
+	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
 #endif
 
-    unsigned int i = 0;
-    uint32_t accum = 0;
-    uint32_t total_count = 0;
+	unsigned int i = 0;
+	uint32_t accum = 0;
+	uint32_t total_count = 0;
 
-    const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size; // Total input size (weak or strong scaling)
-    const unsigned int input_size_dpu_ = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
-    const unsigned int input_size_dpu_round = 
-        (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_; // Input size per DPU (max.), 8-byte aligned
+	const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size;	// Total input size (weak or strong scaling)
+	const unsigned int input_size_dpu_ = divceil(input_size, NR_DPUS);	// Input size per DPU (max.)
+	const unsigned int input_size_dpu_round = (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_;	// Input size per DPU (max.), padded to a multiple of NR_TASKLETS * REGS elements
 
-    // Input allocation
-    A = malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
-    T *bufferA = A;
+	// Input allocation
+	A = malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
+	T *bufferA = A;
 
-    dpu_results_t* results_retrieve[NR_DPUS];
-    for (i = 0; i < NR_DPUS; i++) {
-        results_retrieve[i] = (dpu_results_t*)malloc(NR_TASKLETS * sizeof(dpu_results_t));
-    }
+	dpu_results_t *results_retrieve[NR_DPUS];
+	for (i = 0; i < NR_DPUS; i++) {
+		results_retrieve[i] =
+		    (dpu_results_t *) malloc(NR_TASKLETS *
+					     sizeof(dpu_results_t));
+	}
 
-    // Create an input file with arbitrary data
-    read_input(A, input_size, input_size_dpu_round * NR_DPUS);
+	// Create an input file with arbitrary data
+	read_input(A, input_size, input_size_dpu_round * NR_DPUS);
 
-    printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);
+	printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);
 
-    // Loop over main kernel
-    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+	// Loop over main kernel
+	for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 
 #if WITH_ALLOC_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 0, 0);
-        }
-        DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 0);
-        }
+		if (rep >= p.n_warmup) {
+			start(&timer, TMR_ALLOC, 0);
+		}
+		DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+		if (rep >= p.n_warmup) {
+			stop(&timer, TMR_ALLOC);
+		}
 #endif
 #if WITH_LOAD_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 1, 0);
-        }
-        DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 1);
-        }
-        DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-        DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-        assert(nr_of_dpus == NR_DPUS);
+		if (rep >= p.n_warmup) {
+			start(&timer, TMR_LOAD, 0);
+		}
+		DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+		if (rep >= p.n_warmup) {
+			stop(&timer, TMR_LOAD);
+		}
+		DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+		DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+		assert(nr_of_dpus == NR_DPUS);
 #endif
 
-        // int prev_rank_id = -1;
-        int rank_id = -1;
-        DPU_FOREACH (dpu_set, dpu) {
-            rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
-            if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
-                numa_node_rank = -1;
-            } else {
-                numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
-            }
-            /*
-            if (rank_id != prev_rank_id) {
-                printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
-                prev_rank_id = rank_id;
-            }
-            */
-        }
-
-        // Compute output on CPU (performance comparison and verification purposes)
-        if(rep >= p.n_warmup)
-            start(&timer, 2, 0);
-        total_count = count_host(A, input_size);
-        if(rep >= p.n_warmup)
-            stop(&timer, 2);
-
-        printf("Load input data\n");
-        if(rep >= p.n_warmup)
-            start(&timer, 3, 0);
-        // Input arguments
-        const unsigned int input_size_dpu = input_size_dpu_round;
-        unsigned int kernel = 0;
-        dpu_arguments_t input_arguments = {input_size_dpu * sizeof(T), kernel};
-        // Copy input arrays
-        i = 0;
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT));
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu * i));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu * sizeof(T), DPU_XFER_DEFAULT));
-        if(rep >= p.n_warmup)
-            stop(&timer, 3);
-
-        printf("Run program on DPU(s) \n");
-        // Run DPU kernel
-        if(rep >= p.n_warmup) {
-            start(&timer, 4, 0);
-            #if ENERGY
-            DPU_ASSERT(dpu_probe_start(&probe));
-            #endif
-        }
-        DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 4);
-            #if ENERGY
-            DPU_ASSERT(dpu_probe_stop(&probe));
-            #endif
-        }
-
+		// int prev_rank_id = -1;
+		int rank_id = -1;
+		DPU_FOREACH(dpu_set, dpu) {
+			rank_id =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
+			if ((numa_node_rank != -2)
+			    && numa_node_rank !=
+			    dpu_get_rank_numa_node(dpu_get_rank
+						   (dpu_from_set(dpu)))) {
+				numa_node_rank = -1;
+			} else {
+				numa_node_rank =
+				    dpu_get_rank_numa_node(dpu_get_rank
+							   (dpu_from_set(dpu)));
+			}
+			/*
+			   if (rank_id != prev_rank_id) {
+			   printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+			   prev_rank_id = rank_id;
+			   }
+			 */
+		}
+
+		// Compute output on CPU (performance comparison and verification purposes)
+		if (rep >= p.n_warmup)
+			start(&timer, TMR_CPU, 0);
+		total_count = count_host(A, input_size);
+		if (rep >= p.n_warmup)
+			stop(&timer, TMR_CPU);
+
+		printf("Load input data\n");
+		if (rep >= p.n_warmup)
+			start(&timer, TMR_WRITE, 0);
+		// Input arguments
+		const unsigned int input_size_dpu = input_size_dpu_round;
+		unsigned int kernel = 0;
+		dpu_arguments_t input_arguments =
+		    { input_size_dpu * sizeof(T), kernel };
+		// Copy input arrays
+		i = 0;
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+			    sizeof(input_arguments), DPU_XFER_DEFAULT));
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferA + input_size_dpu * i));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, 0,
+			    input_size_dpu * sizeof(T), DPU_XFER_DEFAULT));
+		if (rep >= p.n_warmup)
+			stop(&timer, TMR_WRITE);
+
+		printf("Run program on DPU(s) \n");
+		// Run DPU kernel
+		if (rep >= p.n_warmup) {
+			start(&timer, TMR_KERNEL, 0);
+#if ENERGY
+			DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+		}
+		DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+		if (rep >= p.n_warmup) {
+			stop(&timer, TMR_KERNEL);
+#if ENERGY
+			DPU_ASSERT(dpu_probe_stop(&probe));
+#endif
+		}
 #if PRINT
-        {
-            unsigned int each_dpu = 0;
-            printf("Display DPU Logs\n");
-            DPU_FOREACH (dpu_set, dpu) {
-                printf("DPU#%d:\n", each_dpu);
-                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
-                each_dpu++;
-            }
-        }
+		{
+			unsigned int each_dpu = 0;
+			printf("Display DPU Logs\n");
+			DPU_FOREACH(dpu_set, dpu) {
+				printf("DPU#%d:\n", each_dpu);
+				DPU_ASSERT(dpulog_read_for_dpu
+					   (dpu.dpu, stdout));
+				each_dpu++;
+			}
+		}
 #endif
 
-        printf("Retrieve results\n");
-        dpu_results_t results[NR_DPUS];
-        i = 0;
-        accum = 0;
-
-        if(rep >= p.n_warmup)
-		    start(&timer, 5, 0);
-        // PARALLEL RETRIEVE TRANSFER
-
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i]));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_results_t), DPU_XFER_DEFAULT));
-
-        DPU_FOREACH(dpu_set, dpu, i) {
-            // Retrieve tasklet timings
-            for (unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++) {
-                // Count of this DPU
-                if(each_tasklet == NR_TASKLETS - 1){
-                    results[i].t_count = results_retrieve[i][each_tasklet].t_count;
-                }
-            }
-            // Sequential scan
-            accum += results[i].t_count;
-        }
-        if(rep >= p.n_warmup)
-            stop(&timer, 5);
-
-        i = 0;
+		printf("Retrieve results\n");
+		dpu_results_t results[NR_DPUS];
+		i = 0;
+		accum = 0;
+
+		if (rep >= p.n_warmup)
+			start(&timer, TMR_READ, 0);
+		// PARALLEL RETRIEVE TRANSFER
+
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i]));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0,
+			    NR_TASKLETS * sizeof(dpu_results_t),
+			    DPU_XFER_DEFAULT));
+
+		DPU_FOREACH(dpu_set, dpu, i) {
+			// Retrieve tasklet timings
+			for (unsigned int each_tasklet = 0;
+			     each_tasklet < NR_TASKLETS; each_tasklet++) {
+				// Count of this DPU
+				if (each_tasklet == NR_TASKLETS - 1) {
+					results[i].t_count =
+					    results_retrieve[i][each_tasklet].
+					    t_count;
+				}
+			}
+			// Sequential scan
+			accum += results[i].t_count;
+		}
+		if (rep >= p.n_warmup)
+			stop(&timer, TMR_READ);
+
+		i = 0;
 
 #if WITH_ALLOC_OVERHEAD
 #if WITH_FREE_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 8, 0);
-        }
+		if (rep >= p.n_warmup) {
+			start(&timer, TMR_FREE, 0);
+		}
 #endif
-        DPU_ASSERT(dpu_free(dpu_set));
+		DPU_ASSERT(dpu_free(dpu_set));
 #if WITH_FREE_OVERHEAD
-        if(rep >= p.n_warmup) {
-            stop(&timer, 8);
-        }
+		if (rep >= p.n_warmup) {
+			stop(&timer, TMR_FREE);
+		}
 #endif
 #endif
 
-        // Check output
-        bool status = true;
-        if(accum != total_count) status = false;
-        if (status) {
-            printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
-            if (rep >= p.n_warmup) {
-                printf("[::] COUNT-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d",
-                    NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size, input_size_dpu_round);
-                printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
-                    WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
-                printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
-                    timer.time[0],
-                    timer.time[1],
-                    timer.time[2],
-                    timer.time[3], // write
-                    timer.time[4], // kernel
-                    timer.time[5], // read
-                    timer.time[8]);
-                printf(" latency_total_us=%f",
-                    timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[8]);
-                printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-                    input_size * sizeof(T) / timer.time[2],
-                    input_size * sizeof(T) / timer.time[4],
-                    input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[8])
-                );
-                printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-                    input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-                printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-                    input_size / timer.time[2],
-                    input_size / timer.time[4],
-                    input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[8])
-                );
-                printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
-                    input_size / (timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-            }
-        } else {
-            printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
-        }
-    }
-
-    #if ENERGY
-    double energy;
-    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
-    printf("DPU Energy (J): %f\t", energy);
-    #endif	
-
-    // Deallocation
-    free(A);
+		// Check output
+		bool status = true;
+		if (accum != total_count)
+			status = false;
+		if (status) {
+			printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+			       "] Outputs are equal\n");
+			if (rep >= p.n_warmup) {
+				printf
+				    ("[::] COUNT-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d",
+				     NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T),
+				     BLOCK_SIZE, input_size,
+				     input_size_dpu_round);
+				printf
+				    (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+				     WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+				     WITH_FREE_OVERHEAD, numa_node_rank);
+				printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", timer.time[0], timer.time[1], timer.time[2], timer.time[3],	// write
+				       timer.time[4],	// kernel
+				       timer.time[5],	// read
+				       timer.time[TMR_FREE]);	// free: same slot start()/stop() use around dpu_free
+				printf(" latency_total_us=%f",
+				       timer.time[0] + timer.time[1] +
+				       timer.time[3] + timer.time[4] +
+				       timer.time[5] + timer.time[TMR_FREE]);
+				printf
+				    (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+				     input_size * sizeof(T) / timer.time[2],
+				     input_size * sizeof(T) / timer.time[4],
+				     input_size * sizeof(T) / (timer.time[0] +
+							       timer.time[1] +
+							       timer.time[3] +
+							       timer.time[4] +
+							       timer.time[5] +
+							       timer.time[TMR_FREE])
+				    );
+				printf
+				    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+				     input_size * sizeof(T) / (timer.time[3] +
+							       timer.time[4] +
+							       timer.time[5]),
+				     input_size * sizeof(T) / (timer.time[1] +
+							       timer.time[3] +
+							       timer.time[4] +
+							       timer.time[5]),
+				     input_size * sizeof(T) / (timer.time[0] +
+							       timer.time[1] +
+							       timer.time[3] +
+							       timer.time[4] +
+							       timer.time[5]));
+				printf
+				    (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+				     input_size / timer.time[2],
+				     input_size / timer.time[4],
+				     input_size / (timer.time[0] +
+						   timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5] +
+						   timer.time[TMR_FREE])
+				    );
+				printf
+				    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+				     input_size / (timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]),
+				     input_size / (timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]),
+				     input_size / (timer.time[0] +
+						   timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]));
+			}
+		} else {
+			printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+			       "] Outputs differ!\n");
+		}
+	}
+
+#if ENERGY
+	double energy;
+	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+	printf("DPU Energy (J): %f\t", energy);
+#endif
+
+	// Deallocation
+	free(A);
 
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_free(dpu_set));
+	DPU_ASSERT(dpu_free(dpu_set));
 #endif
 
-    return 0;
+	return 0;
 }
diff --git a/COUNT/support/common.h b/COUNT/support/common.h
index 72270b0..afd5b2d 100755
--- a/COUNT/support/common.h
+++ b/COUNT/support/common.h
@@ -3,15 +3,15 @@
 
 // Structures used by both the host and the dpu to communicate information 
 typedef struct {
-    uint32_t size;
+	uint32_t size;
 	enum kernels {
-	    kernel1 = 0,
-	    nr_kernels = 1,
+		kernel1 = 0,
+		nr_kernels = 1,
 	} kernel;
 } dpu_arguments_t;
 
 typedef struct {
-    uint32_t t_count;
+	uint32_t t_count;
 } dpu_results_t;
 
 // Transfer size between MRAM and WRAM
@@ -26,11 +26,12 @@ typedef struct {
 
 // Data type
 #define T uint64_t
-#define REGS (BLOCK_SIZE >> 3) // 64 bits
+#define REGS (BLOCK_SIZE >> 3)	// 64 bits
 
 // Sample predicate
-bool pred(const T x){
-  return (x % 2) == 0;
+bool pred(const T x)
+{
+	return (x % 2) == 0;
 }
 
 #ifndef ENERGY
diff --git a/COUNT/support/params.h b/COUNT/support/params.h
index bb86211..dd1505e 100644
--- a/COUNT/support/params.h
+++ b/COUNT/support/params.h
@@ -4,53 +4,62 @@
 #include "common.h"
 
 typedef struct Params {
-    unsigned int   input_size;
-    int   n_warmup;
-    int   n_reps;
-    int  exp;
-}Params;
+	unsigned int input_size;
+	int n_warmup;
+	int n_reps;
+	int exp;
+} Params;
 
-static void usage() {
-    fprintf(stderr,
-        "\nUsage:  ./program [options]"
-        "\n"
-        "\nGeneral options:"
-        "\n    -h        help"
-        "\n    -w <W>    # of untimed warmup iterations (default=1)"
-        "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
-        "\n"
-        "\nBenchmark-specific options:"
-        "\n    -i <I>    input size (default=3932160 elements)"
-        "\n");
+static void usage()
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -i <I>    input size (default=3932160 elements)" "\n");
 }
 
-struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.input_size    = 3932160;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-    p.exp           = 0;
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.input_size = 3932160;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+	p.exp = 0;
 
-    int opt;
-    while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
-        switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'i': p.input_size    = atoi(optarg); break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break;
-        case 'x': p.exp           = atoi(optarg); break;
-        default:
-            fprintf(stderr, "\nUnrecognized option!\n");
-            usage();
-            exit(0);
-        }
-    }
-    assert(NR_DPUS > 0 && "Invalid # of dpus!");
+	int opt;
+	while ((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'i':
+			p.input_size = atoi(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 'x':
+			p.exp = atoi(optarg);
+			break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);
+		}
+	}
+	assert(NR_DPUS > 0 && "Invalid # of dpus!");
 
-    return p;
+	return p;
 }
 #endif
diff --git a/COUNT/support/timer.h b/COUNT/support/timer.h
index 3ec6d87..76fbcff 100755
--- a/COUNT/support/timer.h
+++ b/COUNT/support/timer.h
@@ -1,66 +1,80 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[9];
-    struct timeval stopTime[9];
-    double         time[9];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by:    IMPACT Research Group
+ *                  University of Cordoba and University of Illinois
+ *                  http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ *      > Redistributions of source code must retain the above copyright notice,
+ *        this list of conditions and the following disclaimers.
+ *      > Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimers in the
+ *        documentation and/or other materials provided with the distribution.
+ *      > Neither the names of IMPACT Research Group, University of Cordoba,
+ *        University of Illinois nor the names of its contributors may be used
+ *        to endorse or promote products derived from this Software without
+ *        specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer {
+	struct timeval startTime[7];	/* last start() timestamp, one slot per TMR_* index */
+	struct timeval stopTime[7];	/* last stop() timestamp, one slot per TMR_* index */
+	double time[7];	/* microseconds accumulated by stop(); reset by start() when rep == 0 */
+} Timer;
+
+#define TMR_ALLOC 0	/* slot 0; presumably DPU allocation time - usage not visible in this header */
+#define TMR_LOAD 1	/* slot 1; presumably DPU binary load time - usage not visible in this header */
+#define TMR_CPU 2	/* slot 2; presumably the host (CPU) reference computation - confirm against caller */
+#define TMR_WRITE 3	/* slot 3; presumably host-to-DPU input transfer - confirm against caller */
+#define TMR_KERNEL 4	/* slot 4; presumably DPU kernel execution - confirm against caller */
+#define TMR_READ 5	/* slot 5; presumably DPU-to-host result retrieval - confirm against caller */
+#define TMR_FREE 6	/* slot 6; highest valid index into the 7-element Timer arrays */
+
+void start(Timer *timer, int i, int rep)
+{
+	if (rep == 0) {
+		timer->time[i] = 0.0;
+	}
+	gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i)
+{
+	gettimeofday(&timer->stopTime[i], NULL);
+	timer->time[i] +=
+	    (timer->stopTime[i].tv_sec -
+	     timer->startTime[i].tv_sec) * 1000000.0 +
+	    (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+}
+
+void print(Timer *timer, int i, int REP)
+{
+	printf("Time (ms): %f\t", timer->time[i] / (1000 * REP));
+}
+
+void printall(Timer *timer, int maxt)
+{
+	for (int i = 0; i <= maxt; i++) {
+		printf(" timer%d_us=%f", i, timer->time[i]);
+	}
+	printf("\n");
+}
diff --git a/GEMV/Makefile b/GEMV/Makefile
index 5f766ae..644278e 100644
--- a/GEMV/Makefile
+++ b/GEMV/Makefile
@@ -5,16 +5,31 @@ WITH_ALLOC_OVERHEAD ?= 0
 WITH_LOAD_OVERHEAD ?= 0
 WITH_FREE_OVERHEAD ?= 0
 
-COMMON_INCLUDES := support
 HOST_SOURCES := $(wildcard host/*.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-.PHONY: all clean test
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
 
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD}
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing}
 DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL}
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
 ifdef verbose
@@ -27,7 +42,9 @@ bin:
 	${QUIET}mkdir -p bin
 
-bin/gemv_host: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+bin/gemv_host: ${HOST_SOURCES} $(wildcard include/*.h) | bin # track headers per file; bin only has to exist
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
 
-bin/gemv_dpu: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+bin/gemv_dpu: ${DPU_SOURCES} $(wildcard include/*.h) | bin # track headers per file; bin only has to exist
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
@@ -37,3 +54,5 @@ clean:
 
 test: all
 	bin/gemv_host -m 1024 -n 1024
+
+.PHONY: all clean test
diff --git a/GEMV/baselines/cpu/Makefile b/GEMV/baselines/cpu/Makefile
index 016d561..60c662c 100644
--- a/GEMV/baselines/cpu/Makefile
+++ b/GEMV/baselines/cpu/Makefile
@@ -1,17 +1,24 @@
-NUMA ?= 0
-NUMA_MEMCPY ?= 0
-FLAGS =
+native ?= 1
+numa ?= 0
+numa_memcpy ?= 0
+
+CFLAGS =
+LDFLAGS =
 TYPE ?= double
 
-ifeq (${NUMA}, 1)
-	FLAGS += -lnuma
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+	LDFLAGS += -lnuma
 endif
 
 .PHONY: all
 all: gemv
 
 gemv: gemv_openmp.c
-	gcc -ggdb -Wall -Wextra -pedantic -march=native -O2 -o gemv -fopenmp -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} -DT=${TYPE} -DTYPE_${TYPE} gemv_openmp.c ${FLAGS}
+	gcc -ggdb -Wall -Wextra -pedantic ${CFLAGS} -O3 -o gemv -fopenmp -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DT=${TYPE} -DTYPE_${TYPE} gemv_openmp.c ${LDFLAGS}
 
 gemv_O0: gemv_openmp.c
 	gcc -o gemv_O0 -fopenmp gemv_openmp.c
diff --git a/GEMV/baselines/cpu/gemv_openmp.c b/GEMV/baselines/cpu/gemv_openmp.c
index 21e24cb..99bba55 100644
--- a/GEMV/baselines/cpu/gemv_openmp.c
+++ b/GEMV/baselines/cpu/gemv_openmp.c
@@ -10,10 +10,10 @@
 #include <numaif.h>
 #include <numa.h>
 
-struct bitmask* bitmask_in;
-struct bitmask* bitmask_out;
+struct bitmask *bitmask_in;
+struct bitmask *bitmask_out;
 
-void* mp_pages[1];
+void *mp_pages[1];
 int mp_status[1];
 int mp_nodes[1];
 int numa_node_in = -1;
@@ -22,7 +22,7 @@ int numa_node_cpu = -1;
 #endif
 
 #if NUMA_MEMCPY
-struct bitmask* bitmask_cpu;
+struct bitmask *bitmask_cpu;
 int numa_node_cpu_memcpy = -1;
 int numa_node_local = -1;
 int numa_node_in_is_local = 0;
@@ -35,284 +35,292 @@ int numa_node_in_is_local = 0;
 
 int main(int argc, char *argv[])
 {
-    (void) argc;
+	(void)argc;
 /*  // upstream config:
     const size_t rows = 20480;
     const size_t cols = 8192;
 */
 
-    // DPU config: 163840 -n 4096
-    const size_t rows = 163840;
-    const size_t cols = 4096;
+	// DPU config: 163840 -n 4096
+	const size_t rows = 163840;
+	const size_t cols = 4096;
 
-    T **A, *b, *x;
+	T **A, *b, *x;
 
-    T **A_local, *x_local;
+	T **A_local, *x_local;
 
 #if NUMA
-    bitmask_in    = numa_parse_nodestring(argv[1]);
-    bitmask_out   = numa_parse_nodestring(argv[2]);
-    numa_node_cpu = atoi(argv[3]);
+	bitmask_in = numa_parse_nodestring(argv[1]);
+	bitmask_out = numa_parse_nodestring(argv[2]);
+	numa_node_cpu = atoi(argv[3]);
 #if NUMA_MEMCPY
-    bitmask_cpu   = numa_parse_nodestring(argv[4]);
-    numa_node_cpu_memcpy = atoi(argv[5]);
-#endif // NUMA_MEMCPY
+	bitmask_cpu = numa_parse_nodestring(argv[4]);
+	numa_node_cpu_memcpy = atoi(argv[5]);
+#endif				// NUMA_MEMCPY
 #else
-    (void) argv;
-#endif // NUMA
+	(void)argv;
+#endif				// NUMA
 
 #if NUMA
-    if (bitmask_out) {
-        numa_set_membind(bitmask_out);
-        numa_free_nodemask(bitmask_out);
-    }
-    b = (T*) numa_alloc(sizeof(T)*rows);
+	if (bitmask_out) {
+		numa_set_membind(bitmask_out);
+		numa_free_nodemask(bitmask_out);
+	}
+	b = (T *) numa_alloc(sizeof(T) * rows);
 #else
-    b = (T*) malloc(sizeof(T)*rows);
+	b = (T *) malloc(sizeof(T) * rows);
 #endif
 
 #if NUMA
-    if (bitmask_in) {
-        numa_set_membind(bitmask_in);
-        // no free yet, re-used in allocate_dense
-    }
-    x = (T*) numa_alloc(sizeof(T)*cols);
+	if (bitmask_in) {
+		numa_set_membind(bitmask_in);
+		// no free yet, re-used in allocate_dense
+	}
+	x = (T *) numa_alloc(sizeof(T) * cols);
 #else
-    x = (T*) malloc(sizeof(T)*cols);
+	x = (T *) malloc(sizeof(T) * cols);
 #endif
 
-    allocate_dense(rows, cols, &A);
+	allocate_dense(rows, cols, &A);
 
 #if NUMA
-    if (bitmask_in) {
-        numa_free_nodemask(bitmask_in);
-    }
+	if (bitmask_in) {
+		numa_free_nodemask(bitmask_in);
+	}
 #endif
 
-    make_hilbert_mat(rows,cols, &A);
+	make_hilbert_mat(rows, cols, &A);
 
 #if NUMA
 #if NUMA_MEMCPY
-    if (bitmask_cpu) {
-        numa_set_membind(bitmask_cpu);
-        numa_free_nodemask(bitmask_cpu);
-    }
+	if (bitmask_cpu) {
+		numa_set_membind(bitmask_cpu);
+		numa_free_nodemask(bitmask_cpu);
+	}
 #else
-    struct bitmask *bitmask_all = numa_allocate_nodemask();
-    numa_bitmask_setall(bitmask_all);
-    numa_set_membind(bitmask_all);
-    numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
-#endif // NUMA
+	struct bitmask *bitmask_all = numa_allocate_nodemask();
+	numa_bitmask_setall(bitmask_all);
+	numa_set_membind(bitmask_all);
+	numa_free_nodemask(bitmask_all);
+#endif				// NUMA_MEMCPY
+#endif				// NUMA
 
-    A_local = A;
-    x_local = x;
+	A_local = A;
+	x_local = x;
 
 #if NUMA
-    mp_pages[0] = A;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(A)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages(A) error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_in = mp_status[0];
-    }
-
-    mp_pages[0] = b;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(b)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages(b) error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_out = mp_status[0];
-    }
-
-    if (numa_node_cpu != -1) {
-        if (numa_run_on_node(numa_node_cpu) == -1) {
-            perror("numa_run_on_node");
-            numa_node_cpu = -1;
-        }
-    }
+	mp_pages[0] = A;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(A)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages(A) error: %d", mp_status[0]);
+	} else {
+		numa_node_in = mp_status[0];
+	}
+
+	mp_pages[0] = b;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(b)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages(b) error: %d", mp_status[0]);
+	} else {
+		numa_node_out = mp_status[0];
+	}
+
+	if (numa_node_cpu != -1) {
+		if (numa_run_on_node(numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
 #endif
 
 #if NUMA_MEMCPY
-    numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+	numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+				 || (numa_node_cpu + 8 == numa_node_in)) * 1;
 #endif
 
-    Timer timer;
-    for (int i = 0; i < 20; i++) {
+	Timer timer;
+	for (int i = 0; i < 20; i++) {
 
 #pragma omp parallel
-        {
+		{
 #pragma omp for
-        for (size_t i = 0; i < cols; i++) {
-          x[i] = (T) i+1 ;
-        }
+			for (size_t i = 0; i < cols; i++) {
+				x[i] = (T) i + 1;
+			}
 
 #pragma omp for
-        for (size_t i = 0; i < rows; i++) {
-          b[i] = (T) 0;
-        }
-        }
+			for (size_t i = 0; i < rows; i++) {
+				b[i] = (T) 0;
+			}
+		}
 
 #if NUMA_MEMCPY
-        start(&timer, 1, 0);
-        if (!numa_node_in_is_local) {
-            x_local = (T*) numa_alloc(sizeof(T)*cols);
-            allocate_dense(rows, cols, &A_local);
-        }
-        stop(&timer, 1);
-
-        if (x_local == NULL) {
-            return 1;
-        }
-        if (A_local == NULL) {
-            return 1;
-        }
-
-        if (!numa_node_in_is_local) {
-            if (numa_node_cpu_memcpy != -1) {
-                if (numa_run_on_node(numa_node_cpu_memcpy) == -1) {
-                    perror("numa_run_on_node");
-                    numa_node_cpu_memcpy = -1;
-                }
-            }
-        }
-
-        start(&timer, 2, 0);
-        if (!numa_node_in_is_local) {
-            //for (size_t i=0; i < rows; i++ ) {
-             //   memcpy(A_local[i], A[i], cols * sizeof(T));
-            //}
-            memcpy(*A_local, *A, rows * cols * sizeof(T));
-            memcpy(x_local, x, cols * sizeof(T));
-        } else {
-            A_local = A;
-            x_local = x;
-        }
-        stop(&timer, 2);
-
-        if (numa_node_cpu != -1) {
-            if (numa_run_on_node(numa_node_cpu) == -1) {
-                perror("numa_run_on_node");
-                numa_node_cpu = -1;
-            }
-        }
-
-        mp_pages[0] = A_local;
-        if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-            perror("move_pages(A_local)");
-        }
-        else if (mp_status[0] < 0) {
-            printf("move_pages error: %d", mp_status[0]);
-        }
-        else {
-            numa_node_local = mp_status[0];
-        }
+		start(&timer, 1, 0);
+		if (!numa_node_in_is_local) {
+			x_local = (T *) numa_alloc(sizeof(T) * cols);
+			allocate_dense(rows, cols, &A_local);
+		}
+		stop(&timer, 1);
+
+		if (x_local == NULL) {
+			return 1;
+		}
+		if (A_local == NULL) {
+			return 1;
+		}
+
+		if (!numa_node_in_is_local) {
+			if (numa_node_cpu_memcpy != -1) {
+				if (numa_run_on_node(numa_node_cpu_memcpy) ==
+				    -1) {
+					perror("numa_run_on_node");
+					numa_node_cpu_memcpy = -1;
+				}
+			}
+		}
+
+		start(&timer, 2, 0);
+		if (!numa_node_in_is_local) {
+			//for (size_t i=0; i < rows; i++ ) {
+			//   memcpy(A_local[i], A[i], cols * sizeof(T));
+			//}
+			memcpy(*A_local, *A, rows * cols * sizeof(T));
+			memcpy(x_local, x, cols * sizeof(T));
+		} else {
+			A_local = A;
+			x_local = x;
+		}
+		stop(&timer, 2);
+
+		if (numa_node_cpu != -1) {
+			if (numa_run_on_node(numa_node_cpu) == -1) {
+				perror("numa_run_on_node");
+				numa_node_cpu = -1;
+			}
+		}
+
+		mp_pages[0] = A_local;
+		if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+			perror("move_pages(A_local)");
+		} else if (mp_status[0] < 0) {
+			printf("move_pages error: %d", mp_status[0]);
+		} else {
+			numa_node_local = mp_status[0];
+		}
 #endif
 
-        unsigned int nr_threads = 0;
+		unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-        nr_threads++;
+		nr_threads++;
 
-        start(&timer, 0, 0);
-        gemv(A_local, x_local, rows, cols, &b);
-        stop(&timer, 0);
+		start(&timer, 0, 0);
+		gemv(A_local, x_local, rows, cols, &b);
+		stop(&timer, 0);
 
 #if NUMA_MEMCPY
-        start(&timer, 3, 0);
-        if (!numa_node_in_is_local) {
-            numa_free(x_local, sizeof(T) * cols);
-            numa_free(*A_local, sizeof(T) * rows * cols);
-            numa_free(A_local, sizeof(void*) * rows);
-        }
-        stop(&timer, 3);
+		start(&timer, 3, 0);
+		if (!numa_node_in_is_local) {
+			numa_free(x_local, sizeof(T) * cols);
+			numa_free(*A_local, sizeof(T) * rows * cols);
+			numa_free(A_local, sizeof(void *) * rows);
+		}
+		stop(&timer, 3);
 #endif
 
 #if NUMA_MEMCPY
-        printf("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
-            " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
-            " | throughput_MBps=%f throughput_MOpps=%f",
-            nr_threads, XSTR(T), rows * cols,
-            numa_node_in, numa_node_out, numa_node_cpu, numa_node_local, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
-            rows * cols * sizeof(T) / timer.time[0],
-            rows * cols / timer.time[0]);
-        printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
-            timer.time[0], timer.time[1], timer.time[2], timer.time[3],
-            timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+		printf
+		    ("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
+		     " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+		     " | throughput_MBps=%f throughput_MOpps=%f", nr_threads,
+		     XSTR(T), rows * cols, numa_node_in, numa_node_out,
+		     numa_node_cpu, numa_node_local, numa_node_cpu_memcpy,
+		     numa_distance(numa_node_in, numa_node_cpu),
+		     numa_distance(numa_node_cpu, numa_node_out),
+		     rows * cols * sizeof(T) / timer.time[0],
+		     rows * cols / timer.time[0]);
+		printf
+		    (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+		     timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+		     timer.time[0] + timer.time[1] + timer.time[2] +
+		     timer.time[3]);
 #else
-        printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld"
+		printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld"
 #if NUMA
-            " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+		       " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
 #endif
-            " | throughput_MBps=%f",
-            nr_threads, XSTR(T), rows * cols,
+		       " | throughput_MBps=%f",
+		       nr_threads, XSTR(T), rows * cols,
 #if NUMA
-            numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+		       numa_node_in, numa_node_out, numa_node_cpu,
+		       numa_distance(numa_node_in, numa_node_cpu),
+		       numa_distance(numa_node_cpu, numa_node_out),
 #endif
-            rows * cols * sizeof(T) / timer.time[0]);
-        printf(" throughput_MOpps=%f latency_us=%f\n",
-            rows * cols / timer.time[0], timer.time[0]);
+		       rows * cols * sizeof(T) / timer.time[0]);
+		printf(" throughput_MOpps=%f latency_us=%f\n",
+		       rows * cols / timer.time[0], timer.time[0]);
 #endif
-    }
-
+	}
 
 #if 0
-  print_vec(x, rows);
-  print_mat(A, rows, cols);
-  print_vec(b, rows);
+	print_vec(x, rows);
+	print_mat(A, rows, cols);
+	print_vec(b, rows);
 #endif
 
 #if TYPE_double || TYPE_float
-  printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x,cols), sum_vec(b,rows));
+	printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x, cols),
+	       sum_vec(b, rows));
 #else
-  printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x,cols), sum_vec(b,rows));
+	printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x, cols),
+	       sum_vec(b, rows));
 #endif
 
 #if NUMA
-  numa_free(b, sizeof(T)*rows);
-  numa_free(x, sizeof(T)*cols);
-  numa_free(*A, sizeof(T)*rows*cols);
-  numa_free(A, sizeof(void*)*rows);
+	numa_free(b, sizeof(T) * rows);
+	numa_free(x, sizeof(T) * cols);
+	numa_free(*A, sizeof(T) * rows * cols);
+	numa_free(A, sizeof(void *) * rows);
 #else
-  free(b);
-  free(x);
-  free(*A);
-  free(A);
+	free(b);
+	free(x);
+	free(*A);
+	free(A);
 #endif
 
-  return 0;
+	return 0;
 }
 
-void gemv(T** A, T* x, size_t rows, size_t cols, T** b) {
+void gemv(T **A, T *x, size_t rows, size_t cols, T **b)
+{
 #pragma omp parallel for
-  for (size_t i = 0; i < rows; i ++ )
-  for (size_t j = 0; j < cols; j ++ ) {
-    (*b)[i] = (*b)[i] + A[i][j]*x[j];
-  }
+	for (size_t i = 0; i < rows; i++)
+		for (size_t j = 0; j < cols; j++) {
+			(*b)[i] = (*b)[i] + A[i][j] * x[j];
+		}
 }
 
-void make_hilbert_mat(size_t rows, size_t cols, T*** A) {
+void make_hilbert_mat(size_t rows, size_t cols, T ***A)
+{
 #pragma omp parallel for
-  for (size_t i = 0; i < rows; i++) {
-    for (size_t j = 0; j < cols; j++) {
+	for (size_t i = 0; i < rows; i++) {
+		for (size_t j = 0; j < cols; j++) {
 #if TYPE_double || TYPE_float
-      (*A)[i][j] = 1.0/( (T) i + (T) j + 1.0);
+			(*A)[i][j] = 1.0 / ((T) i + (T) j + 1.0);
 #else
-      (*A)[i][j] = (T)(((i+j)%10));
+			(*A)[i][j] = (T) (((i + j) % 10));
 #endif
-    }
-  }
+		}
+	}
 }
 
-T sum_vec(T* vec, size_t rows) {
-  T sum = 0;
+T sum_vec(T *vec, size_t rows)
+{
+	T sum = 0;
 #pragma omp parallel for reduction(+:sum)
-  for (int i = 0; i < rows; i++) sum = sum + vec[i];
-  return sum;
+	for (size_t i = 0; i < rows; i++)	/* size_t index: same type as rows, as in gemv() */
+		sum = sum + vec[i];
+	return sum;
 }
diff --git a/GEMV/baselines/cpu/run-perf.sh b/GEMV/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..5eae822
--- /dev/null
+++ b/GEMV/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+
+make -B numa=1
+
+OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./gemv 4 4 4
+OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./gemv 4 4 4
diff --git a/GEMV/benchmark-scripts/ccmcc25-sim.sh b/GEMV/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..3f88fcf
--- /dev/null
+++ b/GEMV/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Sweep the GEMV UPMEM benchmark over DPU counts and matrix shapes (UPMEM env in simulator mode).
+mkdir -p log/$(hostname)
+# Rebuild and run one configuration; arguments are name=value pairs turned into locals.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/gemv_host -w 0 -e 5 -n ${nr_cols} -m ${nr_rows} 2>&1
+}
+
+export -f run_benchmark_nmc
+# Log prefix names the SDK version that is sourced below; keep the two in sync.
+fn=log/$(hostname)/ccmcc25-sdk2025.1.0-sim
+
+source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks  GEMV  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 nr_cols={nr_cols} nr_rows={nr_rows} \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+	::: nr_cols 256 512 768 1024 1536 \
+	::: nr_rows 512 768 1024 1536 2048 \
+>> ${fn}.txt
diff --git a/GEMV/benchmark-scripts/ccmcc25.sh b/GEMV/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..591a200
--- /dev/null
+++ b/GEMV/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/gemv_host -w 0 -e 50 -n ${nr_cols} -m ${nr_rows} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+	echo "prim-benchmarks  GEMV  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any nr_cols={nr_cols} nr_rows={nr_rows} \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+		::: nr_cols 2048 4096 8192 \
+		::: nr_rows 40960 81920 163840 \
+	>> ${fn}.txt
+
+done
diff --git a/GEMV/dpu/task.c b/GEMV/dpu/task.c
index 0226437..120f134 100644
--- a/GEMV/dpu/task.c
+++ b/GEMV/dpu/task.c
@@ -10,14 +10,15 @@
 #include <barrier.h>
 #include <seqread.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 #define roundup(n, m) ((n / m) * m + m)
 
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 
 // GEMV
-static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
+static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos)
+{
 	for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) {
 		bufferC[pos] += bufferA[i] * bufferB[i];
 	}
@@ -28,13 +29,14 @@ static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
 BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 // main
-int main() {
+int main()
+{
 	unsigned int tasklet_id = me();
 #if PRINT
 	// printf("tasklet_id = %u\n", tasklet_id);
 #endif
-	if (tasklet_id == 0){ // Initialize once the cycle counter
-		mem_reset(); // Reset the heap
+	if (tasklet_id == 0) {	// Initialize once the cycle counter
+		mem_reset();	// Reset the heap
 	}
 	// Barrier
 	barrier_wait(&my_barrier);
@@ -44,15 +46,15 @@ int main() {
 	uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows;
 	uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows;
 
-	unsigned int element_per_cacheC = 8/sizeof(T);
+	unsigned int element_per_cacheC = 8 / sizeof(T);
 
 	unsigned int nrows = nr_rows;
-	unsigned int rows_per_tasklet; 
+	unsigned int rows_per_tasklet;
 	unsigned int start_row;
 	unsigned int chunks = nrows / (NR_TASKLETS * element_per_cacheC);
-	unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks; 
+	unsigned int dbl_chunks = chunks * element_per_cacheC;	//chunks + chunks; 
 	rows_per_tasklet = dbl_chunks;
-	unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS);
+	unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC);	//(NR_TASKLETS + NR_TASKLETS);
 
 	if ((tasklet_id * element_per_cacheC) < rest_rows)
 		rows_per_tasklet += element_per_cacheC;
@@ -60,22 +62,32 @@ int main() {
 		if ((tasklet_id * element_per_cacheC) >= rest_rows) {
 			// unsigned int hlf_rest_rows = rest_rows >> 1;
 			if ((rest_rows % element_per_cacheC) != 0)
-				start_row = roundup(rest_rows, element_per_cacheC) + tasklet_id * dbl_chunks; 
-				// start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
+				start_row =
+				    roundup(rest_rows,
+					    element_per_cacheC) +
+				    tasklet_id * dbl_chunks;
+			// start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
 			else
-				start_row = rest_rows + tasklet_id * dbl_chunks; 
-				// start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
-		} else 
-			start_row = tasklet_id * (dbl_chunks + element_per_cacheC);
-			// start_row = tasklet_id * (dbl_chunks + 2);
+				start_row = rest_rows + tasklet_id * dbl_chunks;
+			// start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
+		} else
+			start_row =
+			    tasklet_id * (dbl_chunks + element_per_cacheC);
+		// start_row = tasklet_id * (dbl_chunks + 2);
 	} else {
 		start_row = tasklet_id * (dbl_chunks);
 	}
 
 	// Address of the current row in MRAM
-	uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
-	uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T));
-	uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T));
+	uint32_t mram_base_addr_A =
+	    (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
+	uint32_t mram_base_addr_B =
+	    (uint32_t) (DPU_MRAM_HEAP_POINTER +
+			max_rows * n_size_pad * sizeof(T));
+	uint32_t mram_base_addr_C =
+	    (uint32_t) (DPU_MRAM_HEAP_POINTER +
+			max_rows * n_size_pad * sizeof(T) +
+			n_size_pad * sizeof(T) + start_row * sizeof(T));
 	uint32_t mram_temp_addr_A = mram_base_addr_A;
 	uint32_t mram_temp_addr_B = mram_base_addr_B;
 
@@ -87,55 +99,65 @@ int main() {
 
 	int offset = 0;
 
-	#if PRINT
-	printf("id: %d, rows_per_tasklet = %d\n",tasklet_id, rows_per_tasklet);
-	printf("id: %d, start_row = %d\n",tasklet_id, start_row);
-	#endif
+#if PRINT
+	printf("id: %d, rows_per_tasklet = %d\n", tasklet_id, rows_per_tasklet);
+	printf("id: %d, start_row = %d\n", tasklet_id, start_row);
+#endif
 
 	// Iterate over nr_rows
 	// for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) {
-	for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += element_per_cacheC) {
+	for (unsigned int i = start_row; i < start_row + rows_per_tasklet;
+	     i += element_per_cacheC) {
 
-		mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
+		mram_temp_addr_A =
+		    (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
 		mram_temp_addr_B = mram_base_addr_B;
 
 		// cache_C[0] = 0;
 		// cache_C[1] = 0;
 
 		// clear the cache
-		for(unsigned int c = 0; c < element_per_cacheC; c++){
-			cache_C[c] = 0; 
+		for (unsigned int c = 0; c < element_per_cacheC; c++) {
+			cache_C[c] = 0;
 		}
 
 		// for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){
 		// for(unsigned int pos = 0; (pos < element_per_cacheC) && ((i + pos) < (start_row + rows_per_tasklet)); pos++){
 		// for(unsigned int pos = 0; pos < element_per_cacheC && i + pos < nr_rows; pos++){ 
-		for(unsigned int pos = 0; pos < element_per_cacheC; pos++){ 
-			if(i + pos >= nr_rows){
+		for (unsigned int pos = 0; pos < element_per_cacheC; pos++) {
+			if (i + pos >= nr_rows) {
 				// printf("id: %d, nrows: %d, error\n", tasklet_id, nrows);
 				break;
-			} 
+			}
 
 			int n = 0, j;
-			for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T)))
-			{
-
-				mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
-				mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
-				if(offset)
-				{
-
-					for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++)
-					{
+			for (n = 0;
+			     n < (int32_t) (n_size - (BLOCK_SIZE / sizeof(T)));
+			     n += (BLOCK_SIZE / sizeof(T))) {
+
+				mram_read((__mram_ptr void const
+					   *)(mram_temp_addr_A), cache_A,
+					  BLOCK_SIZE);
+				mram_read((__mram_ptr void const
+					   *)(mram_temp_addr_B), cache_B,
+					  BLOCK_SIZE);
+
+				if (offset) {
+
+					for (unsigned int off = 0;
+					     off < (BLOCK_SIZE / sizeof(T)) - 1;
+					     off++) {
 						cache_A[off] = cache_A[off + 1];
 					}
 
-					mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8);
+					mram_read((__mram_ptr void const
+						   *)(mram_temp_addr_A +
+						      BLOCK_SIZE), cache_A_aux,
+						  8);
 
-					cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+					cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+					    cache_A_aux[0];
 				}
-
 				// Compute GEMV
 				gemv(cache_C, cache_A, cache_B, pos);
 
@@ -144,53 +166,55 @@ int main() {
 				mram_temp_addr_B += BLOCK_SIZE;
 			}
 
-			mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
-
+			mram_read((__mram_ptr void const *)(mram_temp_addr_A),
+				  cache_A, BLOCK_SIZE);
 
-			if(offset)
-			{
-				for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++)
-				{
+			if (offset) {
+				for (unsigned int off = 0;
+				     off < (BLOCK_SIZE / sizeof(T)) - 1;
+				     off++) {
 
 					cache_A[off] = cache_A[off + 1];
 				}
 
-				mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8);
+				mram_read((__mram_ptr void const
+					   *)(mram_temp_addr_A + BLOCK_SIZE),
+					  cache_A_aux, 8);
 
-  			       cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+				cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+				    cache_A_aux[0];
 			}
 
+			mram_read((__mram_ptr void const *)(mram_temp_addr_B),
+				  cache_B, BLOCK_SIZE);
 
-			mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
-			for (j = 0; j < (int) (n_size - n); j++) {
+			for (j = 0; j < (int)(n_size - n); j++) {
 				// Compute GEMV
-				if(j >= (int)(BLOCK_SIZE / sizeof(T))){ 
+				if (j >= (int)(BLOCK_SIZE / sizeof(T))) {
 					printf("error\n");
 					break;
 				}
 				cache_C[pos] += cache_A[j] * cache_B[j];
 			}
 
-
-			mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T));
+			mram_temp_addr_A +=
+			    (BLOCK_SIZE -
+			     ((BLOCK_SIZE / sizeof(T)) -
+			      (n_size - n)) * sizeof(T));
 			mram_temp_addr_B = mram_base_addr_B;
 
-			if(mram_temp_addr_A % 8 != 0)
-			{
+			if (mram_temp_addr_A % 8 != 0) {
 				offset = 1;
-			}
-			else
-			{
+			} else {
 				offset = 0;
 			}
 		}
 		// Write cache to current MRAM block
-		mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8);
+		mram_write(cache_C, (__mram_ptr void *)(mram_base_addr_C), 8);
 
 		// Update memory address
 		// mram_base_addr_C += 2 * sizeof(T);
-		mram_base_addr_C += 8; 
+		mram_base_addr_C += 8;
 
 	}
 
diff --git a/GEMV/host/app.c b/GEMV/host/app.c
index ebd0336..9838eb4 100644
--- a/GEMV/host/app.c
+++ b/GEMV/host/app.c
@@ -8,94 +8,110 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
-#include <dpu.h>
-#include <dpu_log.h>
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
 
-#if ENERGY
-#include <dpu_probe.h>
+#if ASPECTC
+extern "C" {
 #endif
 
+#include <dpu.h>
+#include <dpu_log.h>
 #include <dpu_management.h>
 #include <dpu_target_macros.h>
 
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
+#include "common.h"
+#include "timer.h"
+#include "params.h"
 
 // Define the DPU Binary path as DPU_BINARY here
 #ifndef DPU_BINARY
 #define DPU_BINARY "./bin/gemv_dpu"
 #endif
 
-static T* A;
-static T* B;
-static T* C;
-static T* C_dpu;
+static T *A;
+static T *B;
+static T *C;
+static T *C_dpu;
+
+unsigned int kernel = 0;
 
 // Create input arrays
-static void init_data(T* A, T* B, unsigned int m_size, unsigned int n_size) {
+static void init_data(T *A, T *B, unsigned int m_size, unsigned int n_size)
+{
 	srand(0);
 
-	for (unsigned int i = 0; i < m_size * n_size; i++)
-	{
-		A[i] = (unsigned int) (rand()%50);
+	for (unsigned int i = 0; i < m_size * n_size; i++) {
+		A[i] = (unsigned int)(rand() % 50);
 	}
 
-	for (unsigned int i = 0; i < n_size; i++)
-	{
-		B[i] = (unsigned int) (rand()%50);
+	for (unsigned int i = 0; i < n_size; i++) {
+		B[i] = (unsigned int)(rand() % 50);
 	}
 }
 
 // Compute output in the host
-static void gemv_host(T* C, T* A, T* B, unsigned int m_size, unsigned int n_size) {
-	for (unsigned int i = 0; i < m_size; i++)
-	{
+static void gemv_host(T *C, T *A, T *B, unsigned int m_size,
+		      unsigned int n_size)
+{
+	for (unsigned int i = 0; i < m_size; i++) {
 		C[i] = 0;
 	}
 
 	for (unsigned int m = 0; m < m_size; m++) {
-		for (unsigned int n = 0; n < n_size; n++)
-		{
+		for (unsigned int n = 0; n < n_size; n++) {
 			C[m] += A[m * n_size + n] * B[n];
 		}
 	}
 }
 
 // Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
 	struct Params p = input_params(argc, argv);
 
 	struct dpu_set_t dpu_set, dpu;
 	uint32_t nr_of_dpus;
-    uint32_t nr_of_ranks;
+	uint32_t nr_of_ranks;
 
 	// Timer
 	Timer timer;
 
-    int numa_node_rank = -2;
+	int numa_node_rank = -2;
 
-    // Allocate DPUs and load binary
+	// Allocate DPUs and load binary
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    timer.time[0] = 0; // alloc
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+#if DFATOOL_TIMING
+	timer.time[0] = 0;	// alloc
+#endif
 #endif
 #if !WITH_LOAD_OVERHEAD
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-    assert(nr_of_dpus == NR_DPUS);
-    timer.time[1] = 0; // load
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+	assert(nr_of_dpus == NR_DPUS);
+#if DFATOOL_TIMING
+	timer.time[1] = 0;	// load
+#endif
 #endif
 #if !WITH_FREE_OVERHEAD
-    timer.time[8] = 0; // free
+#if DFATOOL_TIMING
+	timer.time[8] = 0;	// free
+#endif
 #endif
 
 #if ENERGY
@@ -108,12 +124,13 @@ int main(int argc, char **argv) {
 	unsigned int n_size = p.n_size;
 
 	// Initialize help data
-	dpu_info = (struct dpu_info_t *) malloc(NR_DPUS * sizeof(struct dpu_info_t));
-	dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(NR_DPUS * sizeof(dpu_arguments_t));
+	dpu_info =
+	    (struct dpu_info_t *)malloc(NR_DPUS * sizeof(struct dpu_info_t));
+	dpu_arguments_t *input_args =
+	    (dpu_arguments_t *) malloc(NR_DPUS * sizeof(dpu_arguments_t));
 	uint32_t max_rows_per_dpu = 0;
 	uint32_t n_size_pad = n_size;
-	if(n_size % 2 == 1)
-	{
+	if (n_size % 2 == 1) {
 		n_size_pad++;
 	}
 
@@ -127,7 +144,10 @@ int main(int argc, char **argv) {
 			rows_per_dpu++;
 		if (rest_rows > 0) {
 			if (i >= rest_rows)
-				prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks;
+				prev_rows_dpu =
+				    rest_rows * (chunks + 1) + (i -
+								rest_rows) *
+				    chunks;
 			else
 				prev_rows_dpu = i * (chunks + 1);
 		} else {
@@ -136,7 +156,7 @@ int main(int argc, char **argv) {
 
 		// Keep max rows for parallel transfers
 		uint32_t rows_per_dpu_pad = rows_per_dpu;
-		if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
+		if (rows_per_dpu_pad % 2 == 1)	// 4-byte elements
 			rows_per_dpu_pad++;
 		if (rows_per_dpu_pad > max_rows_per_dpu)
 			max_rows_per_dpu = rows_per_dpu_pad;
@@ -151,10 +171,10 @@ int main(int argc, char **argv) {
 		input_args[i].nr_rows = rows_per_dpu;
 	}
 
-	A = malloc(max_rows_per_dpu * NR_DPUS * n_size_pad * sizeof(T));
-	B = malloc(n_size_pad * sizeof(T));
-	C = malloc(max_rows_per_dpu * NR_DPUS * sizeof(T));
-	C_dpu = malloc(max_rows_per_dpu * NR_DPUS * sizeof(T));
+	A = (T*)malloc(max_rows_per_dpu * NR_DPUS * n_size_pad * sizeof(T));
+	B = (T*)malloc(n_size_pad * sizeof(T));
+	C = (T*)malloc(max_rows_per_dpu * NR_DPUS * sizeof(T));
+	C_dpu = (T*)malloc(max_rows_per_dpu * NR_DPUS * sizeof(T));
 
 	// Initialize data with arbitrary data
 	init_data(A, B, m_size, n_size);
@@ -163,20 +183,20 @@ int main(int argc, char **argv) {
 	for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 
 #if WITH_ALLOC_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 0, 0);
 		}
 		DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 0);
 		}
 #endif
 #if WITH_LOAD_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 1, 0);
 		}
 		DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 1);
 		}
 		DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
@@ -186,26 +206,33 @@ int main(int argc, char **argv) {
 
 		// int prev_rank_id = -1;
 		int rank_id = -1;
-		DPU_FOREACH (dpu_set, dpu) {
-			rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
-			if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
+		DPU_FOREACH(dpu_set, dpu) {
+			rank_id =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
+			if ((numa_node_rank != -2)
+			    && numa_node_rank !=
+			    dpu_get_rank_numa_node(dpu_get_rank
+						   (dpu_from_set(dpu)))) {
 				numa_node_rank = -1;
 			} else {
-				numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
+				numa_node_rank =
+				    dpu_get_rank_numa_node(dpu_get_rank
+							   (dpu_from_set(dpu)));
 			}
 			/*
-			if (rank_id != prev_rank_id) {
-				printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
-				prev_rank_id = rank_id;
-			}
-			*/
+			   if (rank_id != prev_rank_id) {
+			   printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+			   prev_rank_id = rank_id;
+			   }
+			 */
 		}
 
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 2, 0);
 		}
 		gemv_host(C, A, B, m_size, n_size);
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 2);
 		}
 		if (rep >= p.n_warmup) {
@@ -220,23 +247,30 @@ int main(int argc, char **argv) {
 			DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
 		}
 
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+			    sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
 
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 3);
 		}
 		if (rep >= p.n_warmup) {
 			start(&timer, 6, 0);
 		}
-
 		// Copy input array and vector
 		i = 0;
 		DPU_FOREACH(dpu_set, dpu, i) {
-			DPU_ASSERT(dpu_prepare_xfer(dpu, A + dpu_info[i].prev_rows_dpu * n_size));
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu,
+				    A + dpu_info[i].prev_rows_dpu * n_size));
 		}
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, 0,
+			    max_rows_per_dpu * n_size_pad * sizeof(T),
+			    DPU_XFER_DEFAULT));
 
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 6);
 		}
 		if (rep >= p.n_warmup) {
@@ -246,12 +280,15 @@ int main(int argc, char **argv) {
 		DPU_FOREACH(dpu_set, dpu, i) {
 			DPU_ASSERT(dpu_prepare_xfer(dpu, B));
 		}
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    max_rows_per_dpu * n_size_pad * sizeof(T),
+			    n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
 
 		if (rep >= p.n_warmup) {
 			stop(&timer, 7);
 		}
-
 		// Run kernel on DPUs
 		if (rep >= p.n_warmup) {
 			start(&timer, 4, 0);
@@ -280,89 +317,140 @@ int main(int argc, char **argv) {
 			start(&timer, 5, 0);
 		i = 0;
 		DPU_FOREACH(dpu_set, dpu, i) {
-			DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, C_dpu + i * max_rows_per_dpu));
 		}
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
-		if(rep >= p.n_warmup) {
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_FROM_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    max_rows_per_dpu * n_size_pad * sizeof(T) +
+			    n_size_pad * sizeof(T),
+			    max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
+		if (rep >= p.n_warmup) {
 			stop(&timer, 5);
 		}
 
-
 #if WITH_ALLOC_OVERHEAD
 #if WITH_FREE_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 8, 0);
 		}
 #endif
 		DPU_ASSERT(dpu_free(dpu_set));
 #if WITH_FREE_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 8);
 		}
 #endif
 #endif
 
-
 		// Check output
 		bool status = true;
-		unsigned int n,j;
+		unsigned int n, j;
 		i = 0;
 		for (n = 0; n < NR_DPUS; n++) {
 			for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
-				if(C[i] != C_dpu[n * max_rows_per_dpu + j]) {
+				if (C[i] != C_dpu[n * max_rows_per_dpu + j]) {
 					status = false;
 #if PRINT
-		//			printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
+					//                      printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
 #endif
 				}
 				i++;
 			}
 		}
 		if (status) {
-			printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+			printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+			       "] Outputs are equal\n");
 			if (rep >= p.n_warmup) {
-				printf("[::] GEMV-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d",
-					NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, n_size * m_size);
-				printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
-					WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
-				printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
-					timer.time[0],
-					timer.time[1],
-					timer.time[2],
-					timer.time[3] + timer.time[6] + timer.time[7],
-					timer.time[4],
-					timer.time[5],
-					timer.time[8]);
-				printf(" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f",
-						timer.time[3],
-						timer.time[6],
-						timer.time[7]
-					);
-				printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-					n_size * m_size * sizeof(T) / timer.time[2],
-					n_size * m_size * sizeof(T) / (timer.time[4]),
-					n_size * m_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5] + timer.time[8]));
-				printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-					n_size * m_size * sizeof(T) / (timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
-					n_size * m_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
-					n_size * m_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]));
-				printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-					n_size * m_size / timer.time[2],
-					n_size * m_size / (timer.time[4]),
-					n_size * m_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5] + timer.time[8]));
-				printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
-					n_size * m_size / (timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
-					n_size * m_size / (timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
-					n_size * m_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]));
+				dfatool_printf
+				    ("[::] GEMV-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d",
+				     NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T),
+				     BLOCK_SIZE, n_size * m_size);
+				dfatool_printf
+				    (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+				     WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+				     WITH_FREE_OVERHEAD, numa_node_rank);
+				dfatool_printf
+				    ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+				     timer.time[0], timer.time[1],
+				     timer.time[2],
+				     timer.time[3] + timer.time[6] +
+				     timer.time[7], timer.time[4],
+				     timer.time[5], timer.time[8]);
+				dfatool_printf
+				    (" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f",
+				     timer.time[3], timer.time[6], timer.time[7]
+				    );
+				dfatool_printf
+				    (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+				     n_size * m_size * sizeof(T) /
+				     timer.time[2],
+				     n_size * m_size * sizeof(T) /
+				     (timer.time[4]),
+				     n_size * m_size * sizeof(T) /
+				     (timer.time[0] + timer.time[1] +
+				      timer.time[3] + timer.time[6] +
+				      timer.time[7] + timer.time[4] +
+				      timer.time[5] + timer.time[8]));
+				dfatool_printf
+				    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+				     n_size * m_size * sizeof(T) /
+				     (timer.time[3] + timer.time[6] +
+				      timer.time[7] + timer.time[4] +
+				      timer.time[5]),
+				     n_size * m_size * sizeof(T) /
+				     (timer.time[1] + timer.time[3] +
+				      timer.time[6] + timer.time[7] +
+				      timer.time[4] + timer.time[5]),
+				     n_size * m_size * sizeof(T) /
+				     (timer.time[0] + timer.time[1] +
+				      timer.time[3] + timer.time[6] +
+				      timer.time[7] + timer.time[4] +
+				      timer.time[5]));
+				dfatool_printf
+				    (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+				     n_size * m_size / timer.time[2],
+				     n_size * m_size / (timer.time[4]),
+				     n_size * m_size / (timer.time[0] +
+							timer.time[1] +
+							timer.time[3] +
+							timer.time[6] +
+							timer.time[7] +
+							timer.time[4] +
+							timer.time[5] +
+							timer.time[8]));
+				dfatool_printf
+				    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+				     n_size * m_size / (timer.time[3] +
+							timer.time[6] +
+							timer.time[7] +
+							timer.time[4] +
+							timer.time[5]),
+				     n_size * m_size / (timer.time[1] +
+							timer.time[3] +
+							timer.time[6] +
+							timer.time[7] +
+							timer.time[4] +
+							timer.time[5]),
+				     n_size * m_size / (timer.time[0] +
+							timer.time[1] +
+							timer.time[3] +
+							timer.time[6] +
+							timer.time[7] +
+							timer.time[4] +
+							timer.time[5]));
 			}
 		} else {
-			printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
+			printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+			       "] Outputs differ!\n");
 		}
 
 	}
 #if ENERGY
 	double acc_energy, avg_energy, acc_time, avg_time;
-	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+	DPU_ASSERT(dpu_probe_get
+		   (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
@@ -370,15 +458,15 @@ int main(int argc, char **argv) {
 
 	// Print timing results
 	/*
-	printf("CPU Version Time (ms): ");
-	print(&timer, 0, 1);
-	printf("CPU-DPU Time (ms): ");
-	print(&timer, 1, p.n_reps);
-	printf("DPU Kernel Time (ms): ");
-	print(&timer, 2, p.n_reps);
-	printf("DPU-CPU Time (ms): ");
-	print(&timer, 3, p.n_reps);
-	*/
+	   printf("CPU Version Time (ms): ");
+	   print(&timer, 0, 1);
+	   printf("CPU-DPU Time (ms): ");
+	   print(&timer, 1, p.n_reps);
+	   printf("DPU Kernel Time (ms): ");
+	   print(&timer, 2, p.n_reps);
+	   printf("DPU-CPU Time (ms): ");
+	   print(&timer, 3, p.n_reps);
+	 */
 
 #if ENERGY
 	printf("Energy (J): %f J\t", avg_energy);
diff --git a/GEMV/support/common.h b/GEMV/include/common.h
index 0deebcb..47a9628 100755..100644
--- a/GEMV/support/common.h
+++ b/GEMV/include/common.h
@@ -3,17 +3,17 @@
 
 // Structures used by both the host and the dpu to communicate information 
 typedef struct {
-    uint32_t n_size;
-    uint32_t n_size_pad;
-    uint32_t nr_rows;
-    uint32_t max_rows;
+	uint32_t n_size;	// NOTE(review): presumably the column count (cf. Params.n_size) -- confirm
+	uint32_t n_size_pad;	// NOTE(review): presumably n_size after padding for transfers -- confirm
+	uint32_t nr_rows;	// NOTE(review): presumably the rows handled by the receiving DPU -- confirm
+	uint32_t max_rows;	// NOTE(review): presumably the largest per-DPU row count -- confirm
 } dpu_arguments_t;
 
 // Specific information for each DPU
 struct dpu_info_t {
-    uint32_t rows_per_dpu;
-    uint32_t rows_per_dpu_pad;
-    uint32_t prev_rows_dpu;
+	uint32_t rows_per_dpu;	// NOTE(review): presumably the matrix rows assigned to this DPU -- confirm
+	uint32_t rows_per_dpu_pad;	// NOTE(review): presumably rows_per_dpu after padding -- confirm
+	uint32_t prev_rows_dpu;	// NOTE(review): presumably the rows assigned to preceding DPUs -- confirm
 };
 struct dpu_info_t *dpu_info;
 
diff --git a/GEMV/include/dfatool_host.ah b/GEMV/include/dfatool_host.ah
new file mode 100644
index 0000000..84c1dd3
--- /dev/null
+++ b/GEMV/include/dfatool_host.ah
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+// Host-side aspect for GEMV: caches the problem size and prints marker lines after selected calls.
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+	unsigned int n_cols, n_rows;	// filled from Params.n_size / Params.m_size after input_params()
+	unsigned int element_size;	// sizeof(T), set in the constructor
+	// Always returns 1. NOTE(review): presumably overrides a hook of DfatoolHostDPUTiming -- confirm.
+	virtual int getKernel() { return 1; }
+
+	DfatoolHostTiming() {
+		element_size = sizeof(T);
+	}
+	// After every call to input_params(): cache the parsed sizes and print the "[>>]" line.
+	advice call("% input_params(...)") : after() {
+		Params* p = tjp->result();	// points at the Params value the call returned
+		n_cols = p->n_size;
+		n_rows = p->m_size;
+		printf("[>>] GEMV | n_dpus=%u n_cols=%u n_rows=%u\n", NR_DPUS, n_cols, n_rows);
+	}
+	// After every call to gemv_host(): print the "[--]" line.
+	advice call("% gemv_host(...)") : after() {
+		printf("[--] GEMV | n_dpus=%u n_cols=%u n_rows=%u\n", NR_DPUS, n_cols, n_rows);
+	}
+	// After main() has executed: print the "[<<]" line.
+	advice execution("% main(...)") : after() {
+		printf("[<<] GEMV | n_dpus=%u n_cols=%u n_rows=%u\n", NR_DPUS, n_cols, n_rows);
+	}
+};
diff --git a/GEMV/include/params.h b/GEMV/include/params.h
new file mode 100644
index 0000000..c72b0c1
--- /dev/null
+++ b/GEMV/include/params.h
@@ -0,0 +1,77 @@
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include <assert.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "common.h"
+
+// Command-line parameters of the GEMV host program (see usage() for the flags).
+typedef struct Params {
+	unsigned int m_size;	// -m, default 8192
+	unsigned int n_size;	// -n, default 8192
+	unsigned int n_warmup;	// -w, untimed warmup iterations, default 1
+	unsigned int n_reps;	// -e, timed repetitions, default 3
+} Params;
+
+// Print the option summary to stderr.
+static void usage()
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -m <I>    m_size (default=8192 elements)"
+		"\n    -n <I>    n_size (default=8192 elements)" "\n");
+}
+
+// Parse the command line into a Params value, starting from the defaults
+// documented in usage(). "-h" prints the usage text and exits with status 0;
+// an unrecognized option prints it and exits with EXIT_FAILURE so that calling
+// scripts can detect the bad invocation.
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.m_size = 8192;
+	p.n_size = 8192;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+
+	int opt;
+	while ((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'm':
+			p.m_size = atoi(optarg);
+			break;
+		case 'n':
+			p.n_size = atoi(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(EXIT_FAILURE);
+		}
+	}
+	// NR_DPUS comes from the build (-DNR_DPUS=...); this only rejects a bad build setting.
+	assert(NR_DPUS > 0 && "Invalid # of dpus!");
+
+	return p;
+}
+#endif
diff --git a/GEMV/include/timer.h b/GEMV/include/timer.h
new file mode 100644
index 0000000..313151d
--- /dev/null
+++ b/GEMV/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+// N_TIMERS is set only around this include (presumably read by timer_base.h -- confirm); the removed support/timer.h used arrays of 9.
+#define N_TIMERS 9
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/GEMV/run-fgbs24a.sh b/GEMV/run-fgbs24a.sh
deleted file mode 100755
index 4135623..0000000
--- a/GEMV/run-fgbs24a.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-set -e
-
-mkdir -p $(hostname)
-
-ts=$(date +%Y%m%d)
-
-(
-
-echo "prim-benchmarks GEMV (dfatool fgbs24a edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 2304 2048 2543; do
-	for nr_tasklets in 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
-			timeout --foreground -k 1m 30m bin/gemv_host -w 0 -e 100 -m 163840 -n 4096 || true
-		fi
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1; then
-			timeout --foreground -k 1m 30m bin/gemv_host -w 0 -e 100 -m 163840 -n 4096 || true
-		fi
-	done
-done
-echo "Completed at $(date)"
-) | tee "$(hostname)/${ts}-fgbs24a.txt"
diff --git a/GEMV/run-paper-strong-full.sh b/GEMV/run-paper-strong-full.sh
deleted file mode 100755
index 38e6123..0000000
--- a/GEMV/run-paper-strong-full.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks GEMV strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >2048 are not part of upstream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/gemv_host -w 1 -e 100 -m 163840 -n 4096 || true
-		fi
-	done
-done
-) | tee log-paper-strong-full.txt
diff --git a/GEMV/run-paper-strong-rank.sh b/GEMV/run-paper-strong-rank.sh
deleted file mode 100755
index 64f0751..0000000
--- a/GEMV/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks GEMV strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >64 are not part of upstream
-for nr_dpus in 128 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/gemv_host -w 1 -e 100 -m 8192 -n 1024 || true
-		fi
-	done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/GEMV/run-paper-weak.sh b/GEMV/run-paper-weak.sh
deleted file mode 100755
index 0632e71..0000000
--- a/GEMV/run-paper-weak.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks GEMV weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# 256 and 512 are not part of upstream config space
-for nr_dpus in 512 256 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			i=$(( nr_dpus * 1024 ))
-			timeout --foreground -k 1m 30m bin/gemv_host -w 1 -e 100 -m $i -n 2048 || true
-		fi
-	done
-done
-) | tee log-paper-weak.txt
diff --git a/GEMV/run.sh b/GEMV/run.sh
deleted file mode 100755
index 68637dc..0000000
--- a/GEMV/run.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -m: number of rows
-# -n: number of cols
-
-(
-
-echo "prim-benchmarks GEMV (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# run-paper-strong-full: m=163840 n=4096
-# run-paper-strong-rank: m=8192 n=1024
-# run-paper-weak: m=ndpus*1024 n=2048
-for n in 512 1024 2048 4096; do
-	for m in 512 1024 2048 4096 8192 163840; do
-		for nr_dpus in 1 4 8 16 32 64 128 256 512 768 1024 1536 2048; do
-			for nr_tasklets in 8 12 16; do
-				echo
-				if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
-					timeout --foreground -k 1m 30m bin/gemv_host -w 0 -e 100 -m $m -n $n || true
-				fi
-			done
-		done
-	done
-done
-) | tee "log-$(hostname)-ndpus.txt"
diff --git a/GEMV/support/params.h b/GEMV/support/params.h
deleted file mode 100644
index 526c71c..0000000
--- a/GEMV/support/params.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-
-typedef struct Params {
-    unsigned int  m_size;
-    unsigned int  n_size;
-    unsigned int  n_warmup;
-    unsigned int  n_reps;
-}Params;
-
-static void usage() {
-    fprintf(stderr,
-            "\nUsage:  ./program [options]"
-            "\n"
-            "\nGeneral options:"
-            "\n    -h        help"
-            "\n    -w <W>    # of untimed warmup iterations (default=1)"
-            "\n    -e <E>    # of timed repetition iterations (default=3)"
-            "\n"
-            "\nBenchmark-specific options:"
-            "\n    -m <I>    m_size (default=8192 elements)"
-            "\n    -n <I>    n_size (default=8192 elements)"
-            "\n");
-}
-
-struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.m_size        = 8192;
-    p.n_size        = 8192;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-
-    int opt;
-    while((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
-        switch(opt) {
-            case 'h':
-                usage();
-                exit(0);
-                break;
-            case 'm': p.m_size        = atoi(optarg); break;
-            case 'n': p.n_size        = atoi(optarg); break;
-            case 'w': p.n_warmup      = atoi(optarg); break;
-            case 'e': p.n_reps        = atoi(optarg); break;
-            default:
-                      fprintf(stderr, "\nUnrecognized option!\n");
-                      usage();
-                      exit(0);
-        }
-    }
-    assert(NR_DPUS > 0 && "Invalid # of dpus!");
-
-    return p;
-}
-#endif
diff --git a/GEMV/support/timer.h b/GEMV/support/timer.h
deleted file mode 100755
index 99d79f4..0000000
--- a/GEMV/support/timer.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[9];
-    struct timeval stopTime[9];
-    double         time[9];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-    //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-    //                  (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
- 
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
diff --git a/HST-L/Makefile b/HST-L/Makefile
index 1888b0a..45ba86c 100644
--- a/HST-L/Makefile
+++ b/HST-L/Makefile
@@ -3,15 +3,35 @@ NR_TASKLETS ?= 16
 NR_HISTO ?= 1
 BL ?= 10
 ENERGY ?= 0
+WITH_ALLOC_OVERHEAD ?= 0
+WITH_LOAD_OVERHEAD ?= 0
+WITH_FREE_OVERHEAD ?= 0
 
-COMMON_INCLUDES := support
 HOST_SOURCES := $(wildcard host/*.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DNR_HISTO=${NR_HISTO}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+# Host compiler: the value of CC, unless aspectc=1 swaps in the ag++ command line further down.
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DNR_HISTO=${NR_HISTO}
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing}
 DPU_FLAGS := ${COMMON_FLAGS} -O2
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+# No timing aspects requested: fall back to -a0 (NOTE(review): presumably "weave no aspect headers" -- confirm in the ag++ manual).
+ASPECTC_HOST_FLAGS ?= -a0
+# aspectc=1 builds the host through ag++ wrapping clang++; otherwise the host is compiled as C11.
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1) # only verbose=1 un-silences recipes; verbose=0 keeps them quiet
@@ -23,10 +43,16 @@ all: bin/host_code bin/dpu_code
 bin:
 	${QUIET}mkdir -p bin
 
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+# dfatool_host.ah includes the shared ../include/dfatool_host_dpu.ah by its bare
+# name, so a copy is placed in include/ for the duration of the host build.
+# The copy is removed whether or not the compiler succeeds, and the compiler's
+# exit status is what make gets to see.
+bin/host_code: ${HOST_SOURCES} include bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}; \
+		status=$$?; rm -f include/dfatool_host_dpu.ah; exit $$status
 
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+bin/dpu_code: ${DPU_SOURCES} include bin
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
 
 clean:
diff --git a/HST-L/benchmark-scripts/ccmcc25-sim.sh b/HST-L/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..34e841a
--- /dev/null
+++ b/HST-L/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# HST-L parameter sweep on the UPMEM simulator, driven by GNU parallel.
+
+# SDK release whose simulator environment is sourced below; also part of the log file name.
+sdk=2025.1.0
+
+mkdir -p log/$(hostname)
+
+# Build and run one configuration. Each argument is a name=value pair that becomes a local variable.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		dfatool_timing=0 aspectc=1 aspectc_timing=1
+	bin/host_code -w 0 -e 5 -b ${bin_size} -i ${input_size}
+}
+
+# parallel invokes the function by name, hence the export.
+export -f run_benchmark_nmc
+
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+
+source ~/lib/local/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks  HST-L  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} bin_size=256 \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+	::: input_size $((256 * 256)) $((512 * 512)) $((768 * 768)) $((1024 * 1024)) \
+>> ${fn}.txt
diff --git a/HST-L/benchmark-scripts/ccmcc25.sh b/HST-L/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..1c939f1
--- /dev/null
+++ b/HST-L/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# HST-L parameter sweep, repeated for several UPMEM SDK releases and driven by GNU parallel.
+mkdir -p log/$(hostname)
+# Build and run one configuration; each argument is a name=value pair.
+run_benchmark_nmc() {
+	local "$@"  # declare every name=value argument as a local variable
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		dfatool_timing=0 aspectc=1 aspectc_timing=1
+	bin/host_code -w 0 -e 50 -b ${bin_size} -i ${input_size}
+}
+# parallel invokes the function by name, hence the export.
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+	# Log and joblog are kept per host and per SDK release.
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+	echo "prim-benchmarks  HST-L  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} bin_size=256 numa_rank={numa_rank} \
+		::: numa_rank any \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+		::: input_size $((1024 * 1024)) $((1536 * 1024)) $((2048 * 1024)) \
+	>> ${fn}.txt
+
+done
diff --git a/HST-L/dpu/task.c b/HST-L/dpu/task.c
index 356b2f9..26021bd 100644
--- a/HST-L/dpu/task.c
+++ b/HST-L/dpu/task.c
@@ -12,7 +12,7 @@
 #include <atomic_bit.h>
 #include <mutex.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 
diff --git a/HST-L/host/app.c b/HST-L/host/app.c
index b9c07f9..ac7381b 100644
--- a/HST-L/host/app.c
+++ b/HST-L/host/app.c
@@ -8,15 +8,29 @@
 #include <stdbool.h>
 #include <string.h>
 #include <math.h>
+
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
+
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
 
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
+#include "common.h"
+#include "timer.h"
+#include "params.h"
 
 // Define the DPU Binary path as DPU_BINARY here
 #ifndef DPU_BINARY
@@ -26,10 +40,6 @@
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
-#if ENERGY
-#include <dpu_probe.h>
-#endif
-
 // Pointer declaration
 static T* A;
 static unsigned int* histo_host;
@@ -89,17 +99,29 @@ int main(int argc, char **argv) {
     DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
 #endif
 
+    // Timer declaration
+    Timer timer;
+
     // Allocate DPUs and load binary
+#if !WITH_ALLOC_OVERHEAD
     DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+    zero(&timer, 0); // aloc
+#endif
+#if !WITH_LOAD_OVERHEAD
     DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
     DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-    printf("Allocated %d DPU(s)\n", nr_of_dpus);
+    assert(nr_of_dpus == NR_DPUS);
+    zero(&timer, 1); // load
+#endif
+#if !WITH_FREE_OVERHEAD
+    zero(&timer, 6); // free
+#endif
 
     unsigned int i = 0;
     unsigned int input_size; // Size of input image
     unsigned int dpu_s = p.dpu_s;
     if(p.exp == 0)
-        input_size = p.input_size * nr_of_dpus; // Size of input image
+        input_size = p.input_size * NR_DPUS; // Size of input image
     else if(p.exp == 1)
         input_size = p.input_size; // Size of input image
 	else
@@ -107,20 +129,20 @@ int main(int argc, char **argv) {
 
     const unsigned int input_size_8bytes = 
         ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
-    const unsigned int input_size_dpu = divceil(input_size, nr_of_dpus); // Input size per DPU (max.)
+    const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
     const unsigned int input_size_dpu_8bytes = 
         ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
 
     // Input/output allocation
-    A = malloc(input_size_dpu_8bytes * nr_of_dpus * sizeof(T));
+    A = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
     T *bufferA = A;
-    histo_host = malloc(p.bins * sizeof(unsigned int));
-    histo = malloc(nr_of_dpus * p.bins * sizeof(unsigned int));
+    histo_host = (unsigned int*)malloc(p.bins * sizeof(unsigned int));
+    histo = (unsigned int*)malloc(NR_DPUS * p.bins * sizeof(unsigned int));
 
     // Create an input file with arbitrary data
     read_input(A, p);
     if(p.exp == 0){
-        for(unsigned int j = 1; j < nr_of_dpus; j++){
+        for(unsigned int j = 1; j < NR_DPUS; j++){
             memcpy(&A[j * input_size_dpu_8bytes], &A[0], input_size_dpu_8bytes * sizeof(T));
         }
     }
@@ -129,40 +151,59 @@ int main(int argc, char **argv) {
             memcpy(&A[j * p.input_size], &A[0], p.input_size * sizeof(T));
     }
 
-    // Timer declaration
-    Timer timer;
-
-    printf("NR_TASKLETS\t%d\tBL\t%d\tinput_size\t%u\n", NR_TASKLETS, BL, input_size);
-
     // Loop over main kernel
     for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
-        memset(histo_host, 0, p.bins * sizeof(unsigned int));
-        memset(histo, 0, nr_of_dpus * p.bins * sizeof(unsigned int));
 
-        // Compute output on CPU (performance comparison and verification purposes)
-        if(rep >= p.n_warmup)
+#if WITH_ALLOC_OVERHEAD
+        if(rep >= p.n_warmup) {
             start(&timer, 0, 0);
-        histogram_host(histo_host, A, p.bins, p.input_size, 1, nr_of_dpus);
-        if(rep >= p.n_warmup)
+        }
+        DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+        if(rep >= p.n_warmup) {
             stop(&timer, 0);
-
-        printf("Load input data\n");
-        if(rep >= p.n_warmup)
+        }
+#endif
+#if WITH_LOAD_OVERHEAD
+        if(rep >= p.n_warmup) {
             start(&timer, 1, 0);
+        }
+        DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+        if(rep >= p.n_warmup) {
+            stop(&timer, 1);
+        }
+        DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+        assert(nr_of_dpus == NR_DPUS);
+#endif
+
+        memset(histo_host, 0, p.bins * sizeof(unsigned int));
+        memset(histo, 0, NR_DPUS * p.bins * sizeof(unsigned int));
+
+        // Compute output on CPU (performance comparison and verification purposes)
+        if(rep >= p.n_warmup) {
+            start(&timer, 2, 0);
+        }
+        histogram_host(histo_host, A, p.bins, p.input_size, 1, NR_DPUS);
+        if(rep >= p.n_warmup) {
+            stop(&timer, 2);
+        }
+
+        if(rep >= p.n_warmup) {
+            start(&timer, 3, 0);
+        }
         // Input arguments
         unsigned int kernel = 0;
         i = 0;
 	    dpu_arguments_t input_arguments[NR_DPUS];
-	    for(i=0; i<nr_of_dpus-1; i++) {
+	    for(i=0; i<NR_DPUS-1; i++) {
 	        input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); 
 	        input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T); 
 	        input_arguments[i].bins=p.bins;
-	        input_arguments[i].kernel=kernel;
+	        input_arguments[i].kernel = (enum kernels)kernel;
 	    }
-	    input_arguments[nr_of_dpus-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); 
-	    input_arguments[nr_of_dpus-1].transfer_size=input_size_dpu_8bytes * sizeof(T); 
-	    input_arguments[nr_of_dpus-1].bins=p.bins;
-	    input_arguments[nr_of_dpus-1].kernel=kernel;
+	    input_arguments[NR_DPUS-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); 
+	    input_arguments[NR_DPUS-1].transfer_size=input_size_dpu_8bytes * sizeof(T); 
+	    input_arguments[NR_DPUS-1].bins=p.bins;
+	    input_arguments[NR_DPUS-1].kernel = (enum kernels)kernel;
 
         // Copy input arrays
         i = 0;
@@ -174,13 +215,13 @@ int main(int argc, char **argv) {
             DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i));
         }
         DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
-        if(rep >= p.n_warmup)
-            stop(&timer, 1);
+        if(rep >= p.n_warmup) {
+            stop(&timer, 3);
+        }
 
-        printf("Run program on DPU(s) \n");
         // Run DPU kernel
         if(rep >= p.n_warmup) {
-            start(&timer, 2, 0);
+            start(&timer, 4, 0);
             #if ENERGY
             DPU_ASSERT(dpu_probe_start(&probe));
             #endif
@@ -188,7 +229,7 @@ int main(int argc, char **argv) {
 
         DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
         if(rep >= p.n_warmup) {
-            stop(&timer, 2);
+            stop(&timer, 4);
             #if ENERGY
             DPU_ASSERT(dpu_probe_stop(&probe));
             #endif
@@ -206,10 +247,10 @@ int main(int argc, char **argv) {
         }
 #endif
 
-        printf("Retrieve results\n");
         i = 0;
-        if(rep >= p.n_warmup)
-            start(&timer, 3, 0);
+        if(rep >= p.n_warmup) {
+            start(&timer, 5, 0);
+        }
         // PARALLEL RETRIEVE TRANSFER
         DPU_FOREACH(dpu_set, dpu, i) {
             DPU_ASSERT(dpu_prepare_xfer(dpu, histo + p.bins * i));
@@ -217,40 +258,60 @@ int main(int argc, char **argv) {
         DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), p.bins * sizeof(unsigned int), DPU_XFER_DEFAULT));
 
         // Final histogram merging
-        for(i = 1; i < nr_of_dpus; i++){
+        for(i = 1; i < NR_DPUS; i++){
             for(unsigned int j = 0; j < p.bins; j++){
                 histo[j] += histo[j + i * p.bins];
             }
         }
-        if(rep >= p.n_warmup)
-            stop(&timer, 3);
+        if(rep >= p.n_warmup) {
+            stop(&timer, 5);
+        }
+
+#if WITH_ALLOC_OVERHEAD
+#if WITH_FREE_OVERHEAD
+        if(rep >= p.n_warmup) {
+            start(&timer, 6, 0);
+        }
+#endif
+        DPU_ASSERT(dpu_free(dpu_set));
+#if WITH_FREE_OVERHEAD
+        if(rep >= p.n_warmup) {
+            stop(&timer, 6);
+        }
+#endif
+#endif
 
         if (rep >= p.n_warmup) {
-            printf("[::] HST-L NMC | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%u n_bins=%d "
-                "| throughput_cpu_MBps=%f throughput_pim_MBps=%f throughput_MBps=%f",
-                nr_of_dpus, NR_TASKLETS, XSTR(T), input_size, p.bins,
-                input_size * sizeof(T) / timer.time[0],
+            dfatool_printf("[::] HST-L UPMEM | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%u n_bins=%d ",
+                nr_of_dpus, NR_TASKLETS, XSTR(T), input_size, p.bins); // input_size is unsigned int, hence %u
+            dfatool_printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+                timer.time[0],
+                timer.time[1],
+                timer.time[2],
+                timer.time[3],
+                timer.time[4],
+                timer.time[5],
+                timer.time[6]);
+            dfatool_printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
                 input_size * sizeof(T) / timer.time[2],
-                input_size * sizeof(T) / (timer.time[1] + timer.time[2] + timer.time[3]));
-            printf(" throughput_cpu_MOpps=%f throughput_pim_MOpps=%f throughput_MOpps=%f",
-                input_size / timer.time[0],
+                input_size * sizeof(T) / (timer.time[4]),
+                input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
+            dfatool_printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+                input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]),
+                input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
+                input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
+            dfatool_printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
                 input_size / timer.time[2],
-                input_size / (timer.time[1] + timer.time[2] + timer.time[3]));
-            printall(&timer, 3);
+                input_size / (timer.time[4]),
+                input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
+            dfatool_printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+                input_size / (timer.time[3] + timer.time[4] + timer.time[5]),
+                input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
+                input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
         }
 
     }
 
-    // Print timing results
-    printf("CPU ");
-    print(&timer, 0, p.n_reps);
-    printf("CPU-DPU ");
-    print(&timer, 1, p.n_reps);
-    printf("DPU Kernel ");
-    print(&timer, 2, p.n_reps);
-    printf("DPU-CPU ");
-    print(&timer, 3, p.n_reps);
-
     #if ENERGY
     double energy;
     DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
@@ -279,10 +340,10 @@ int main(int argc, char **argv) {
         }
     else
         for (unsigned int j = 0; j < p.bins; j++) {
-            if(nr_of_dpus * histo_host[j] != histo[j]){ 
+            if(NR_DPUS * histo_host[j] != histo[j]){ 
                 status = false;
 #if PRINT
-                printf("%u - %u: %u -- %u\n", j, j, nr_of_dpus * histo_host[j], histo[j]);
+                printf("%u - %u: %u -- %u\n", j, j, NR_DPUS * histo_host[j], histo[j]);
 #endif
             }
         }
@@ -296,7 +357,10 @@ int main(int argc, char **argv) {
     free(A);
     free(histo_host);
     free(histo);
+
+#if !WITH_ALLOC_OVERHEAD
     DPU_ASSERT(dpu_free(dpu_set));
+#endif
 	
     return status ? 0 : -1;
 }
diff --git a/HST-L/support/common.h b/HST-L/include/common.h
index 30df40d..438825e 100755..100644
--- a/HST-L/support/common.h
+++ b/HST-L/include/common.h
@@ -20,15 +20,17 @@
 #define DEPTH 12
 #define ByteSwap16(n) (((((unsigned int)n) << 8) & 0xFF00) | ((((unsigned int)n) >> 8) & 0x00FF))
 
+enum kernels {	// file scope, so host code can cast to `enum kernels` when filling dpu_arguments_t
+	kernel1 = 0,
+	nr_kernels = 1,
+};	// type only: no `kernel` object is defined here; the field lives in dpu_arguments_t
+
 // Structures used by both the host and the dpu to communicate information 
 typedef struct {
     uint32_t size;
     uint32_t transfer_size;
     uint32_t bins;
-	enum kernels {
-	    kernel1 = 0,
-	    nr_kernels = 1,
-	} kernel;
+	enum kernels kernel;
 } dpu_arguments_t;
 
 #ifndef ENERGY
diff --git a/HST-L/include/dfatool_host.ah b/HST-L/include/dfatool_host.ah
new file mode 100644
index 0000000..db4e441
--- /dev/null
+++ b/HST-L/include/dfatool_host.ah
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+// Host-side aspect for HST-L: caches the problem size and prints marker lines after selected calls.
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+	unsigned long n_pixels;	// filled from Params.input_size after input_params()
+	unsigned int n_bins;	// filled from Params.bins after input_params()
+	unsigned int element_size;	// sizeof(T), set in the constructor
+	// Always returns 1. NOTE(review): presumably overrides a hook of DfatoolHostDPUTiming -- confirm.
+	virtual int getKernel() { return 1; }
+
+	DfatoolHostTiming() {
+		element_size = sizeof(T);
+	}
+	// After every call to input_params(): cache the parsed sizes and print the "[>>]" line.
+	advice call("% input_params(...)") : after() {
+		Params* p = tjp->result();	// points at the Params value the call returned
+		n_pixels = p->input_size;
+		n_bins = p->bins;
+		printf("[>>] HST-L | n_dpus=%u n_pixels=%lu n_bins=%u\n", NR_DPUS, n_pixels, n_bins);
+	}
+	// After every call to histogram_host(): print the "[--]" line.
+	advice call("% histogram_host(...)") : after() {
+		printf("[--] HST-L | n_dpus=%u n_pixels=%lu n_bins=%u\n", NR_DPUS, n_pixels, n_bins);
+	}
+	// After main() has executed: print the "[<<]" line.
+	advice execution("% main(...)") : after() {
+		printf("[<<] HST-L | n_dpus=%u n_pixels=%lu n_bins=%u\n", NR_DPUS, n_pixels, n_bins);
+	}
+};
diff --git a/HST-L/support/params.h b/HST-L/include/params.h
index e29449b..d0c3129 100644
--- a/HST-L/support/params.h
+++ b/HST-L/include/params.h
@@ -21,7 +21,7 @@ static void usage() {
         "\n    -h        help"
         "\n    -w <W>    # of untimed warmup iterations (default=1)"
         "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1, 2) scaling (default=0)"
+        "\n    -x <X>    Weak (0) or strong (1, 2) scaling (default=1)"
         "\n"
         "\nBenchmark-specific options:"
         "\n    -i <I>    input size (default=1536*1024 elements)"
@@ -36,7 +36,7 @@ struct Params input_params(int argc, char **argv) {
     p.bins          = 256;
     p.n_warmup      = 1;
     p.n_reps        = 3;
-    p.exp           = 0;
+    p.exp           = 1;
     p.file_name     = "./input/image_VanHateren.iml";
     p.dpu_s         = 64;
 
diff --git a/HST-L/include/timer.h b/HST-L/include/timer.h
new file mode 100644
index 0000000..7b80823
--- /dev/null
+++ b/HST-L/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#define N_TIMERS 7	// NOTE(review): presumably the number of timer slots timer_base.h provides -- confirm
+#include "../../include/timer_base.h"	// timer code from the repository-level include directory; sees N_TIMERS as defined above
+#undef N_TIMERS	// the macro is only meant to configure the include above
diff --git a/HST-L/run-paper-strong-full.sh b/HST-L/run-paper-strong-full.sh
deleted file mode 100755
index 0108d40..0000000
--- a/HST-L/run-paper-strong-full.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks HST-S strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >2048 are not part of upstream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -b 256 -x 2 || true
-		fi
-	done
-done
-) | tee log-paper-strong-full.txt
diff --git a/HST-L/run-paper-strong-rank.sh b/HST-L/run-paper-strong-rank.sh
deleted file mode 100755
index f2f80b1..0000000
--- a/HST-L/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks HST-S strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >64 are not part of upstream config space
-for nr_dpus in 128 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -b 256 -x 1 || true
-		fi
-	done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/HST-L/run-paper-weak.sh b/HST-L/run-paper-weak.sh
deleted file mode 100755
index 3ddd801..0000000
--- a/HST-L/run-paper-weak.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks HST-S weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# upstream does not include >64
-for nr_dpus in 256 512 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -b 256 -x 0 || true
-		fi
-	done
-done
-) | tee log-paper-weak.txt
diff --git a/HST-L/run.sh b/HST-L/run.sh
deleted file mode 100755
index d2a072f..0000000
--- a/HST-L/run.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-for i in 1 
-do
-	for b in 64 128 256 512 1024 2048 4096
-	do
-    	for k in 1 2 4 8 16
-	    do
-	        NR_DPUS=$i NR_TASKLETS=$k BL=10 make all
-		    wait
-            ./bin/host_code -w 2 -e 5 -b ${b} > profile/HSTL_${b}_tl${k}_dpu${i}.txt
-		    wait
-		    make clean
-		    wait
-		done
-	done
-done
diff --git a/HST-L/support/timer.h b/HST-L/support/timer.h
deleted file mode 100755
index 5c00213..0000000
--- a/HST-L/support/timer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[4];
-    struct timeval stopTime[4];
-    double         time[4];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
diff --git a/HST-S/baselines/cpu/app_baseline.c b/HST-S/baselines/cpu/app_baseline.c
index 745e384..bb4e28a 100644
--- a/HST-S/baselines/cpu/app_baseline.c
+++ b/HST-S/baselines/cpu/app_baseline.c
@@ -24,10 +24,10 @@
 #include <numaif.h>
 #include <numa.h>
 
-struct bitmask* bitmask_in;
-struct bitmask* bitmask_out;
+struct bitmask *bitmask_in;
+struct bitmask *bitmask_out;
 
-void* mp_pages[1];
+void *mp_pages[1];
 int mp_status[1];
 int mp_nodes[1];
 int numa_node_in = -1;
@@ -41,7 +41,6 @@ int numa_node_local = -1;
 int numa_node_in_is_local = 0;
 #endif
 
-
 #include "../../support/common.h"
 #include "../../support/timer.h"
 
@@ -49,364 +48,399 @@ int numa_node_in_is_local = 0;
 #define STR(x) #x
 
 // Pointer declaration
-static T* A;
+static T *A;
 static T *A_local;
-static unsigned int* histo_host;
+static unsigned int *histo_host;
 
 typedef struct Params {
-    unsigned int   input_size;
-    unsigned int   bins;
-    int   n_warmup;
-    int   n_reps;
-    const char *file_name;
-    int  exp;
-    int  n_threads;
+	unsigned int input_size;
+	unsigned int bins;
+	int n_warmup;
+	int n_reps;
+	const char *file_name;
+	int exp;
+	int n_threads;
 #if NUMA
-    struct bitmask* bitmask_in;
-    struct bitmask* bitmask_out;
-    int numa_node_cpu;
+	struct bitmask *bitmask_in;
+	struct bitmask *bitmask_out;
+	int numa_node_cpu;
 #endif
 #if NUMA_MEMCPY
-    int numa_node_cpu_memcpy;
-    struct bitmask* bitmask_cpu;
+	int numa_node_cpu_memcpy;
+	struct bitmask *bitmask_cpu;
 #endif
-}Params;
+} Params;
 
 /**
 * @brief creates input arrays
 * @param nr_elements how many elements in input arrays
 */
-static void read_input(T* A, const Params p) {
-
-    char  dctFileName[100];
-    FILE *File = NULL;
-
-    // Open input file
-    unsigned short temp;
-    sprintf(dctFileName, "%s", p.file_name);
-    if((File = fopen(dctFileName, "rb")) != NULL) {
-        for(unsigned int y = 0; y < p.input_size; y++) {
-            if (fread(&temp, sizeof(unsigned short), 1, File) == 1) {
-                A[y] = (unsigned int)ByteSwap16(temp);
-                if(A[y] >= 4096)
-                    A[y] = 4095;
-            } else {
-                //printf("out of bounds read at offset %d -- seeking back to 0\n", y);
-                rewind(File);
-            }
-        }
-        fclose(File);
-    } else {
-        printf("%s does not exist\n", dctFileName);
-        exit(1);
-    }
+static void read_input(T *A, const Params p)
+{
+	// Fill A with p.input_size 16-bit samples from p.file_name (byte-swapped,
+	// clamped to 4095); the file is re-read from the start when it runs out.
+	unsigned short temp;
+	FILE *File = fopen(p.file_name, "rb");
+	if (File == NULL) {
+		// Use the path as given: copying it into a 100-byte buffer could overflow.
+		printf("%s does not exist\n", p.file_name);
+		exit(1);
+	}
+	for (unsigned int y = 0; y < p.input_size; y++) {
+		if (fread(&temp, sizeof(unsigned short), 1, File) != 1) {
+			// Out of data: wrap around and retry so A[y] is never left unset.
+			rewind(File);
+			if (fread(&temp, sizeof(unsigned short), 1, File) != 1) {
+				printf("%s is empty or unreadable\n", p.file_name);
+				exit(1);
+			}
+		}
+		A[y] = (unsigned int)ByteSwap16(temp);
+		if (A[y] >= 4096)
+			A[y] = 4095;
+	}
+	fclose(File);
 }
 
 /**
 * @brief compute output in the host
 */
-static void histogram_host(unsigned int* histo, T* A, unsigned int bins, unsigned int nr_elements, int exp, unsigned int nr_of_dpus, int t) {
-
-    omp_set_num_threads(t);
-
-    if(!exp){
-        #pragma omp parallel for
-        for (unsigned int i = 0; i < nr_of_dpus; i++) {
-            for (unsigned int j = 0; j < nr_elements; j++) {
-                T d = A[j];
-                histo[i * bins + ((d * bins) >> DEPTH)] += 1;
-            }
-        }
-    }
-    else{
-        #pragma omp parallel for
-        for (unsigned int j = 0; j < nr_elements; j++) {
-            T d = A[j];
-            #pragma omp atomic update
-            histo[(d * bins) >> DEPTH] += 1;
-        }
-    }
+static void histogram_host(unsigned int *histo, T *A, unsigned int bins,
+			   unsigned int nr_elements, int exp,
+			   unsigned int nr_of_dpus, int t)
+{
+	// t OpenMP threads fill either one shared histogram (exp != 0) or
+	// nr_of_dpus consecutive histograms of `bins` counters each (exp == 0).
+	omp_set_num_threads(t);
+
+	if (exp) {
+#pragma omp parallel for
+		for (unsigned int idx = 0; idx < nr_elements; idx++) {
+#pragma omp atomic update
+			histo[(A[idx] * bins) >> DEPTH] += 1;
+		}
+		return;
+	}
+#pragma omp parallel for
+	for (unsigned int copy = 0; copy < nr_of_dpus; copy++) {
+		const unsigned int base = copy * bins;
+		for (unsigned int idx = 0; idx < nr_elements; idx++) {
+			histo[base + ((A[idx] * bins) >> DEPTH)] += 1;
+		}
+	}
 }
 
 // Params ---------------------------------------------------------------------
-void usage() {
-    fprintf(stderr,
-        "\nUsage:  ./program [options]"
-        "\n"
-        "\nGeneral options:"
-        "\n    -h        help"
-        "\n    -w <W>    # of untimed warmup iterations (default=1)"
-        "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -t <T>    # of threads (default=8)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
-        "\n"
-        "\nBenchmark-specific options:"
-        "\n    -i <I>    input size (default=1536*1024 elements)"
-        "\n    -b <B>    histogram size (default=256 bins)"
-        "\n    -f <F>    input image file (default=../input/image_VanHateren.iml)"
-        "\n");
+// Print the command-line help to stderr; the defaults mirror input_params().
+void usage()
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n    -t <T>    # of threads (default=8)"
+		"\n    -x <X>    Weak (0) or strong (1) scaling (default=1)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -i <I>    input size (default=1536*1024 elements)"
+		"\n    -b <B>    histogram size (default=256 bins)"
+		"\n    -f <F>    input image file (default=../../input/image_VanHateren.iml)"
+		"\n");
 }
 
-struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.input_size    = 1536 * 1024;
-    p.bins          = 256;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-    p.n_threads     = 8;
-    p.exp           = 1;
-    p.file_name     = "../../input/image_VanHateren.iml";
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.input_size = 1536 * 1024;
+	p.bins = 256;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+	p.n_threads = 8;
+	p.exp = 1;
+	p.file_name = "../../input/image_VanHateren.iml";
 #if NUMA
-    p.bitmask_in     = NULL;
-    p.bitmask_out    = NULL;
-    p.numa_node_cpu = -1;
+	p.bitmask_in = NULL;
+	p.bitmask_out = NULL;
+	p.numa_node_cpu = -1;
 #endif
 #if NUMA_MEMCPY
-    p.numa_node_cpu_memcpy  = -1;
-    p.bitmask_cpu    = NULL;
+	p.numa_node_cpu_memcpy = -1;
+	p.bitmask_cpu = NULL;
 #endif
 
-    int opt;
-    while((opt = getopt(argc, argv, "hi:b:w:e:f:x:t:A:B:C:D:M:")) >= 0) {
-        switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'i': p.input_size    = atoi(optarg); break;
-        case 'b': p.bins          = atoi(optarg); break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break;
-        case 'f': p.file_name     = optarg; break;
-        case 'x': p.exp           = atoi(optarg); break;
-        case 't': p.n_threads     = atoi(optarg); break;
+	int opt;
+	while ((opt = getopt(argc, argv, "hi:b:w:e:f:x:t:A:B:C:D:M:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'i':
+			p.input_size = atoi(optarg);
+			break;
+		case 'b':
+			p.bins = atoi(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 'f':
+			p.file_name = optarg;
+			break;
+		case 'x':
+			p.exp = atoi(optarg);
+			break;
+		case 't':
+			p.n_threads = atoi(optarg);
+			break;
 #if NUMA
-        case 'A': p.bitmask_in    = numa_parse_nodestring(optarg); break;
-        case 'B': p.bitmask_out   = numa_parse_nodestring(optarg); break;
-        case 'C': p.numa_node_cpu = atoi(optarg); break;
+		case 'A':
+			p.bitmask_in = numa_parse_nodestring(optarg);
+			break;
+		case 'B':
+			p.bitmask_out = numa_parse_nodestring(optarg);
+			break;
+		case 'C':
+			p.numa_node_cpu = atoi(optarg);
+			break;
 #if NUMA_MEMCPY
-        case 'D': p.bitmask_cpu   = numa_parse_nodestring(optarg); break;
-        case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break;
-#endif // NUMA_MEMCPY
-#endif // NUMA
-        default:
-            fprintf(stderr, "\nUnrecognized option!\n");
-            usage();
-            exit(0);
-        }
-    }
-    assert(p.n_threads > 0 && "Invalid # of ranks!");
-
-    return p;
+		case 'D':
+			p.bitmask_cpu = numa_parse_nodestring(optarg);
+			break;
+		case 'M':
+			p.numa_node_cpu_memcpy = atoi(optarg);
+			break;
+#endif				// NUMA_MEMCPY
+#endif				// NUMA
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);
+		}
+	}
+	assert(p.n_threads > 0 && "Invalid # of ranks!");
+
+	return p;
 }
 
 /**
 * @brief Main of the Host Application.
 */
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
+
+	struct Params p = input_params(argc, argv);
 
-    struct Params p = input_params(argc, argv);
+	uint32_t nr_of_dpus;
 
-    uint32_t nr_of_dpus;
-    
-    const unsigned int input_size = p.input_size; // Size of input image
-    if(!p.exp)
-        assert(input_size % p.n_threads == 0 && "Input size!");
-    else
-        assert(input_size % p.n_threads == 0 && "Input size!");
+	const unsigned int input_size = p.input_size;	// Size of input image
+	if (!p.exp)
+		assert(input_size % p.n_threads == 0 && "Input size!");
+	else
+		assert(input_size % p.n_threads == 0 && "Input size!");
 
-    // Input/output allocation
+	// Input/output allocation
 #if NUMA
-    if (p.bitmask_in) {
-        numa_set_membind(p.bitmask_in);
-        numa_free_nodemask(p.bitmask_in);
-    }
-    A = numa_alloc(input_size * sizeof(T));
+	if (p.bitmask_in) {
+		numa_set_membind(p.bitmask_in);
+		numa_free_nodemask(p.bitmask_in);
+	}
+	A = numa_alloc(input_size * sizeof(T));
 #else
-    A = malloc(input_size * sizeof(T));
+	A = malloc(input_size * sizeof(T));
 #endif
 
-    // Create an input file with arbitrary data.
-    read_input(A, p);
+	// Create an input file with arbitrary data.
+	read_input(A, p);
 
 #if NUMA
-    if (p.bitmask_out) {
-        numa_set_membind(p.bitmask_out);
-        numa_free_nodemask(p.bitmask_out);
-    }
+	if (p.bitmask_out) {
+		numa_set_membind(p.bitmask_out);
+		numa_free_nodemask(p.bitmask_out);
+	}
 #endif
-    if(!p.exp) {
-        // upstream code left nr_of_dpus uninitialized
-        nr_of_dpus = p.n_threads;
+	if (!p.exp) {
+		// upstream code left nr_of_dpus uninitialized
+		nr_of_dpus = p.n_threads;
 #if NUMA
-        histo_host = numa_alloc(nr_of_dpus * p.bins * sizeof(unsigned int));
+		histo_host =
+		    numa_alloc(nr_of_dpus * p.bins * sizeof(unsigned int));
 #else
-        histo_host = malloc(nr_of_dpus * p.bins * sizeof(unsigned int));
+		histo_host = malloc(nr_of_dpus * p.bins * sizeof(unsigned int));
 #endif
-    } else {
+	} else {
 #if NUMA
-        histo_host = numa_alloc(p.bins * sizeof(unsigned int));
+		histo_host = numa_alloc(p.bins * sizeof(unsigned int));
 #else
-        histo_host = malloc(p.bins * sizeof(unsigned int));
+		histo_host = malloc(p.bins * sizeof(unsigned int));
 #endif
-    }
+	}
 
 #if NUMA
 #if NUMA_MEMCPY
-    if (p.bitmask_cpu) {
-        numa_set_membind(p.bitmask_cpu);
-        numa_free_nodemask(p.bitmask_cpu);
-    }
+	if (p.bitmask_cpu) {
+		numa_set_membind(p.bitmask_cpu);
+		numa_free_nodemask(p.bitmask_cpu);
+	}
 #else
-    struct bitmask *bitmask_all = numa_allocate_nodemask();
-    numa_bitmask_setall(bitmask_all);
-    numa_set_membind(bitmask_all);
-    numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
-#endif // NUMA
+	struct bitmask *bitmask_all = numa_allocate_nodemask();
+	numa_bitmask_setall(bitmask_all);
+	numa_set_membind(bitmask_all);
+	numa_free_nodemask(bitmask_all);
+#endif				// NUMA_MEMCPY
+#endif				// NUMA
 
 #if NUMA
-    mp_pages[0] = A;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(A)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_in = mp_status[0];
-    }
-
-    mp_pages[0] = histo_host;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(C)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_out = mp_status[0];
-    }
-
-    numa_node_cpu = p.numa_node_cpu;
-    if (numa_node_cpu != -1) {
-        if (numa_run_on_node(numa_node_cpu) == -1) {
-            perror("numa_run_on_node");
-            numa_node_cpu = -1;
-        }
-    }
+	mp_pages[0] = A;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(A)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_in = mp_status[0];
+	}
+
+	mp_pages[0] = histo_host;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(C)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_out = mp_status[0];
+	}
+
+	numa_node_cpu = p.numa_node_cpu;
+	if (numa_node_cpu != -1) {
+		if (numa_run_on_node(numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
 #endif
 
 #if NUMA_MEMCPY
-    numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+	numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+				 || (numa_node_cpu + 8 == numa_node_in)) * 1;
 #endif
 
-    Timer timer;
+	Timer timer;
 
 #if NUMA_MEMCPY
-    numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
-    start(&timer, 1, 0);
-    if (!numa_node_in_is_local) {
-        A_local = (T*) numa_alloc(input_size * sizeof(T));
-    }
-    stop(&timer, 1);
-    if (!numa_node_in_is_local) {
-        if (p.numa_node_cpu_memcpy != -1) {
-            if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
-                perror("numa_run_on_node");
-                numa_node_cpu_memcpy = -1;
-            }
-        }
-    }
-    start(&timer, 2, 0);
-    if (!numa_node_in_is_local) {
-        memcpy(A_local, A, input_size * sizeof(T));
-    } else {
-        A_local = A;
-    }
-    stop(&timer, 2);
-    if (p.numa_node_cpu != -1) {
-        if (numa_run_on_node(p.numa_node_cpu) == -1) {
-            perror("numa_run_on_node");
-            numa_node_cpu = -1;
-        }
-    }
-    mp_pages[0] = A_local;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(A_local)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_local = mp_status[0];
-    }
+	numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
+	start(&timer, 1, 0);
+	if (!numa_node_in_is_local) {
+		A_local = (T *) numa_alloc(input_size * sizeof(T));
+	}
+	stop(&timer, 1);
+	if (!numa_node_in_is_local) {
+		if (p.numa_node_cpu_memcpy != -1) {
+			if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
+				perror("numa_run_on_node");
+				numa_node_cpu_memcpy = -1;
+			}
+		}
+	}
+	start(&timer, 2, 0);
+	if (!numa_node_in_is_local) {
+		memcpy(A_local, A, input_size * sizeof(T));
+	} else {
+		A_local = A;
+	}
+	stop(&timer, 2);
+	if (p.numa_node_cpu != -1) {
+		if (numa_run_on_node(p.numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
+	mp_pages[0] = A_local;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(A_local)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_local = mp_status[0];
+	}
 #else
-    A_local = A;
+	A_local = A;
 #endif
 
-    start(&timer, 0, 0);
+	start(&timer, 0, 0);
 
-    if(!p.exp)
-        memset(histo_host, 0, nr_of_dpus * p.bins * sizeof(unsigned int));
-    else
-        memset(histo_host, 0, p.bins * sizeof(unsigned int));
+	if (!p.exp)
+		memset(histo_host, 0,
+		       nr_of_dpus * p.bins * sizeof(unsigned int));
+	else
+		memset(histo_host, 0, p.bins * sizeof(unsigned int));
 
-    histogram_host(histo_host, A_local, p.bins, input_size, p.exp, nr_of_dpus, p.n_threads);
+	histogram_host(histo_host, A_local, p.bins, input_size, p.exp,
+		       nr_of_dpus, p.n_threads);
 
-    stop(&timer, 0);
+	stop(&timer, 0);
 
 #if NUMA_MEMCPY
-    start(&timer, 3, 0);
-    if (!numa_node_in_is_local) {
-        numa_free(A_local, input_size * sizeof(T));
-    }
-    stop(&timer, 3);
+	start(&timer, 3, 0);
+	if (!numa_node_in_is_local) {
+		numa_free(A_local, input_size * sizeof(T));
+	}
+	stop(&timer, 3);
 #endif
 
-    unsigned int nr_threads = 0;
+	unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-    nr_threads++;
+	nr_threads++;
 
 #if NUMA_MEMCPY
-    printf("[::] HST-S-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
-        " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
-        " | throughput_MBps=%f",
-        nr_threads, XSTR(T), input_size, p.exp ? p.bins : p.bins * nr_of_dpus,
-        numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
-        input_size * sizeof(T) / timer.time[0]);
-    printf(" throughput_MOpps=%f",
-        input_size / timer.time[0]);
-    printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
-        timer.time[0], timer.time[1], timer.time[2], timer.time[3],
-        timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+	printf
+	    ("[::] HST-S-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
+	     " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+	     " | throughput_MBps=%f", nr_threads, XSTR(T), input_size,
+	     p.exp ? p.bins : p.bins * nr_of_dpus, numa_node_in,
+	     numa_node_local, numa_node_out, numa_node_cpu,
+	     numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu),
+	     numa_distance(numa_node_cpu, numa_node_out),
+	     input_size * sizeof(T) / timer.time[0]);
+	printf(" throughput_MOpps=%f", input_size / timer.time[0]);
+	printf
+	    (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+	     timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+	     timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
 #else
-    printf("[::] HST-S-CPU | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
+	printf("[::] HST-S-CPU | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
 #if NUMA
-        " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+	       " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
 #endif
-        " | throughput_MBps=%f",
-        nr_threads, XSTR(T), input_size, p.exp ? p.bins : p.bins * nr_of_dpus,
+	       " | throughput_MBps=%f",
+	       nr_threads, XSTR(T), input_size,
+	       p.exp ? p.bins : p.bins * nr_of_dpus,
 #if NUMA
-        numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+	       numa_node_in, numa_node_out, numa_node_cpu,
+	       numa_distance(numa_node_in, numa_node_cpu),
+	       numa_distance(numa_node_cpu, numa_node_out),
 #endif
-        input_size * sizeof(T) / timer.time[0]);
-    printf(" throughput_MOpps=%f latency_us=%f\n",
-        input_size / timer.time[0], timer.time[0]);
-#endif // NUMA_MEMCPY
+	       input_size * sizeof(T) / timer.time[0]);
+	printf(" throughput_MOpps=%f latency_us=%f\n",
+	       input_size / timer.time[0], timer.time[0]);
+#endif				// NUMA_MEMCPY
 
 #if NUMA
-    numa_free(A, input_size * sizeof(T));
-    if (!p.exp) {
-        numa_free(histo_host, nr_of_dpus * p.bins * sizeof(unsigned int));
-    } else {
-        numa_free(histo_host, p.bins * sizeof(unsigned int));
-    }
+	numa_free(A, input_size * sizeof(T));
+	if (!p.exp) {
+		numa_free(histo_host,
+			  nr_of_dpus * p.bins * sizeof(unsigned int));
+	} else {
+		numa_free(histo_host, p.bins * sizeof(unsigned int));
+	}
 #else
-    free(A);
-    free(histo_host);
+	free(A);
+	free(histo_host);
 #endif
 
-    return 0;
+	return 0;
 }
diff --git a/HST-S/dpu/task.c b/HST-S/dpu/task.c
index 135f0d1..0333072 100644
--- a/HST-S/dpu/task.c
+++ b/HST-S/dpu/task.c
@@ -15,102 +15,121 @@
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 
 // Array for communication between adjacent tasklets
-uint32_t* message[NR_TASKLETS];
+uint32_t *message[NR_TASKLETS];
 // DPU histogram
-uint32_t* histo_dpu;
+uint32_t *histo_dpu;
 
 // Barrier
 BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 // Histogram in each tasklet
-static void histogram(uint32_t* histo, uint32_t bins, T *input, unsigned int l_size){
-    for(unsigned int j = 0; j < l_size; j++) {
-        T d = input[j];
-        histo[(d * bins) >> DEPTH] += 1;
-    }
+// Add each of the l_size cached samples in `input` to its bin in `histo`.
+static void histogram(uint32_t *histo, uint32_t bins, T *input,
+		      unsigned int l_size)
+{
+	const T *const end = input + l_size;
+	for (const T *sample = input; sample != end; sample++)
+		histo[(*sample * bins) >> DEPTH] += 1;
 }
 
 extern int main_kernel1(void);
 
-int (*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
 
-int main(void) { 
-    // Kernel
-    return kernels[DPU_INPUT_ARGUMENTS.kernel](); 
+int main(void)
+{
+	int (*selected)(void) = kernels[DPU_INPUT_ARGUMENTS.kernel];
+	return selected();	// run the kernel named by DPU_INPUT_ARGUMENTS.kernel
 }
 
 // main_kernel1
-int main_kernel1() {
-    unsigned int tasklet_id = me();
+int main_kernel1()
+{
+	unsigned int tasklet_id = me();
 #if PRINT
-    printf("tasklet_id = %u\n", tasklet_id);
+	printf("tasklet_id = %u\n", tasklet_id);
 #endif
-    if (tasklet_id == 0){ // Initialize once the cycle counter
-        mem_reset(); // Reset the heap
-    }
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
-    uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
-    uint32_t bins = DPU_INPUT_ARGUMENTS.bins;
-
-    // Address of the current processing block in MRAM
-    uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
-    uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
-    uint32_t mram_base_addr_histo = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
-
-    // Initialize a local cache to store the MRAM block
-    T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
-	
-    // Local histogram
-    uint32_t *histo = (uint32_t *) mem_alloc(bins * sizeof(uint32_t));
-
-    // Initialize local histogram
-    for(unsigned int i = 0; i < bins; i++){
-        histo[i] = 0;
-    }
-
-    // Compute histogram
-    for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){
-
-        // Bound checking
-        uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;
-
-        // Load cache with current MRAM block
-        mram_read((const __mram_ptr void*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);
-
-        // Histogram in each tasklet
-        histogram(histo, bins, cache_A, l_size_bytes >> DIV);
-
-    }
-    message[tasklet_id] = histo;
-
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    uint32_t *histo_dpu = message[0];
-
-    for (unsigned int i = tasklet_id; i < bins; i += NR_TASKLETS){
-        uint32_t b = 0;		
-        for (unsigned int j = 0; j < NR_TASKLETS; j++){			
-            b += *(message[j] + i);
-        }
-        histo_dpu[i] = b;
-    }
-
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    // Write dpu histogram to current MRAM block
-    if(tasklet_id == 0){
-        if(bins * sizeof(uint32_t) <= 2048)
-            mram_write(histo_dpu, (__mram_ptr void*)(mram_base_addr_histo), bins * sizeof(uint32_t));
-        else 
-            for(unsigned int offset = 0; offset < ((bins * sizeof(uint32_t)) >> 11); offset++){
-                mram_write(histo_dpu + (offset << 9), (__mram_ptr void*)(mram_base_addr_histo + (offset << 11)), 2048);
-            }
-    }
-
-    return 0;
+	if (tasklet_id == 0) {	// Initialize once the cycle counter
+		mem_reset();	// Reset the heap
+	}
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
+	uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size;	// Transfer input size per DPU in bytes
+	uint32_t bins = DPU_INPUT_ARGUMENTS.bins;
+
+	// Address of the current processing block in MRAM
+	uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
+	uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
+	uint32_t mram_base_addr_histo =
+	    (uint32_t) (DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
+
+	// Initialize a local cache to store the MRAM block
+	T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
+
+	// Local histogram
+	uint32_t *histo = (uint32_t *) mem_alloc(bins * sizeof(uint32_t));
+
+	// Initialize local histogram
+	for (unsigned int i = 0; i < bins; i++) {
+		histo[i] = 0;
+	}
+
+	// Compute histogram
+	for (unsigned int byte_index = base_tasklet;
+	     byte_index < input_size_dpu_bytes;
+	     byte_index += BLOCK_SIZE * NR_TASKLETS) {
+
+		// Bound checking
+		uint32_t l_size_bytes =
+		    (byte_index + BLOCK_SIZE >=
+		     input_size_dpu_bytes) ? (input_size_dpu_bytes -
+					      byte_index) : BLOCK_SIZE;
+
+		// Load cache with current MRAM block
+		mram_read((const __mram_ptr void *)(mram_base_addr_A +
+						    byte_index), cache_A,
+			  l_size_bytes);
+
+		// Histogram in each tasklet
+		histogram(histo, bins, cache_A, l_size_bytes >> DIV);
+
+	}
+	message[tasklet_id] = histo;
+
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	uint32_t *histo_dpu = message[0];
+
+	for (unsigned int i = tasklet_id; i < bins; i += NR_TASKLETS) {
+		uint32_t b = 0;
+		for (unsigned int j = 0; j < NR_TASKLETS; j++) {
+			b += *(message[j] + i);
+		}
+		histo_dpu[i] = b;
+	}
+
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	// Write dpu histogram to current MRAM block
+	if (tasklet_id == 0) {
+		if (bins * sizeof(uint32_t) <= 2048)
+			mram_write(histo_dpu,
+				   (__mram_ptr void *)(mram_base_addr_histo),
+				   bins * sizeof(uint32_t));
+		else
+			for (unsigned int offset = 0;
+			     offset < ((bins * sizeof(uint32_t)) >> 11);
+			     offset++) {
+				mram_write(histo_dpu + (offset << 9),
+					   (__mram_ptr void
+					    *)(mram_base_addr_histo +
+					       (offset << 11)), 2048);
+			}
+	}
+
+	return 0;
 }
diff --git a/HST-S/host/app.c b/HST-S/host/app.c
index 2c4e6a5..7f66f6e 100644
--- a/HST-S/host/app.c
+++ b/HST-S/host/app.c
@@ -40,362 +40,415 @@
 #endif
 
 // Pointer declaration
-static T* A;
-static unsigned int* histo_host;
-static unsigned int* histo;
+static T *A;
+static unsigned int *histo_host;
+static unsigned int *histo;
 
 // Create input arrays
-static void read_input(T* A, const Params p) {
-
-    char  dctFileName[100];
-    FILE *File = NULL;
-
-    // Open input file
-    unsigned short temp;
-    sprintf(dctFileName, "%s", p.file_name);
-    if((File = fopen(dctFileName, "rb")) != NULL) {
-        for(unsigned int y = 0; y < p.input_size; y++) {
-            if (fread(&temp, sizeof(unsigned short), 1, File) == 1) {
-                A[y] = (unsigned int)ByteSwap16(temp);
-                if(A[y] >= 4096)
-                    A[y] = 4095;
-            } else {
-                //printf("out of bounds read at offset %d -- seeking back to 0\n", y);
-                rewind(File);
-            }
-        }
-        fclose(File);
-    } else {
-        printf("%s does not exist\n", dctFileName);
-        exit(1);
-    }
+static void read_input(T *A, const Params p)
+{
+	// Fill A[0..p.input_size) with 16-bit samples from p.file_name, each
+	// passed through ByteSwap16 and clamped to 4095; wraps at end of file.
+	FILE *File = fopen(p.file_name, "rb");
+	unsigned short temp;
+
+	if (File == NULL) {
+		printf("%s does not exist\n", p.file_name);
+		exit(1);
+	}
+	for (unsigned int y = 0; y < p.input_size; y++) {
+		if (fread(&temp, sizeof(unsigned short), 1, File) != 1) {
+			// Short read: restart at offset 0 so A[y] is always set
+			rewind(File);
+			if (fread(&temp, sizeof(temp), 1, File) != 1) {
+				printf("%s is empty or unreadable\n", p.file_name);
+				exit(1);
+			}
+		}
+		A[y] = (unsigned int)ByteSwap16(temp);
+		if (A[y] >= 4096)
+			A[y] = 4095;
+	}
+	fclose(File);
 }
 
 // Compute output in the host
-static void histogram_host(unsigned int* histo, T* A, unsigned int bins, unsigned int nr_elements, int exp, unsigned int nr_of_dpus) {
-    if(!exp){
-        for (unsigned int i = 0; i < nr_of_dpus; i++) {
-            for (unsigned int j = 0; j < nr_elements; j++) {
-                T d = A[j];
-                histo[i * bins + ((d * bins) >> DEPTH)] += 1;
-            }
-        }
-    }
-    else{
-        for (unsigned int j = 0; j < nr_elements; j++) {
-            T d = A[j];
-            histo[(d * bins) >> DEPTH] += 1;
-        }
-    }
+static void histogram_host(unsigned int *histo, T *A, unsigned int bins,
+			   unsigned int nr_elements, int exp,
+			   unsigned int nr_of_dpus)
+{
+	// Host reference: element d is counted in bin (d * bins) >> DEPTH.
+	// With exp == 0 the same input is accumulated into nr_of_dpus
+	// consecutive histograms of `bins` counters each; otherwise a single
+	// histogram is filled.
+	const unsigned int copies = exp ? 1 : nr_of_dpus;
+
+	for (unsigned int c = 0; c < copies; c++) {
+		const unsigned int base = c * bins;
+		for (unsigned int e = 0; e < nr_elements; e++) {
+			const T value = A[e];
+			histo[base + ((value * bins) >> DEPTH)]++;
+		}
+	}
 }
 
 // Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
-    struct Params p = input_params(argc, argv);
+	struct Params p = input_params(argc, argv);
 
-    struct dpu_set_t dpu_set, dpu;
-    uint32_t nr_of_dpus;
-    uint32_t nr_of_ranks;
+	struct dpu_set_t dpu_set, dpu;
+	uint32_t nr_of_dpus;
+	uint32_t nr_of_ranks;
 
 #if ENERGY
-    struct dpu_probe_t probe;
-    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+	struct dpu_probe_t probe;
+	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
 #endif
 
-    // Timer declaration
-    Timer timer;
+	// Timer declaration
+	Timer timer;
 
-    int numa_node_rank = -2;
+	int numa_node_rank = -2;
 
-    // Allocate DPUs and load binary
+	// Allocate DPUs and load binary
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set));
-    timer.time[0] = 0; // alloc
+	DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set));
+	timer.time[0] = 0;	// alloc
 #endif
 #if !WITH_LOAD_OVERHEAD
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-    assert(nr_of_dpus == NR_DPUS);
-    timer.time[1] = 0; // load
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+	assert(nr_of_dpus == NR_DPUS);
+	timer.time[1] = 0;	// load
 #endif
 #if !WITH_FREE_OVERHEAD
-    timer.time[6] = 0; // free
+	timer.time[6] = 0;	// free
 #endif
 
-    unsigned int i = 0;
-    unsigned int input_size; // Size of input image
-    unsigned int dpu_s = p.dpu_s;
-    if(p.exp == 0)
-        input_size = p.input_size * NR_DPUS; // Size of input image
-    else if(p.exp == 1)
-        input_size = p.input_size; // Size of input image
-    else
-        input_size = p.input_size * dpu_s; // Size of input image
-
-    const unsigned int input_size_8bytes = 
-        ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
-    const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
-    const unsigned int input_size_dpu_8bytes = 
-        ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
-
-    // Input/output allocation
-    A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
-    T *bufferA = A;
-    histo_host = malloc(p.bins * sizeof(unsigned int));
-    histo = malloc(NR_DPUS * p.bins * sizeof(unsigned int));
-
-    // Create an input file with arbitrary data
-    read_input(A, p);
-    if(p.exp == 0){
-        for(unsigned int j = 1; j < NR_DPUS; j++){
-            memcpy(&A[j * input_size_dpu_8bytes], &A[0], input_size_dpu_8bytes * sizeof(T));
-        }
-    }
-    else if(p.exp == 2){
-        for(unsigned int j = 1; j < dpu_s; j++)
-            memcpy(&A[j * p.input_size], &A[0], p.input_size * sizeof(T));
-    }
-
-    // Loop over main kernel
-    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
-        memset(histo_host, 0, p.bins * sizeof(unsigned int));
-        memset(histo, 0, NR_DPUS * p.bins * sizeof(unsigned int));
+	unsigned int i = 0;
+	unsigned int input_size;	// Size of input image
+	unsigned int dpu_s = p.dpu_s;
+	if (p.exp == 0)
+		input_size = p.input_size * NR_DPUS;	// Size of input image
+	else if (p.exp == 1)
+		input_size = p.input_size;	// Size of input image
+	else
+		input_size = p.input_size * dpu_s;	// Size of input image
+
+	const unsigned int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size;	// Input size per DPU (max.), 8-byte aligned
+	const unsigned int input_size_dpu = divceil(input_size, NR_DPUS);	// Input size per DPU (max.)
+	const unsigned int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu;	// Input size per DPU (max.), 8-byte aligned
+
+	// Input/output allocation
+	A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+	T *bufferA = A;
+	histo_host = malloc(p.bins * sizeof(unsigned int));
+	histo = malloc(NR_DPUS * p.bins * sizeof(unsigned int));
+
+	// Read the input image from the file given by p.file_name
+	read_input(A, p);
+	if (p.exp == 0) {
+		for (unsigned int j = 1; j < NR_DPUS; j++) {
+			memcpy(&A[j * input_size_dpu_8bytes], &A[0],
+			       input_size_dpu_8bytes * sizeof(T));
+		}
+	} else if (p.exp == 2) {
+		for (unsigned int j = 1; j < dpu_s; j++)
+			memcpy(&A[j * p.input_size], &A[0],
+			       p.input_size * sizeof(T));
+	}
+	// Loop over main kernel
+	for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+		memset(histo_host, 0, p.bins * sizeof(unsigned int));
+		memset(histo, 0, NR_DPUS * p.bins * sizeof(unsigned int));
 
 #if WITH_ALLOC_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 0, 0);
-        }
-        DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 0);
-        }
+		if (rep >= p.n_warmup) {
+			start(&timer, 0, 0);
+		}
+		DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 0);
+		}
 #endif
 #if WITH_DPUINFO
-        printf("DPUs:");
-        DPU_FOREACH (dpu_set, dpu) {
-            int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
-            int slice = dpu_get_slice_id(dpu_from_set(dpu));
-            int member = dpu_get_member_id(dpu_from_set(dpu));
-            printf(" %d(%d.%d)", rank, slice, member);
-        }
-        printf("\n");
+		printf("DPUs:");
+		DPU_FOREACH(dpu_set, dpu) {
+			int rank =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
+			int slice = dpu_get_slice_id(dpu_from_set(dpu));
+			int member = dpu_get_member_id(dpu_from_set(dpu));
+			printf(" %d(%d.%d)", rank, slice, member);
+		}
+		printf("\n");
 #endif
 #if WITH_LOAD_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 1, 0);
-        }
-        DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 1);
-        }
-        DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-        DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-        assert(nr_of_dpus == NR_DPUS);
+		if (rep >= p.n_warmup) {
+			start(&timer, 1, 0);
+		}
+		DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 1);
+		}
+		DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+		DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+		assert(nr_of_dpus == NR_DPUS);
 #endif
 
-        // int prev_rank_id = -1;
-        int rank_id = -1;
-        DPU_FOREACH (dpu_set, dpu) {
-            rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
-            if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
-                numa_node_rank = -1;
-            } else {
-                numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
-            }
-            /*
-            if (rank_id != prev_rank_id) {
-                printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
-                prev_rank_id = rank_id;
-            }
-            */
-        }
-
-        // Compute output on CPU (performance comparison and verification purposes)
-        if(rep >= p.n_warmup) {
-            start(&timer, 2, 0);
-        }
-        histogram_host(histo_host, A, p.bins, p.input_size, 1, NR_DPUS);
-        if(rep >= p.n_warmup) {
-            stop(&timer, 2);
-        }
-
-        if(rep >= p.n_warmup) {
-            start(&timer, 3, 0);
-        }
-        // Input arguments
-        unsigned int kernel = 0;
-        i = 0;
-	    dpu_arguments_t input_arguments[NR_DPUS];
-	    for(i=0; i<NR_DPUS-1; i++) {
-	        input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); 
-	        input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T); 
-	        input_arguments[i].bins=p.bins;
-	        input_arguments[i].kernel=kernel;
-	    }
-	    input_arguments[NR_DPUS-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); 
-	    input_arguments[NR_DPUS-1].transfer_size=input_size_dpu_8bytes * sizeof(T); 
-	    input_arguments[NR_DPUS-1].bins=p.bins;
-	    input_arguments[NR_DPUS-1].kernel=kernel;
-
-        // Copy input arrays
-        i = 0;
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 3);
-        }
-
-        // Run DPU kernel
-        if(rep >= p.n_warmup) {
-            start(&timer, 4, 0);
-            #if ENERGY
-            DPU_ASSERT(dpu_probe_start(&probe));
-            #endif
-        }
- 
-        DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 4);
-            #if ENERGY
-            DPU_ASSERT(dpu_probe_stop(&probe));
-            #endif
-        }
+		// int prev_rank_id = -1;
+		int rank_id = -1;
+		DPU_FOREACH(dpu_set, dpu) {
+			rank_id =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
+			if ((numa_node_rank != -2)
+			    && numa_node_rank !=
+			    dpu_get_rank_numa_node(dpu_get_rank
+						   (dpu_from_set(dpu)))) {
+				numa_node_rank = -1;
+			} else {
+				numa_node_rank =
+				    dpu_get_rank_numa_node(dpu_get_rank
+							   (dpu_from_set(dpu)));
+			}
+			/*
+			   if (rank_id != prev_rank_id) {
+			   printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+			   prev_rank_id = rank_id;
+			   }
+			 */
+		}
+
+		// Compute output on CPU (performance comparison and verification purposes)
+		if (rep >= p.n_warmup) {
+			start(&timer, 2, 0);
+		}
+		histogram_host(histo_host, A, p.bins, p.input_size, 1, NR_DPUS);
+		if (rep >= p.n_warmup) {
+			stop(&timer, 2);
+		}
+
+		if (rep >= p.n_warmup) {
+			start(&timer, 3, 0);
+		}
+		// Input arguments
+		unsigned int kernel = 0;
+		i = 0;
+		dpu_arguments_t input_arguments[NR_DPUS];
+		for (i = 0; i < NR_DPUS - 1; i++) {
+			input_arguments[i].size =
+			    input_size_dpu_8bytes * sizeof(T);
+			input_arguments[i].transfer_size =
+			    input_size_dpu_8bytes * sizeof(T);
+			input_arguments[i].bins = p.bins;
+			input_arguments[i].kernel = kernel;
+		}
+		input_arguments[NR_DPUS - 1].size =
+		    (input_size_8bytes -
+		     input_size_dpu_8bytes * (NR_DPUS - 1)) * sizeof(T);
+		input_arguments[NR_DPUS - 1].transfer_size =
+		    input_size_dpu_8bytes * sizeof(T);
+		input_arguments[NR_DPUS - 1].bins = p.bins;
+		input_arguments[NR_DPUS - 1].kernel = kernel;
+
+		// Copy input arrays
+		i = 0;
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+			    sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferA + input_size_dpu_8bytes * i));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, 0,
+			    input_size_dpu_8bytes * sizeof(T),
+			    DPU_XFER_DEFAULT));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 3);
+		}
+		// Run DPU kernel
+		if (rep >= p.n_warmup) {
+			start(&timer, 4, 0);
+#if ENERGY
+			DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+		}
 
+		DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 4);
+#if ENERGY
+			DPU_ASSERT(dpu_probe_stop(&probe));
+#endif
+		}
 #if PRINT
-        {
-            unsigned int each_dpu = 0;
-            printf("Display DPU Logs\n");
-            DPU_FOREACH (dpu_set, dpu) {
-                printf("DPU#%d:\n", each_dpu);
-                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
-                each_dpu++;
-            }
-        }
+		{
+			unsigned int each_dpu = 0;
+			printf("Display DPU Logs\n");
+			DPU_FOREACH(dpu_set, dpu) {
+				printf("DPU#%d:\n", each_dpu);
+				DPU_ASSERT(dpulog_read_for_dpu
+					   (dpu.dpu, stdout));
+				each_dpu++;
+			}
+		}
 #endif
 
-        i = 0;
-        if(rep >= p.n_warmup) {
-            start(&timer, 5, 0);
-        }
-        // PARALLEL RETRIEVE TRANSFER
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, histo + p.bins * i));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), p.bins * sizeof(unsigned int), DPU_XFER_DEFAULT));
-
-        // Final histogram merging
-        for(i = 1; i < NR_DPUS; i++){
-            for(unsigned int j = 0; j < p.bins; j++){
-                histo[j] += histo[j + i * p.bins];
-            }
-        }
-        if(rep >= p.n_warmup) {
-            stop(&timer, 5);
-        }
-
+		i = 0;
+		if (rep >= p.n_warmup) {
+			start(&timer, 5, 0);
+		}
+		// PARALLEL RETRIEVE TRANSFER
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer(dpu, histo + p.bins * i));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_FROM_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    input_size_dpu_8bytes * sizeof(T),
+			    p.bins * sizeof(unsigned int), DPU_XFER_DEFAULT));
+
+		// Final histogram merging
+		for (i = 1; i < NR_DPUS; i++) {
+			for (unsigned int j = 0; j < p.bins; j++) {
+				histo[j] += histo[j + i * p.bins];
+			}
+		}
+		if (rep >= p.n_warmup) {
+			stop(&timer, 5);
+		}
 #if WITH_ALLOC_OVERHEAD
 #if WITH_FREE_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 6, 0);
-        }
+		if (rep >= p.n_warmup) {
+			start(&timer, 6, 0);
+		}
 #endif
-        DPU_ASSERT(dpu_free(dpu_set));
+		DPU_ASSERT(dpu_free(dpu_set));
 #if WITH_FREE_OVERHEAD
-        if(rep >= p.n_warmup) {
-            stop(&timer, 6);
-        }
+		if (rep >= p.n_warmup) {
+			stop(&timer, 6);
+		}
 #endif
 #endif
 
-        if (rep >= p.n_warmup) {
-            printf("[::] HST-S-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d n_bins=%d",
-                nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T), input_size, p.bins);
-            printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
-                WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
-            printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
-                timer.time[0],
-                timer.time[1],
-                timer.time[2],
-                timer.time[3],
-                timer.time[4],
-                timer.time[5],
-                timer.time[6]);
-            printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-                input_size * sizeof(T) / timer.time[2],
-                input_size * sizeof(T) / (timer.time[4]),
-                input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-            printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-                input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]),
-                input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-                input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-            printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-                input_size / timer.time[2],
-                input_size / (timer.time[4]),
-                input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-            printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
-                input_size / (timer.time[3] + timer.time[4] + timer.time[5]),
-                input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-                input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-        }
-
-    }
-
-    #if ENERGY
-    double energy;
-    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
-    printf("DPU Energy (J): %f\t", energy);
-    #endif
-
-    // Check output
-    bool status = true;
-    if(p.exp == 1) 
-        for (unsigned int j = 0; j < p.bins; j++) {
-            if(histo_host[j] != histo[j]){ 
-                status = false;
+		if (rep >= p.n_warmup) {
+			printf
+			    ("[::] HST-S-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d n_bins=%d",
+			     nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T),
+			     input_size, p.bins);
+			printf
+			    (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+			     WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+			     WITH_FREE_OVERHEAD, numa_node_rank);
+			printf
+			    ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+			     timer.time[0], timer.time[1], timer.time[2],
+			     timer.time[3], timer.time[4], timer.time[5],
+			     timer.time[6]);
+			printf
+			    (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+			     input_size * sizeof(T) / timer.time[2],
+			     input_size * sizeof(T) / (timer.time[4]),
+			     input_size * sizeof(T) / (timer.time[0] +
+						       timer.time[1] +
+						       timer.time[3] +
+						       timer.time[4] +
+						       timer.time[5] +
+						       timer.time[6]));
+			printf
+			    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+			     input_size * sizeof(T) / (timer.time[3] +
+						       timer.time[4] +
+						       timer.time[5]),
+			     input_size * sizeof(T) / (timer.time[1] +
+						       timer.time[3] +
+						       timer.time[4] +
+						       timer.time[5]),
+			     input_size * sizeof(T) / (timer.time[0] +
+						       timer.time[1] +
+						       timer.time[3] +
+						       timer.time[4] +
+						       timer.time[5]));
+			printf
+			    (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+			     input_size / timer.time[2],
+			     input_size / (timer.time[4]),
+			     input_size / (timer.time[0] + timer.time[1] +
+					   timer.time[3] + timer.time[4] +
+					   timer.time[5] + timer.time[6]));
+			printf
+			    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+			     input_size / (timer.time[3] + timer.time[4] +
+					   timer.time[5]),
+			     input_size / (timer.time[1] + timer.time[3] +
+					   timer.time[4] + timer.time[5]),
+			     input_size / (timer.time[0] + timer.time[1] +
+					   timer.time[3] + timer.time[4] +
+					   timer.time[5]));
+		}
+
+	}
+
+#if ENERGY
+	double energy;
+	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+	printf("DPU Energy (J): %f\t", energy);
+#endif
+
+	// Check output
+	bool status = true;
+	if (p.exp == 1)
+		for (unsigned int j = 0; j < p.bins; j++) {
+			if (histo_host[j] != histo[j]) {
+				status = false;
 #if PRINT
-                printf("%u - %u: %u -- %u\n", j, j, histo_host[j], histo[j]);
+				printf("%u - %u: %u -- %u\n", j, j,
+				       histo_host[j], histo[j]);
 #endif
-            }
-        }
-    else if(p.exp == 2) 
-        for (unsigned int j = 0; j < p.bins; j++) {
-            if(dpu_s * histo_host[j] != histo[j]){ 
-                status = false;
+			}
+	} else if (p.exp == 2)
+		for (unsigned int j = 0; j < p.bins; j++) {
+			if (dpu_s * histo_host[j] != histo[j]) {
+				status = false;
 #if PRINT
-                printf("%u - %u: %u -- %u\n", j, j, dpu_s * histo_host[j], histo[j]);
+				printf("%u - %u: %u -- %u\n", j, j,
+				       dpu_s * histo_host[j], histo[j]);
 #endif
-            }
-        }
-    else
-        for (unsigned int j = 0; j < p.bins; j++) {
-            if(NR_DPUS * histo_host[j] != histo[j]){ 
-                status = false;
+			}
+	} else
+		for (unsigned int j = 0; j < p.bins; j++) {
+			if (NR_DPUS * histo_host[j] != histo[j]) {
+				status = false;
 #if PRINT
-                printf("%u - %u: %u -- %u\n", j, j, NR_DPUS * histo_host[j], histo[j]);
+				printf("%u - %u: %u -- %u\n", j, j,
+				       NR_DPUS * histo_host[j], histo[j]);
 #endif
-            }
-        }
-    if (status) {
-        printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
-    } else {
-        printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
-    }
-
-    // Deallocation
-    free(A);
-    free(histo_host);
-    free(histo);
+			}
+		}
+	if (status) {
+		printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+		       "] Outputs are equal\n");
+	} else {
+		printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+		       "] Outputs differ!\n");
+	}
+
+	// Deallocation
+	free(A);
+	free(histo_host);
+	free(histo);
 #if !WITH_ALLOC_OVERHEAD
 	DPU_ASSERT(dpu_free(dpu_set));
 #endif
-	
-    return status ? 0 : -1;
+
+	return status ? 0 : -1;
 }
diff --git a/HST-S/support/common.h b/HST-S/support/common.h
index 30df40d..e0cacbb 100755
--- a/HST-S/support/common.h
+++ b/HST-S/support/common.h
@@ -13,8 +13,8 @@
 
 // Data type
 #define T uint32_t
-#define DIV 2 // Shift right to divide by sizeof(T)
-#define REGS (BLOCK_SIZE >> 2) // 32 bits
+#define DIV 2			// Shift right to divide by sizeof(T)
+#define REGS (BLOCK_SIZE >> 2)	// 32 bits
 
 // Pixel depth
 #define DEPTH 12
@@ -22,19 +22,19 @@
 
 // Structures used by both the host and the dpu to communicate information 
 typedef struct {
-    uint32_t size;
-    uint32_t transfer_size;
-    uint32_t bins;
+	uint32_t size;
+	uint32_t transfer_size;
+	uint32_t bins;
 	enum kernels {
-	    kernel1 = 0,
-	    nr_kernels = 1,
+		kernel1 = 0,
+		nr_kernels = 1,
 	} kernel;
 } dpu_arguments_t;
 
 #ifndef ENERGY
 #define ENERGY 0
 #endif
-#define PRINT 0 
+#define PRINT 0
 
 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
diff --git a/HST-S/support/params.h b/HST-S/support/params.h
index e29449b..3028a50 100644
--- a/HST-S/support/params.h
+++ b/HST-S/support/params.h
@@ -4,64 +4,80 @@
 #include "common.h"
 
 typedef struct Params {
-    unsigned int   input_size;
-    unsigned int   bins;
-    int   n_warmup;
-    int   n_reps;
-    const char *file_name;
-    int  exp;
-    int  dpu_s;
-}Params;
+	unsigned int input_size;
+	unsigned int bins;
+	int n_warmup;
+	int n_reps;
+	const char *file_name;
+	int exp;
+	int dpu_s;
+} Params;
 
-static void usage() {
-    fprintf(stderr,
-        "\nUsage:  ./program [options]"
-        "\n"
-        "\nGeneral options:"
-        "\n    -h        help"
-        "\n    -w <W>    # of untimed warmup iterations (default=1)"
-        "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1, 2) scaling (default=0)"
-        "\n"
-        "\nBenchmark-specific options:"
-        "\n    -i <I>    input size (default=1536*1024 elements)"
-        "\n    -b <B>    histogram size (default=256 bins)"
-        "\n    -f <F>    input image file (default=../input/image_VanHateren.iml)"
-        "\n");
+static void usage()
+{
+	// Print the command-line help text to stderr.
+	fprintf(stderr, "\nUsage:  ./program [options]"
+		"\n\nGeneral options:"
+		"\n    -h        help"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n    -x <X>    Weak (0) or strong (1, 2) scaling (default=0)"
+		"\n    -z <Z>    # of input replicas for -x 2 (default=64)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -i <I>    input size (default=1536*1024 elements)"
+		"\n    -b <B>    histogram size (default=256 bins)"
+		"\n    -f <F>    input image file (default=./input/image_VanHateren.iml)"
+		"\n");
 }
 
-struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.input_size    = 1536 * 1024;
-    p.bins          = 256;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-    p.exp           = 0;
-    p.file_name     = "./input/image_VanHateren.iml";
-    p.dpu_s         = 64;
+// Parse command-line options into a Params, starting from the defaults below.
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.input_size = 1536 * 1024;
+	p.bins = 256;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+	p.exp = 0;
+	p.file_name = "./input/image_VanHateren.iml";
+	p.dpu_s = 64;
 
-    int opt;
-    while((opt = getopt(argc, argv, "hi:b:w:e:f:x:z:")) >= 0) {
-        switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'i': p.input_size    = atoi(optarg); break;
-        case 'b': p.bins          = atoi(optarg); break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break;
-        case 'f': p.file_name     = optarg; break;
-        case 'x': p.exp           = atoi(optarg); break;
-        case 'z': p.dpu_s         = atoi(optarg); break;
-        default:
-            fprintf(stderr, "\nUnrecognized option!\n");
-            usage();
-            exit(0);
-        }
-    }
-    assert(NR_DPUS > 0 && "Invalid # of dpus!");
+	int opt;
+	while ((opt = getopt(argc, argv, "hi:b:w:e:f:x:z:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);	// exit() does not return, so no break is needed
+		case 'i':
+			p.input_size = atoi(optarg);
+			break;
+		case 'b':
+			p.bins = atoi(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 'f':
+			p.file_name = optarg;
+			break;
+		case 'x':
+			p.exp = atoi(optarg);
+			break;
+		case 'z':
+			p.dpu_s = atoi(optarg);
+			break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(1);	// invalid usage must not report success
+		}
+	}
+	assert(NR_DPUS > 0 && "Invalid # of dpus!");
 
-    return p;
+	return p;
 }
 #endif
diff --git a/HST-S/support/timer.h b/HST-S/support/timer.h
index 4d597b9..df68334 100755
--- a/HST-S/support/timer.h
+++ b/HST-S/support/timer.h
@@ -1,66 +1,74 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[7];
-    struct timeval stopTime[7];
-    double         time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by:    IMPACT Research Group
+ *                  University of Cordoba and University of Illinois
+ *                  http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ *      > Redistributions of source code must retain the above copyright notice,
+ *        this list of conditions and the following disclaimers.
+ *      > Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimers in the
+ *        documentation and/or other materials provided with the distribution.
+ *      > Neither the names of IMPACT Research Group, University of Cordoba,
+ *        University of Illinois nor the names of its contributors may be used
+ *        to endorse or promote products derived from this Software without
+ *        specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer {
+
+	struct timeval startTime[7];
+	struct timeval stopTime[7];
+	double time[7];
+
+} Timer;
+
+void start(Timer *timer, int i, int rep)
+{
+	if (rep == 0) {
+		timer->time[i] = 0.0;
+	}
+	gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i)
+{
+	gettimeofday(&timer->stopTime[i], NULL);
+	timer->time[i] +=
+	    (timer->stopTime[i].tv_sec -
+	     timer->startTime[i].tv_sec) * 1000000.0 +
+	    (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+}
+
+void print(Timer *timer, int i, int REP)
+{
+	printf("Time (ms): %f\t", timer->time[i] / (1000 * REP));
+}
+
+void printall(Timer *timer, int maxt)
+{
+	for (int i = 0; i <= maxt; i++) {
+		printf(" timer%d_us=%f", i, timer->time[i]);
+	}
+	printf("\n");
+}
diff --git a/MLP/Makefile b/MLP/Makefile
index 944b3ca..1ce804d 100644
--- a/MLP/Makefile
+++ b/MLP/Makefile
@@ -1,44 +1,54 @@
-DPU_DIR := dpu
-HOST_DIR := host
-BUILDDIR ?= bin
-NR_TASKLETS ?= 16 
+NR_DPUS ?= 1
+NR_TASKLETS ?= 16
 BL ?= 10
-NR_DPUS ?= 1 
 
-define conf_filename
-	${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3).conf
-endef
-CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL})
+HOST_SOURCES := $(wildcard host/*.c)
+DPU_SOURCES := $(wildcard dpu/*.c)
 
-HOST_TARGET := ${BUILDDIR}/mlp_host
-DPU_TARGET := ${BUILDDIR}/mlp_dpu
+aspectc ?= 0
+aspectc_timing ?= 0
 
-COMMON_INCLUDES := support
-HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
-DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)
+HOST_CC := ${CC}
 
-.PHONY: all clean test
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} 
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DASPECTC=${aspectc}
+DPU_FLAGS := ${COMMON_FLAGS} -O2
+
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
 
-__dirs := $(shell mkdir -p ${BUILDDIR})
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
 
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL}
-DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL}
+QUIET = @
 
-all: ${HOST_TARGET} ${DPU_TARGET}
+ifeq (${verbose}, 1)
+	QUIET =
+endif
 
-${CONF}:
-	$(RM) $(call conf_filename,*,*)
-	touch ${CONF}
+all: bin/mlp_dpu bin/mlp_host
 
-${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
-	$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+bin:
+	${QUIET}mkdir -p bin
 
-${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
-	dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
+bin/mlp_host: ${HOST_SOURCES} include bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
+
+bin/mlp_dpu: ${DPU_SOURCES} include bin
+	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
 
 clean:
-	$(RM) -r $(BUILDDIR)
+	${QUIET}$(RM) -r bin
 
 test: all
-	./${HOST_TARGET} -m 1024 -n 1024
+	bin/mlp_host -m 1024 -n 1024
+
+.PHONY: all clean test
diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile
index 3404638..7eb5f00 100644
--- a/MLP/baselines/cpu/Makefile
+++ b/MLP/baselines/cpu/Makefile
@@ -1,7 +1,28 @@
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+	CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+	LDFLAGS += -lnuma
+endif
+
 all: mlp_openmp
 
 mlp_openmp: mlp_openmp.c
-	gcc -Wall -Wextra -pedantic -march=native -O2 mlp_openmp.c -o mlp_openmp -fopenmp -std=c99
+	gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} mlp_openmp.c -o mlp_openmp -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -fopenmp -std=c99 ${LDFLAGS}
 
 mlp_openmp_O0: mlp_openmp.c
 	gcc mlp_openmp.c -o mlp_openmp_O0 -fopenmp -std=c99
diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c
index 8f95e7c..b473d7a 100644
--- a/MLP/baselines/cpu/mlp_openmp.c
+++ b/MLP/baselines/cpu/mlp_openmp.c
@@ -11,173 +11,261 @@
 #include <getopt.h>
 #include <assert.h>
 #include <stdint.h>
-#include "../../support/timer.h"
 #include "../../support/common.h"
 
+#if WITH_BENCHMARK
+#include "../../support/timer.h"
+#else
+#define start(...)
+#define stop(...)
+#endif
+
+#if NUMA
+#include <numaif.h>
+#include <numa.h>
+
+void *mp_pages[1];
+int mp_status[1];
+int mp_nodes[1];
+int numa_node_data = -1;
+int numa_node_cpu = -1;
+#endif
+
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
-T** A;
-T* B;
-T* C;
+// weights
+T **A;
+
+// input/output
+T *B;
+
+// intermediate
+T *C;
 
 // Create input arrays
-static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){
-    for (unsigned int l = 0; l < NUM_LAYERS; l++)
-		for (unsigned int i = 0; i < m_size * n_size; i++){
-			if(i % 100 < 98){
+static void init_data(T **A, unsigned int m_size, unsigned int n_size)
+{
+	for (unsigned int l = 0; l < NUM_LAYERS; l++) {
+		for (unsigned int i = 0; i < m_size * n_size; i++) {
+			if (i % 100 < 98) {
 				A[l][i] = 0;
-			}else{
-				A[l][i] = (l+i) % 2;
+			} else {
+				A[l][i] = (l + i) % 2;
 			}
 		}
-	for (unsigned int i = 0; i < n_size; i++){
-		if(i % 50 < 48){
+	}
+}
+
+static void init_B(T *B, unsigned int n_size)
+{
+	for (unsigned int i = 0; i < n_size; i++) {
+		if (i % 50 < 48) {
 			B[i] = 0;
-		}
-		else{
+		} else {
 			B[i] = i % 2;
 		}
 	}
 }
 
 // Compute output in the host
-static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
-	for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
-		for (unsigned int m = 0; m < m_size; m++){
+static void mlp_host(T *C, T **A, T *B, unsigned int m_size,
+		     unsigned int n_size)
+{
+	for (unsigned int nl = 0; nl < NUM_LAYERS; nl++) {
+		for (unsigned int m = 0; m < m_size; m++) {
 			C[m] = 0;
 		}
-		#pragma omp parallel for
-		for (unsigned int m = 0; m < m_size; m++){
-			for (unsigned int n = 0; n < n_size; n++){
+#pragma omp parallel for
+		for (unsigned int m = 0; m < m_size; m++) {
+			for (unsigned int n = 0; n < n_size; n++) {
 				C[m] += A[nl][m * n_size + n] * B[n];
 			}
 			C[m] = max(0, C[m]);
 		}
-		for (unsigned int n = 0; n < n_size; n++){
+		for (unsigned int n = 0; n < n_size; n++) {
 			B[n] = C[n];
 		}
 	}
 }
 
-static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) {
-  uint64_t sum = 0;
-  for (uint64_t m = 0; m < n_size; m++){
-    sum += B[m];
-  }
-  return sum;
+static uint64_t mlp_host_sum(uint64_t n_size)
+{
+	uint64_t sum = 0;
+	for (uint64_t m = 0; m < n_size; m++) {
+		sum += B[m];
+	}
+	return sum;
 }
 
 // Params ---------------------------------------------------------------------
 typedef struct Params {
-  char* dpu_type;
-  int   nr_of_ranks;
-  int   input_size_n;
-  int   input_size_m;
-  int   n_warmup;
-  int   n_reps;
-}Params;
-
-void usage() {
-  fprintf(stderr,
-    "\nUsage:  ./program [options]"
-    "\n"
-    "\nGeneral options:"
-    "\n    -h        help"
-    "\n    -d <D>    DPU type (default=fsim)"
-    "\n    -r <R>    # of ranks (default=2)"
-    "\n"
-    "\nBenchmark-specific options:"
-    "\n    -i <I>    input size (default=8M elements)"
-    "\n");
-  }
-
-  struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.dpu_type      = "fsim";
-    p.nr_of_ranks   = 1;
-    p.input_size_n  = 1 << 9;
-    p.input_size_m  = 1 << 9;
-    p.n_warmup      = 2;
-    p.n_reps        = 3;
-
-    int opt;
-    while((opt = getopt(argc, argv, "hd:r:i:")) >= 0) {
-      switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'd': p.dpu_type        = optarg; break;
-        case 'r': p.nr_of_ranks     = atoi(optarg); break;
-        case 'n': p.input_size_n    = atoi(optarg); break;
-        case 'm': p.input_size_m    = atoi(optarg); break;
-        default:
-        fprintf(stderr, "\nUnrecognized option!\n");
-        usage();
-        exit(0);
-      }
-    }
-    assert(p.nr_of_ranks > 0 && "Invalid # of ranks!");
-
-    return p;
-  }
+	int input_size_n;
+	int input_size_m;
+	int n_reps;
+#if NUMA
+	struct bitmask *bitmask;
+	int numa_node_cpu;
+#endif
+} Params;
+
+void usage()
+{
+	fprintf(stderr, "\nUsage:  ./program [options]" "\n");
+}
+
+// Parse argv into a Params struct; options that are not given keep the defaults set below.
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.input_size_n = 8192;
+	p.input_size_m = 20480;
+	p.n_reps = 100;
+#if NUMA
+	p.bitmask = NULL;
+	p.numa_node_cpu = -1;
+#endif
+
+	int opt;
+	while ((opt = getopt(argc, argv, "he:n:m:A:C:")) >= 0) {
+		switch (opt) {
+		case 'h':	// reachable only because 'h' is listed in the optstring
+			usage();
+			exit(0);
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 'n':
+			p.input_size_n = atoi(optarg);
+			break;
+		case 'm':
+			p.input_size_m = atoi(optarg);
+			break;
+#if NUMA
+		case 'A':
+			p.bitmask = numa_parse_nodestring(optarg);
+			break;
+		case 'C':
+			p.numa_node_cpu = atoi(optarg);
+			break;
+#endif
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);
+		}
+	}
+
+	return p;
+}
 
   /**
   * @brief Main of the Host Application.
   */
-  int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
-    struct Params p = input_params(argc, argv);
-    uint64_t n_size = 8192;
-    uint64_t m_size = 20480;
+	struct Params p = input_params(argc, argv);
+	uint64_t n_size = p.input_size_n;
+	uint64_t m_size = p.input_size_m;
 
-    Timer timer;
-    A = malloc(NUM_LAYERS * sizeof(T*));
-    for(int l = 0; l < NUM_LAYERS; l++)
-        A[l] = malloc(n_size*m_size*sizeof(unsigned int));
-    B = malloc(m_size*sizeof(unsigned int));
-    C = malloc(m_size*sizeof(unsigned int));
+#if WITH_BENCHMARK
+	Timer timer;
+#endif
 
-    for (int i = 0; i < 100; i++) {
-        // Create an input file with arbitrary data.
-        init_data(A, B, m_size, n_size);
+#if NUMA
+	if (p.bitmask) {
+		numa_set_membind(p.bitmask);
+		numa_free_nodemask(p.bitmask);
+	}
+	A = numa_alloc(NUM_LAYERS * sizeof(T *));
+	for (int l = 0; l < NUM_LAYERS; l++) {
+		A[l] = numa_alloc(n_size * m_size * sizeof(unsigned int));
+	}
+	B = numa_alloc(m_size * sizeof(unsigned int));
+	C = numa_alloc(m_size * sizeof(unsigned int));
+
+	mp_pages[0] = A;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(A)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_data = mp_status[0];
+	}
+
+	numa_node_cpu = p.numa_node_cpu;
+	if (numa_node_cpu != -1) {
+		if (numa_run_on_node(numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
+#else
+	A = malloc(NUM_LAYERS * sizeof(T *));
+	for (int l = 0; l < NUM_LAYERS; l++) {
+		A[l] = malloc(n_size * m_size * sizeof(unsigned int));
+	}
+	B = malloc(m_size * sizeof(unsigned int));
+	C = malloc(m_size * sizeof(unsigned int));
+#endif
+
+	// Create an input file with arbitrary data.
+	init_data(A, m_size, n_size);
+
+	for (int i = 0; i < p.n_reps; i++) {
+		init_B(B, n_size);
 
-        start(&timer, 0, 0);
-        mlp_host(C, A, B, n_size, m_size);
-        stop(&timer, 0);
+		start(&timer, 0, 0);
+		mlp_host(C, A, B, n_size, m_size);
+		stop(&timer, 0);
 
-        unsigned int nr_threads = 0;
+#if WITH_BENCHMARK
+		unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-        nr_threads++;
-
-        printf("[::] n_threads=%d e_type=%s n_elements=%lu "
-            "| throughput_cpu_omp_MBps=%f\n",
-            nr_threads, XSTR(T), n_size * m_size,
-            n_size * m_size * sizeof(T) / timer.time[0]);
-        printf("[::] n_threads=%d e_type=%s n_elements=%lu "
-            "| throughput_cpu_omp_MOpps=%f\n",
-            nr_threads, XSTR(T), n_size * m_size,
-            n_size * m_size / timer.time[0]);
-        printf("[::] n_threads=%d e_type=%s n_elements=%lu |",
-            nr_threads, XSTR(T), n_size * m_size);
-        printall(&timer, 0);
-    }
-
-    uint32_t sum = mlp_host_sum(n_size, m_size);
-   
-    printf("Kernel ");
-    print(&timer, 0, 1);
-    printf("\n");
-
-    printf("SUM = %d \n", sum);
-
-    for(int l = 0; l < NUM_LAYERS; l++)
-        free(A[l]);
-    free(A);
-    free(B);
-    free(C);
-
-    return 0;
+		nr_threads++;
+
+		printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu",
+		       nr_threads, XSTR(T), n_size * m_size);
+#if NUMA
+		printf
+		    (" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d",
+		     numa_node_data, numa_node_cpu,
+		     numa_distance(numa_node_data, numa_node_cpu));
+#endif
+		printf(" | throughput_MBps=%f throughput_MOpps=%f",
+		       n_size * m_size * sizeof(T) / timer.time[0],
+		       n_size * m_size / timer.time[0]);
+		printf(" latency_us=%f\n", timer.time[0]);
+#endif				// WITH_BENCHMARK
+	}
+
+#if NOP_SYNC
+	for (int rep = 0; rep < 200000; rep++) {
+		asm volatile ("nop"::);
+	}
+#endif
+
+	uint32_t sum = mlp_host_sum(n_size);
+
+	printf("SUM = %d \n", sum);
+
+#if NUMA
+	for (int l = 0; l < NUM_LAYERS; l++) {
+		numa_free(A[l], n_size * m_size * sizeof(unsigned int));
+	}
+	numa_free(A, NUM_LAYERS * sizeof(T *));
+	numa_free(B, m_size * sizeof(unsigned int));
+	numa_free(C, m_size * sizeof(unsigned int));
+#else
+	for (int l = 0; l < NUM_LAYERS; l++) {
+		free(A[l]);
+	}
+	free(A);
+	free(B);
+	free(C);
+#endif
+
+	return 0;
 }
diff --git a/MLP/benchmark-scripts/ccmcc25-sim.sh b/MLP/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..3abe82e
--- /dev/null
+++ b/MLP/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/mlp_host -w 0 -e 50 -m ${nr_rows} -n ${nr_cols}
+}
+
+export -f run_benchmark_nmc
+
+sdk=2025.1.0	# SDK release used below for both the log file name and the env script path
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+source ~/lib/local/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks  MLP  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 nr_cols={nr_cols} nr_rows={nr_rows} \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+	::: nr_cols 1024 2048 3072 4096 \
+	::: nr_rows 512 768 1024 2048 \
+>> ${fn}.txt
diff --git a/MLP/benchmark-scripts/ccmcc25.sh b/MLP/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..02063b9
--- /dev/null
+++ b/MLP/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/mlp_host -w 0 -e 50 -m ${nr_rows} -n ${nr_cols}
+}
+
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+	echo "prim-benchmarks  MLP  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} nr_cols={nr_cols} nr_rows={nr_rows} \
+		::: numa_rank any \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+		::: nr_cols 4096 8192 16384 \
+		::: nr_rows 1024 2048 4096 \
+	>> ${fn}.txt
+
+done
diff --git a/MLP/dpu/task.c b/MLP/dpu/task.c
index de3e554..ae400ae 100644
--- a/MLP/dpu/task.c
+++ b/MLP/dpu/task.c
@@ -10,12 +10,13 @@
 #include <barrier.h>
 #include <seqread.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 
 // GEMV
-static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
+static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos)
+{
 	for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) {
 		bufferC[pos] += bufferA[i] * bufferB[i];
 	}
@@ -26,13 +27,14 @@ static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
 BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 // main
-int main() {
+int main()
+{
 	unsigned int tasklet_id = me();
 #if PRINT
 	printf("tasklet_id = %u\n", tasklet_id);
 #endif
-	if (tasklet_id == 0){ // Initialize once the cycle counter
-		mem_reset(); // Reset the heap
+	if (tasklet_id == 0) {	// Initialize once the cycle counter
+		mem_reset();	// Reset the heap
 	}
 	// Barrier
 	barrier_wait(&my_barrier);
@@ -42,12 +44,11 @@ int main() {
 	uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows;
 	uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows;
 
-
 	unsigned int nrows = nr_rows;
-	unsigned int rows_per_tasklet; 
+	unsigned int rows_per_tasklet;
 	unsigned int start_row;
 	unsigned int chunks = nrows / (NR_TASKLETS + NR_TASKLETS);
-	unsigned int dbl_chunks = chunks + chunks;                                                                       
+	unsigned int dbl_chunks = chunks + chunks;
 	rows_per_tasklet = dbl_chunks;
 	unsigned int rest_rows = nrows % (NR_TASKLETS + NR_TASKLETS);
 
@@ -57,19 +58,30 @@ int main() {
 		if ((tasklet_id + tasklet_id) >= rest_rows) {
 			unsigned int hlf_rest_rows = rest_rows >> 1;
 			if ((rest_rows & 1) == 1)
-				start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
+				start_row =
+				    (hlf_rest_rows + 1) * (dbl_chunks + 2) +
+				    (tasklet_id - 1 -
+				     hlf_rest_rows) * dbl_chunks;
 			else
-				start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
-		} else 
+				start_row =
+				    (hlf_rest_rows) * (dbl_chunks + 2) +
+				    (tasklet_id - hlf_rest_rows) * dbl_chunks;
+		} else
 			start_row = tasklet_id * (dbl_chunks + 2);
 	} else {
 		start_row = tasklet_id * (dbl_chunks);
 	}
 
 	// Address of the current row in MRAM
-	uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
-	uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T));
-	uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T));
+	uint32_t mram_base_addr_A =
+	    (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
+	uint32_t mram_base_addr_B =
+	    (uint32_t) (DPU_MRAM_HEAP_POINTER +
+			max_rows * n_size_pad * sizeof(T));
+	uint32_t mram_base_addr_C =
+	    (uint32_t) (DPU_MRAM_HEAP_POINTER +
+			max_rows * n_size_pad * sizeof(T) +
+			n_size_pad * sizeof(T) + start_row * sizeof(T));
 	uint32_t mram_temp_addr_A = mram_base_addr_A;
 	uint32_t mram_temp_addr_B = mram_base_addr_B;
 
@@ -82,34 +94,44 @@ int main() {
 	int offset = 0;
 
 	// Iterate over nr_rows
-	for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) {
+	for (unsigned int i = start_row; i < start_row + rows_per_tasklet;
+	     i += 2) {
 
-		mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
+		mram_temp_addr_A =
+		    (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
 		mram_temp_addr_B = mram_base_addr_B;
 
 		cache_C[0] = 0;
 		cache_C[1] = 0;
-		for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){
+		for (unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++) {
 			int n = 0, j;
-			for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T)))
-			{
-
-				mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
-				mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
-				if(offset)
-				{
-
-					for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++)
-					{
+			for (n = 0;
+			     n < (int32_t) (n_size - (BLOCK_SIZE / sizeof(T)));
+			     n += (BLOCK_SIZE / sizeof(T))) {
+
+				mram_read((__mram_ptr void const
+					   *)(mram_temp_addr_A), cache_A,
+					  BLOCK_SIZE);
+				mram_read((__mram_ptr void const
+					   *)(mram_temp_addr_B), cache_B,
+					  BLOCK_SIZE);
+
+				if (offset) {
+
+					for (unsigned int off = 0;
+					     off < (BLOCK_SIZE / sizeof(T)) - 1;
+					     off++) {
 						cache_A[off] = cache_A[off + 1];
 					}
 
-					mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8);
+					mram_read((__mram_ptr void const
+						   *)(mram_temp_addr_A +
+						      BLOCK_SIZE), cache_A_aux,
+						  8);
 
-					cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+					cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+					    cache_A_aux[0];
 				}
-
 				// Compute GEMV
 				gemv(cache_C, cache_A, cache_B, pos);
 
@@ -118,49 +140,51 @@ int main() {
 				mram_temp_addr_B += BLOCK_SIZE;
 			}
 
-			mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
+			mram_read((__mram_ptr void const *)(mram_temp_addr_A),
+				  cache_A, BLOCK_SIZE);
 
-
-			if(offset)
-			{
-				for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++)
-				{
+			if (offset) {
+				for (unsigned int off = 0;
+				     off < (BLOCK_SIZE / sizeof(T)) - 1;
+				     off++) {
 
 					cache_A[off] = cache_A[off + 1];
 				}
 
-				mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8);
+				mram_read((__mram_ptr void const
+					   *)(mram_temp_addr_A + BLOCK_SIZE),
+					  cache_A_aux, 8);
 
-  			       cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+				cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+				    cache_A_aux[0];
 			}
 
+			mram_read((__mram_ptr void const *)(mram_temp_addr_B),
+				  cache_B, BLOCK_SIZE);
 
-			mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
-			for (j = 0; j < (int) (n_size - n); j++) {
+			for (j = 0; j < (int)(n_size - n); j++) {
 				// Compute GEMV
-				if(j >= (int)(BLOCK_SIZE / sizeof(T))){ 
+				if (j >= (int)(BLOCK_SIZE / sizeof(T))) {
 					printf("error\n");
 					break;
 				}
 				cache_C[pos] += cache_A[j] * cache_B[j];
 			}
 
-
-			mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T));
+			mram_temp_addr_A +=
+			    (BLOCK_SIZE -
+			     ((BLOCK_SIZE / sizeof(T)) -
+			      (n_size - n)) * sizeof(T));
 			mram_temp_addr_B = mram_base_addr_B;
 
-			if(mram_temp_addr_A % 8 != 0)
-			{
+			if (mram_temp_addr_A % 8 != 0) {
 				offset = 1;
-			}
-			else
-			{
+			} else {
 				offset = 0;
 			}
 		}
 		// Write cache to current MRAM block
-		mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8);
+		mram_write(cache_C, (__mram_ptr void *)(mram_base_addr_C), 8);
 
 		// Update memory address
 		mram_base_addr_C += 2 * sizeof(T);
diff --git a/MLP/host/app.c b/MLP/host/app.c
index 952cb3f..9c32ab8 100644
--- a/MLP/host/app.c
+++ b/MLP/host/app.c
@@ -8,47 +8,57 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
-#include <dpu.h>
-#include <dpu_log.h>
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
 
+#if ASPECTC
+extern "C" {
+#endif
+
+#include <dpu.h>
+#include <dpu_log.h>
+
 #if ENERGY
 #include <dpu_probe.h>
 #endif
 
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
+#if ASPECTC
+}
+#endif
+
+#include "common.h"
+#include "timer.h"
+#include "params.h"
 
 // Define the DPU Binary path as DPU_BINARY here
 #ifndef DPU_BINARY
 #define DPU_BINARY "./bin/mlp_dpu"
 #endif
 
-static T** A;
-static T* B;
-static T* B_host;
-static T* B_tmp;
-static T* C;
-static T* C_dpu;
+static T **A;
+static T *B;
+static T *B_host;
+static T *B_tmp;
+static T *C;
+static T *C_dpu;
 
 // Create input arrays
-static void init_data(T** A, T* B, T* B_host, unsigned int m_size, unsigned int n_size) {
+static void init_data(T **A, T *B, T *B_host, unsigned int m_size,
+		      unsigned int n_size)
+{
 	for (unsigned int l = 0; l < NUM_LAYERS; l++)
-		for (unsigned int i = 0; i < m_size * n_size; i++){
-			if(i % 100 < 98){
+		for (unsigned int i = 0; i < m_size * n_size; i++) {
+			if (i % 100 < 98) {
 				A[l][i] = 0;
-			}else{
-				A[l][i] = (l+i) % 2;
+			} else {
+				A[l][i] = (l + i) % 2;
 			}
 		}
-	for (unsigned int i = 0; i < n_size; i++){
-		if(i % 50 < 48){
+	for (unsigned int i = 0; i < n_size; i++) {
+		if (i % 50 < 48) {
 			B[i] = 0;
-		}
-		else{
+		} else {
 			B[i] = i % 2;
 		}
 		B_host[i] = B[i];
@@ -56,26 +66,29 @@ static void init_data(T** A, T* B, T* B_host, unsigned int m_size, unsigned int
 }
 
 // Compute output in the host
-static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
+static void mlp_host(T *C, T **A, T *B, unsigned int m_size,
+		     unsigned int n_size)
+{
 
-	for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
-		for (unsigned int m = 0; m < m_size; m++){
+	for (unsigned int nl = 0; nl < NUM_LAYERS; nl++) {
+		for (unsigned int m = 0; m < m_size; m++) {
 			C[m] = 0;
 		}
-		for (unsigned int m = 0; m < m_size; m++){
-			for (unsigned int n = 0; n < n_size; n++){
+		for (unsigned int m = 0; m < m_size; m++) {
+			for (unsigned int n = 0; n < n_size; n++) {
 				C[m] += A[nl][m * n_size + n] * B[n];
 			}
 			C[m] = max(0, C[m]);
 		}
-		for (unsigned int n = 0; n < n_size; n++){
+		for (unsigned int n = 0; n < n_size; n++) {
 			B[n] = C[n];
 		}
 	}
 }
 
 // Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
 	struct Params p = input_params(argc, argv);
 
@@ -97,14 +110,15 @@ int main(int argc, char **argv) {
 	unsigned int n_size = p.n_size;
 
 	// Initialize help data
-	dpu_info = (struct dpu_info_t *) malloc(nr_of_dpus * sizeof(struct dpu_info_t));
-	dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t));
+	dpu_info =
+	    (struct dpu_info_t *)malloc(nr_of_dpus * sizeof(struct dpu_info_t));
+	dpu_arguments_t *input_args =
+	    (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t));
 	uint32_t max_rows_per_dpu = 0;
 	uint32_t n_size_pad = n_size;
-	if(n_size % 2 == 1){
+	if (n_size % 2 == 1) {
 		n_size_pad++;
 	}
-
 	// Timer
 	Timer timer;
 	i = 0;
@@ -118,7 +132,10 @@ int main(int argc, char **argv) {
 			rows_per_dpu++;
 		if (rest_rows > 0) {
 			if (i >= rest_rows)
-				prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks;
+				prev_rows_dpu =
+				    rest_rows * (chunks + 1) + (i -
+								rest_rows) *
+				    chunks;
 			else
 				prev_rows_dpu = i * (chunks + 1);
 		} else {
@@ -127,7 +144,7 @@ int main(int argc, char **argv) {
 
 		// Keep max rows for parallel transfers
 		uint32_t rows_per_dpu_pad = rows_per_dpu;
-		if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
+		if (rows_per_dpu_pad % 2 == 1)	// 4-byte elements
 			rows_per_dpu_pad++;
 		if (rows_per_dpu_pad > max_rows_per_dpu)
 			max_rows_per_dpu = rows_per_dpu_pad;
@@ -142,16 +159,17 @@ int main(int argc, char **argv) {
 		input_args[i].nr_rows = rows_per_dpu;
 	}
 
-	A = (T**)malloc(NUM_LAYERS * sizeof(T*));
-	for(l = 0; l < NUM_LAYERS; l++)
-		A[l] = (T*)malloc( max_rows_per_dpu * nr_of_dpus * n_size_pad * sizeof(T));
-
+	A = (T **) malloc(NUM_LAYERS * sizeof(T *));
+	for (l = 0; l < NUM_LAYERS; l++)
+		A[l] =
+		    (T *) malloc(max_rows_per_dpu * nr_of_dpus * n_size_pad *
+				 sizeof(T));
 
-	B = (T*)malloc(n_size * sizeof(T));
-	B_host = (T*)malloc(n_size * sizeof(T));
-	C = (T*)malloc(m_size * sizeof(T));
-	C_dpu = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
-	B_tmp = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
+	B = (T *) malloc(n_size * sizeof(T));
+	B_host = (T *) malloc(n_size * sizeof(T));
+	C = (T *) malloc(m_size * sizeof(T));
+	C_dpu = (T*)malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
+	B_tmp = (T*)malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
 
 	init_data(A, B, B_host, m_size, n_size);
 
@@ -170,26 +188,36 @@ int main(int argc, char **argv) {
 			input_args[i].max_rows = max_rows_per_dpu;
 			DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
 		}
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
-
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+			    sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
 
 		// Copy input array and vector
 		i = 0;
 		DPU_FOREACH(dpu_set, dpu, i) {
-			DPU_ASSERT(dpu_prepare_xfer(dpu, A[0] + dpu_info[i].prev_rows_dpu * n_size));
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu,
+				    A[0] + dpu_info[i].prev_rows_dpu * n_size));
 		}
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, 0,
+			    max_rows_per_dpu * n_size_pad * sizeof(T),
+			    DPU_XFER_DEFAULT));
 		i = 0;
 		DPU_FOREACH(dpu_set, dpu, i) {
 			DPU_ASSERT(dpu_prepare_xfer(dpu, B));
 		}
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    max_rows_per_dpu * n_size_pad * sizeof(T),
+			    n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
 		if (rep >= p.n_warmup)
 			stop(&timer, 1);
 
 		// Run kernel on DPUs
-		if (rep >= p.n_warmup)
-		{
+		if (rep >= p.n_warmup) {
 			start(&timer, 2, rep - p.n_warmup);
 #if ENERGY
 			DPU_ASSERT(dpu_probe_start(&probe));
@@ -198,31 +226,38 @@ int main(int argc, char **argv) {
 
 		DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
 
-		if (rep >= p.n_warmup)
-		{
+		if (rep >= p.n_warmup) {
 			stop(&timer, 2);
 #if ENERGY
 			DPU_ASSERT(dpu_probe_stop(&probe));
 #endif
 		}
 
-		for(int lay = 1; lay < NUM_LAYERS; lay++){
+		for (int lay = 1; lay < NUM_LAYERS; lay++) {
 			if (rep >= p.n_warmup)
 				start(&timer, 4, rep - p.n_warmup);
 			i = 0;
 
 			// Copy C_dpu
 			DPU_FOREACH(dpu_set, dpu, i) {
-				DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
+				DPU_ASSERT(dpu_prepare_xfer
+					   (dpu, C_dpu + i * max_rows_per_dpu));
 			}
-			DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
+			DPU_ASSERT(dpu_push_xfer
+				   (dpu_set, DPU_XFER_FROM_DPU,
+				    DPU_MRAM_HEAP_POINTER_NAME,
+				    max_rows_per_dpu * n_size_pad * sizeof(T) +
+				    n_size_pad * sizeof(T),
+				    max_rows_per_dpu * sizeof(T),
+				    DPU_XFER_DEFAULT));
 
 			// B = C
 			unsigned int n, j;
 			i = 0;
 			for (n = 0; n < nr_of_dpus; n++) {
 				for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
-					B_tmp[i] = C_dpu[n * max_rows_per_dpu + j];
+					B_tmp[i] =
+					    C_dpu[n * max_rows_per_dpu + j];
 					i++;
 				}
 			}
@@ -230,20 +265,31 @@ int main(int argc, char **argv) {
 			DPU_FOREACH(dpu_set, dpu, i) {
 				DPU_ASSERT(dpu_prepare_xfer(dpu, B_tmp));
 			}
-			DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+			DPU_ASSERT(dpu_push_xfer
+				   (dpu_set, DPU_XFER_TO_DPU,
+				    DPU_MRAM_HEAP_POINTER_NAME,
+				    max_rows_per_dpu * n_size_pad * sizeof(T),
+				    n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
 
 			// Copy next matrix of weights
 			i = 0;
 			DPU_FOREACH(dpu_set, dpu, i) {
-				DPU_ASSERT(dpu_prepare_xfer(dpu, A[lay] + dpu_info[i].prev_rows_dpu * n_size));
+				DPU_ASSERT(dpu_prepare_xfer
+					   (dpu,
+					    A[lay] +
+					    dpu_info[i].prev_rows_dpu *
+					    n_size));
 			}
-			DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+			DPU_ASSERT(dpu_push_xfer
+				   (dpu_set, DPU_XFER_TO_DPU,
+				    DPU_MRAM_HEAP_POINTER_NAME, 0,
+				    max_rows_per_dpu * n_size_pad * sizeof(T),
+				    DPU_XFER_DEFAULT));
 
-			if(rep >= p.n_warmup)
+			if (rep >= p.n_warmup)
 				stop(&timer, 4);
 
-			if (rep >= p.n_warmup)
-			{
+			if (rep >= p.n_warmup) {
 				start(&timer, 2, rep - p.n_warmup);
 #if ENERGY
 				DPU_ASSERT(dpu_probe_start(&probe));
@@ -252,8 +298,7 @@ int main(int argc, char **argv) {
 
 			DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
 
-			if (rep >= p.n_warmup)
-			{
+			if (rep >= p.n_warmup) {
 				stop(&timer, 2);
 #if ENERGY
 				DPU_ASSERT(dpu_probe_stop(&probe));
@@ -273,37 +318,31 @@ int main(int argc, char **argv) {
 			start(&timer, 3, rep - p.n_warmup);
 		i = 0;
 		DPU_FOREACH(dpu_set, dpu, i) {
-			DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, C_dpu + i * max_rows_per_dpu));
 		}
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
-		if(rep >= p.n_warmup)
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_FROM_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    max_rows_per_dpu * n_size_pad * sizeof(T) +
+			    n_size_pad * sizeof(T),
+			    max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
+		if (rep >= p.n_warmup)
 			stop(&timer, 3);
 	}
 
 #if ENERGY
 	double acc_energy, avg_energy, acc_time, avg_time;
-	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+	DPU_ASSERT(dpu_probe_get
+		   (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
 #endif
 
-	// Print timing results
-	printf("CPU Version Time (ms): ");
-	print(&timer, 0, 1);
-	printf("CPU-DPU Time (ms): ");
-	print(&timer, 1, p.n_reps);
-	printf("DPU Kernel Time (ms): ");
-	print(&timer, 2, p.n_reps);
-	printf("Inter-DPU Time (ms): ");
-	print(&timer, 4, p.n_reps);
-	printf("DPU-CPU Time (ms): ");
-	print(&timer, 3, p.n_reps);
-
 #if ENERGY
 	printf("Energy (J): %f J\t", avg_energy);
 #endif
-	printf("\n\n");
 
 	// Check output
 	bool status = true;
@@ -311,23 +350,26 @@ int main(int argc, char **argv) {
 	i = 0;
 	for (n = 0; n < nr_of_dpus; n++) {
 		for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
-			if(C[i] != C_dpu[n * max_rows_per_dpu + j]) {
+			if (C[i] != C_dpu[n * max_rows_per_dpu + j]) {
 				status = false;
 #if PRINT
-				printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
+				printf("%d: %d -- %d\n", i, C[i],
+				       C_dpu[n * max_rows_per_dpu + j]);
 #endif
 			}
 			i++;
 		}
 	}
 	if (status) {
-		printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+		printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+		       "] Outputs are equal\n");
 	} else {
-		printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
+		printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+		       "] Outputs differ!\n");
 	}
 
 	// Deallocation
-	for(i = 0; i < NUM_LAYERS; i++)
+	for (i = 0; i < NUM_LAYERS; i++)
 		free(A[i]);
 	free(A);
 	free(B);
diff --git a/MLP/support/common.h b/MLP/include/common.h
index 53b2f1c..4b5031b 100755..100644
--- a/MLP/support/common.h
+++ b/MLP/include/common.h
@@ -3,21 +3,21 @@
 
 // Structures used by both the host and the dpu to communicate information 
 typedef struct {
-    uint32_t n_size;
-    uint32_t n_size_pad;
-    uint32_t nr_rows;
-    uint32_t max_rows;
+	uint32_t n_size;
+	uint32_t n_size_pad;
+	uint32_t nr_rows;
+	uint32_t max_rows;
 } dpu_arguments_t;
 
 // Specific information for each DPU
 struct dpu_info_t {
-  uint32_t rows_per_dpu;
-  uint32_t rows_per_dpu_pad;
-  uint32_t prev_rows_dpu;
+	uint32_t rows_per_dpu;
+	uint32_t rows_per_dpu_pad;
+	uint32_t prev_rows_dpu;
 };
 struct dpu_info_t *dpu_info;
 
-#define NUM_LAYERS 3 
+#define NUM_LAYERS 3
 #define max(x, y) (x > y ? x : y)
 #define min(x, y) (x < y ? x : y)
 
diff --git a/MLP/include/dfatool_host.ah b/MLP/include/dfatool_host.ah
new file mode 100644
index 0000000..6ea4a18
--- /dev/null
+++ b/MLP/include/dfatool_host.ah
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+
+	unsigned int n_rows, n_cols;
+	unsigned int element_size;
+
+	virtual int getKernel() { return 1; }
+
+	DfatoolHostTiming() {
+		element_size = sizeof(uint32_t);
+	}
+
+	advice call("% input_params(...)"): after() {
+		Params* p = tjp->result();
+		n_rows = p->m_size;
+		n_cols = p->n_size;
+		printf("[>>] MLP | n_dpus=%u n_rows=%u n_cols=%u\n", NR_DPUS, n_rows, n_cols);
+	}
+
+	advice call("% start(...)") : after() {
+		if (*(tjp->arg<1>()) == 1) {
+			printf("[--] MLP | n_dpus=%u n_rows=%u n_cols=%u\n", NR_DPUS, n_rows, n_cols);
+		}
+	}
+
+	advice execution("% main(...)") : after() {
+		printf("[<<] MLP | n_dpus=%u n_rows=%u n_cols=%u\n", NR_DPUS, n_rows, n_cols);
+	}
+};
diff --git a/MLP/include/params.h b/MLP/include/params.h
new file mode 100644
index 0000000..4bfc2fc
--- /dev/null
+++ b/MLP/include/params.h
@@ -0,0 +1,73 @@
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+
+// Command-line parameters of the host application
+typedef struct Params {
+	unsigned int m_size;	// -m: number of matrix rows
+	unsigned int n_size;	// -n: number of matrix columns
+	unsigned int n_warmup;	// -w: untimed warmup iterations
+	unsigned int n_reps;	// -e: timed repetitions
+} Params;
+
+// Print the command-line help to stderr.
+// The defaults listed here must match the values set in input_params().
+static void usage()
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -m <I>    m_size (default=163840 elements)"
+		"\n    -n <I>    n_size (default=4096 elements)" "\n");
+}
+
+// Parse argv into a Params struct; options that are not given keep the
+// defaults assigned below. -h and unrecognized options print the usage
+// text and terminate the program.
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.m_size = 163840;
+	p.n_size = 4096;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+
+	int opt;
+	while ((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'm':
+			p.m_size = atoi(optarg);
+			break;
+		case 'n':
+			p.n_size = atoi(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			// NOTE(review): exits with status 0 even though the
+			// option was invalid; kept as-is for compatibility.
+			exit(0);
+		}
+	}
+	assert(NR_DPUS > 0 && "Invalid # of dpus!");
+
+	return p;
+}
+#endif
diff --git a/MLP/include/timer.h b/MLP/include/timer.h
new file mode 100644
index 0000000..bff638d
--- /dev/null
+++ b/MLP/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#define N_TIMERS 5
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/MLP/support/params.h b/MLP/support/params.h
deleted file mode 100644
index f9e790e..0000000
--- a/MLP/support/params.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-
-typedef struct Params {
-    unsigned int  m_size;
-    unsigned int  n_size;
-    unsigned int  n_warmup;
-    unsigned int  n_reps;
-}Params;
-
-static void usage() {
-    fprintf(stderr,
-            "\nUsage:  ./program [options]"
-            "\n"
-            "\nGeneral options:"
-            "\n    -h        help"
-            "\n    -w <W>    # of untimed warmup iterations (default=1)"
-            "\n    -e <E>    # of timed repetition iterations (default=3)"
-            "\n"
-            "\nBenchmark-specific options:"
-            "\n    -m <I>    m_size (default=2048 elements)"
-            "\n    -n <I>    n_size (default=2048 elements)"
-            "\n");
-}
-
-struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.m_size        = 163840;
-    p.n_size        = 4096;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-
-    int opt;
-    while((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
-        switch(opt) {
-            case 'h':
-                usage();
-                exit(0);
-                break;
-            case 'm': p.m_size        = atoi(optarg); break;
-            case 'n': p.n_size        = atoi(optarg); break;
-            case 'w': p.n_warmup      = atoi(optarg); break;
-            case 'e': p.n_reps        = atoi(optarg); break;
-            default:
-                      fprintf(stderr, "\nUnrecognized option!\n");
-                      usage();
-                      exit(0);
-        }
-    }
-    assert(NR_DPUS > 0 && "Invalid # of dpus!");
-
-    return p;
-}
-#endif
diff --git a/MLP/support/timer.h b/MLP/support/timer.h
deleted file mode 100755
index 886380a..0000000
--- a/MLP/support/timer.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[5];
-    struct timeval stopTime[5];
-    double         time[5];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-    //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-    //                  (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
- 
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
diff --git a/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh b/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh
new file mode 100755
index 0000000..ee5ee99
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+./make-size.sh 0
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1
+	bin/host_code -a $numa_in -b $numa_out -c $numa_cpu -w 0 -e 20 -x 0 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i ${input_size}
+	return $?
+}
+
+export -f run_benchmark_nmc
+
+# The benchmark allocates 3 * 64 * nr_ranks * 8B * input_size (one array for input, one array for output).
+# With 1048576 elements (8 MiB per DPU), this gives a maximum allocation of 60 GiB, which will fit comfortably into system memory (128 GiB).
+# With 2097152 elements (16 MiB per DPU), we may encounter OoM conditions, since the UPMEM SDK also allocates some memory.
+
+for sdk in 2025.1.0-orig 2025.1.0-notransform; do
+	source /opt/upmem/transformation-benchmarks/${sdk}/upmem_env.sh
+	fn=log/$(hostname)/upvec-${sdk}
+
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \
+		::: numa_rank any \
+		::: numa_in 1 \
+		::: numa_out 1 \
+		::: numa_cpu 1 \
+		::: nr_ranks $(seq 1 40) \
+		::: input_size 1048576 \
+	>> ${fn}.txt
+done
diff --git a/Microbenchmarks/CPU-DPU/nodmc25-alloc.sh b/Microbenchmarks/CPU-DPU/nodmc25-alloc.sh
new file mode 100755
index 0000000..869ada3
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/nodmc25-alloc.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+fn=log/$(hostname)/nodmc25-alloc
+
+source /opt/upmem/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	./make-size.sh ${size}
+	n_nops=$((size * 256))
+	if make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\' NUMA=1; then
+		for l in $(seq 1 20); do
+			bin/host_code -c ${numa_cpu} -w 1 -e 0 -x 1 -i 65536 -N $n_nops -I $(size -A bin/dpu_size | awk '($1 == ".text") {print $2/8}')
+		done
+	fi
+	return $?
+}
+
+export -f run_benchmark_nmc
+
+(
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+	run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_cpu={numa_cpu} size={size} \
+	::: i $(seq 1 5) \
+	::: numa_rank -1 \
+	::: numa_cpu 0 1 \
+	::: nr_ranks $(seq 1 40) \
+	::: size $(seq 0 15)
+
+) >> ${fn}.txt
diff --git a/Microbenchmarks/CPU-DPU/nodmc25-transfer.sh b/Microbenchmarks/CPU-DPU/nodmc25-transfer.sh
new file mode 100755
index 0000000..33bb12f
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/nodmc25-transfer.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+fn=log/$(hostname)/nodmc25-transfer
+
+source /opt/upmem/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+
+./make-size.sh 0
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1
+	bin/host_code -a $numa_in -b $numa_out -c $numa_cpu -w 0 -e 5 -x 0 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i ${input_size}
+	return $?
+}
+
+export -f run_benchmark_nmc
+
+# The benchmark allocates 3 * 64 * nr_ranks * 8B * input_size (one array for input, one array for output).
+# With 1048576 elements (8 MiB per DPU), this gives a maximum allocation of 60 GiB, which will fit comfortably into system memory (128 GiB).
+# With 2097152 elements (16 MiB per DPU), we may encounter OoM conditions, since the UPMEM SDK also allocates some memory.
+
+(
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+	run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \
+	::: i $(seq 1 10) \
+	::: numa_rank -1 \
+	::: numa_in 0 1 \
+	::: numa_out 0 1 \
+	::: numa_cpu 0 1 \
+	::: nr_ranks $(seq 1 40) \
+	::: input_size 1 1048576
+
+) >> ${fn}.txt
diff --git a/Microbenchmarks/CPU-DPU/util/upvec-read.pdf b/Microbenchmarks/CPU-DPU/util/upvec-read.pdf
new file mode 100644
index 0000000..63af6cc
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/util/upvec-read.pdf
diff --git a/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance b/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance
new file mode 100755
index 0000000..b175b8d
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+data=$(mktemp -d)
+
+echo
+echo SDK with transformation
+echo
+
+DFATOOL_ULS_MIN_BOUND=0 DFATOOL_PARAM_RELEVANCE_THRESHOLD=0.9 DFATOOL_ULS_FUNCTIONS=roofline,linear \
+analyze-log.py \
+--filter-param='n_elements_per_dpu=1048576' \
+--ignore-param='n_dpus,n_nops,numa_node_cpu,numa_node_in,numa_node_out,numa_node_rank,n_elements' \
+--filter-observation='NMC-transfer:throughput_dram_mram_Bps,NMC-transfer:throughput_mram_dram_Bps' \
+--normalize-nfp='writeThroughputGBps=throughput_dram_mram_Bps=/1e9;readThroughputGBps=throughput_mram_dram_Bps=/1e9' \
+--export-pgf-unparam ${data}/orig- \
+--cross-validate=kfold:10 --progress \
+--show-model=param --show-model-error --show-model-precision=6 \
+log/tinos/upvec-2025.1.0-orig.txt
+
+echo
+echo SDK without transformation
+echo
+
+DFATOOL_ULS_MIN_BOUND=0 DFATOOL_PARAM_RELEVANCE_THRESHOLD=0.9 DFATOOL_ULS_FUNCTIONS=roofline,linear \
+analyze-log.py \
+--filter-param='n_elements_per_dpu=1048576' \
+--ignore-param='n_dpus,n_nops,numa_node_cpu,numa_node_in,numa_node_out,numa_node_rank,n_elements' \
+--filter-observation='NMC-transfer:throughput_dram_mram_Bps,NMC-transfer:throughput_mram_dram_Bps' \
+--normalize-nfp='writeThroughputGBps=throughput_dram_mram_Bps=/1e9;readThroughputGBps=throughput_mram_dram_Bps=/1e9' \
+--export-pgf-unparam ${data}/notransform- \
+--cross-validate=kfold:10 --progress \
+--show-model=param --show-model-error --show-model-precision=6 \
+log/tinos/upvec-2025.1.0-notransform.txt
+
+for op in read write; do
+	cp util/upvec-${op}.tex ${data}
+	lualatex -output-directory ${data} ${data}/upvec-${op}
+	cp ${data}/upvec-${op}.pdf util
+done
+
+rm -rf ${data}
diff --git a/Microbenchmarks/CPU-DPU/util/upvec-write.tex b/Microbenchmarks/CPU-DPU/util/upvec-write.tex
new file mode 100644
index 0000000..f6d7bf5
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/util/upvec-write.tex
@@ -0,0 +1,38 @@
+\documentclass{standalone}
+
+\usepackage[T1]{fontenc}
+\usepackage[default]{opensans}
+\usepackage[scaled]{beramono}
+
+\usepackage{tikz}
+\usepackage{pgfplots}
+\pgfplotsset{compat=1.18}
+\usepgfplotslibrary{statistics}
+
+\begin{document}
+	\begin{tikzpicture}
+		\begin{axis}[
+				ylabel={write [GB/s]},
+				xlabel={\# Ranks},
+				x label style={font=\footnotesize, yshift=2mm},
+				y label style={font=\footnotesize},
+				tick label style={/pgf/number format/assume math mode=true},
+				title={Benchmark Data},
+				title style={yshift=-3mm},
+				legend style={font=\footnotesize, legend columns=-1, column sep=1ex},
+				legend pos=south east,
+				legend entries={upstream,,{no transformation},},
+				reverse legend,
+				ymin=0,ymax=14,
+				xmin=0,xmax=41,
+				width=90mm,height=45mm
+			]
+			\addplot[thick, color=red, domain=1:40] {5.042768 + 0.258673 * min(x, 31.536102)};
+			\addplot[color=red,only marks,mark=*,mark size=0.9,opacity=.05]
+				table[x=n_ranks, y=value] {orig-NMC-transfer-writeThroughputGBps.txt};
+			\addplot[thick, color=blue, domain=1:40] {5.049962 + 0.308594 * min(x, 25.657012)};
+			\addplot[color=blue,only marks,mark=*,mark size=0.9,opacity=.05]
+				table[x=n_ranks, y=value] {notransform-NMC-transfer-writeThroughputGBps.txt};
+		\end{axis}
+	\end{tikzpicture}
+\end{document}
diff --git a/NW/Makefile b/NW/Makefile
index 68f495a..10276b1 100644
--- a/NW/Makefile
+++ b/NW/Makefile
@@ -1,46 +1,56 @@
-DPU_DIR := dpu
-HOST_DIR := host
-BUILDDIR ?= bin
 NR_TASKLETS ?= 13
-BL ?= 1024 
-BL_IN ?= 4 
-NR_DPUS ?= 1 
+BL ?= 1024
+BL_IN ?= 4
+NR_DPUS ?= 1
 ENERGY ?= 0
 
-define conf_filename
-	${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3).conf
-endef
-CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL})
+HOST_SOURCES := $(wildcard host/*.c)
+DPU_SOURCES := $(wildcard dpu/*.c)
 
-HOST_TARGET := ${BUILDDIR}/nw_host
-DPU_TARGET := ${BUILDDIR}/nw_dpu
+aspectc ?= 0
+aspectc_timing ?= 0
 
-COMMON_INCLUDES := support
-HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
-DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)
+HOST_CC := ${CC}
 
-.PHONY: all clean test
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL}
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY} -DASPECTC=${aspectc}
+DPU_FLAGS := ${COMMON_FLAGS} -O2 -DBL_IN=${BL_IN}
+
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
 
-__dirs := $(shell mkdir -p ${BUILDDIR})
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
 
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DENERGY=${ENERGY}
-DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -DBL_IN=${BL_IN}
+QUIET = @
 
-all: ${HOST_TARGET} ${DPU_TARGET}
+ifeq (${verbose}, 1)
+	QUIET =
+endif
 
-${CONF}:
-	$(RM) $(call conf_filename,*,*)
-	touch ${CONF}
+all: bin/nw_host bin/nw_dpu
 
-${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
-	$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+bin:
+	${QUIET}mkdir -p bin
 
-${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
-	dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
+bin/nw_host: ${HOST_SOURCES} include bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
+
+bin/nw_dpu: ${DPU_SOURCES} include bin
+	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
 
 clean:
-	$(RM) -r $(BUILDDIR)
+	$(RM) -r bin
 
 test: all
-	./${HOST_TARGET}
+	bin/nw_host
+
+.PHONY: all clean test
diff --git a/NW/benchmark-scripts/ccmcc25.sh b/NW/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..80df155
--- /dev/null
+++ b/NW/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Generates huge logfiles and crashes frequently. Probably not helpful.
+exit 1
+
+mkdir -p log/$(hostname)
+fn=log/$(hostname)/ccmcc25
+
+source /opt/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=32 BL_IN=2 \
+	dfatool_timing=0 aspectc=1 aspectc_timing=1
+	bin/nw_host -w 0 -e 50 -n ${nr_rows}
+}
+
+export -f run_benchmark_nmc
+
+echo "prim-benchmarks  NW  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} nr_rows={nr_rows} \
+	::: numa_rank any \
+	::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+	::: nr_rows 32768 65536 131072 \
+>> ${fn}.txt
diff --git a/NW/dpu/task.c b/NW/dpu/task.c
index c022f70..fab163a 100644
--- a/NW/dpu/task.c
+++ b/NW/dpu/task.c
@@ -10,7 +10,7 @@
 #include <perfcounter.h>
 #include <barrier.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 
diff --git a/NW/host/app.c b/NW/host/app.c
index 0e899ec..9de2918 100644
--- a/NW/host/app.c
+++ b/NW/host/app.c
@@ -7,20 +7,30 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
+
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
-#include <unistd.h>
-#include <getopt.h>
-#include <assert.h>
-
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
 
 #if ENERGY
 #include <dpu_probe.h>
 #endif
 
+#if ASPECTC
+}
+#endif
+
+#include <unistd.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include "common.h"
+#include "timer.h"
+#include "params.h"
+
 // Define the DPU Binary path as DPU_BINARY here
 #ifndef DPU_BINARY
 #define DPU_BINARY "./bin/nw_dpu"
@@ -184,7 +194,7 @@ int main(int argc, char **argv) {
 
     struct Params p = input_params(argc, argv);
     struct dpu_set_t dpu_set, dpu;
-    uint32_t nr_of_dpus, max_dpus;
+    uint32_t nr_of_dpus, nr_of_ranks, max_dpus;
 
 #if ENERGY
     struct dpu_probe_t probe;
@@ -195,6 +205,7 @@ int main(int argc, char **argv) {
     DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
     DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
     DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
     printf("Allocated %d DPU(s)\n", nr_of_dpus);
     printf("Allocated %d TASKLET(s) per DPU\n", NR_TASKLETS);
 #if DYNAMIC
@@ -822,28 +833,6 @@ int main(int argc, char **argv) {
             stop(&timer, 1);
 
     }
-
-    // Print timing results
-    printf("CPU version ");
-    print(&timer, 0, p.n_reps);
-    printf("CPU-DPU ");
-    print(&timer, 2, p.n_reps);
-    printf("DPU Kernel ");
-    print(&timer, 3, p.n_reps);
-    printf("Inter-DPU ");
-    print(&timer, 1, p.n_reps);
-    printf("DPU-CPU ");
-    print(&timer, 4, p.n_reps);
-    printf("\n");
-    printf("Longest Diagonal CPU-DPU ");
-    print(&long_diagonal_timer, 2, p.n_reps);
-    printf("Longest Diagonal DPU Kernel ");
-    print(&long_diagonal_timer, 3, p.n_reps);
-    printf("Longest Diagonal Inter-DPU ");
-    print(&long_diagonal_timer, 1, p.n_reps);
-    printf("Longest Diagonal DPU-CPU ");
-    print(&long_diagonal_timer, 4, p.n_reps);
-    printf("\n");
     
 #if ENERGY
     printf("DPU Energy (J): %f \t ", tavg_energy / p.n_reps);
diff --git a/NW/support/common.h b/NW/include/common.h
index 69069e7..69069e7 100755..100644
--- a/NW/support/common.h
+++ b/NW/include/common.h
diff --git a/NW/include/dfatool_host.ah b/NW/include/dfatool_host.ah
new file mode 100644
index 0000000..d45aef3
--- /dev/null
+++ b/NW/include/dfatool_host.ah
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+
+	unsigned long n_elements;
+	unsigned int element_size;
+
+	virtual int getKernel() { return 1; }
+
+	DfatoolHostTiming() {
+		element_size = sizeof(uint32_t);
+	}
+
+	advice call("% input_params(...)"): after() {
+		Params* p = tjp->result();
+		n_elements = p->max_rows;
+		printf("[>>] NW | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements);
+	}
+
+	advice call("% srand(...)") : after() {
+		printf("[--] NW | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements);
+	}
+
+	advice execution("% main(...)") : after() {
+		printf("[<<] NW | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements);
+	}
+};
diff --git a/NW/support/params.h b/NW/include/params.h
index 8874248..8874248 100644
--- a/NW/support/params.h
+++ b/NW/include/params.h
diff --git a/NW/support/timer.h b/NW/include/timer.h
index efaefcd..2fc798f 100755..100644
--- a/NW/support/timer.h
+++ b/NW/include/timer.h
@@ -1,59 +1,59 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[5];
-    struct timeval stopTime[5];
-    double         time[5];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); 
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by:    IMPACT Research Group
+ *                  University of Cordoba and University of Illinois
+ *                  http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the 
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ *      > Redistributions of source code must retain the above copyright notice,
+ *        this list of conditions and the following disclaimers.
+ *      > Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimers in the
+ *        documentation and/or other materials provided with the distribution.
+ *      > Neither the names of IMPACT Research Group, University of Cordoba, 
+ *        University of Illinois nor the names of its contributors may be used 
+ *        to endorse or promote products derived from this Software without 
+ *        specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer{
+
+    struct timeval startTime[5];
+    struct timeval stopTime[5];
+    double         time[5];
+
+}Timer;
+/* Stamp the start time of timer slot i; rep == 0 also clears the slot's accumulated time. static inline: this header defines the function, so external linkage would clash if two sources include it. */
+static inline void start(Timer *timer, int i, int rep) {
+    if(rep == 0) {
+        timer->time[i] = 0.0;
+    }
+    gettimeofday(&timer->startTime[i], NULL);
+}
+/* Stamp the stop time of timer slot i and add the elapsed microseconds since start() to time[i]. static inline: defined in a header, so it must not have external linkage. */
+static inline void stop(Timer *timer, int i) {
+    gettimeofday(&timer->stopTime[i], NULL);
+    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
+                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+}
+/* Print timer slot i in milliseconds, averaged over REP repetitions (time[i] is kept in microseconds). static inline: defined in a header, so it must not have external linkage. */
+static inline void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
diff --git a/README.md b/README.md
index 9c10fd9..4f1c8c4 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 This is an improved and extended version of the PrIM benchmark suite originally developed for **UPMEM PIM** (near-memory computing / processing-in-memory) evaluation by Gómez-Luna et al.
 The extension adds
 
-* support for **NUMA**-aware **HBM** (high-bandwidth memory) and **DRAM** benchmarks,
+* support for **NUMA**-aware **UPMEM**, **CXL** (Compute eXpress Link), **HBM** (high-bandwidth memory), and **DRAM** benchmarks,
 * a new **COUNT** benchmark, and
 * numerous bugfixes.
 
@@ -61,6 +61,7 @@ Up-to-date source code is available on the following mirrors:
 
 The following benchmark adjustments have been made:
 
+* A (AspectC++ support, including DFA trace generation)
 * B (significant bugfixes)
 * D (dfatool-compatible output of benchmark metrics)
 * E (efficiency improvements; may affect input/output format)
@@ -69,39 +70,41 @@ The following benchmark adjustments have been made:
 
 CPU and DPU benchmarks in this repository have been adjusted as follows:
 
-* BFS: DL
-* BS: DLN
+* BFS: ADL
+* BS: ADLN
 * COUNT: DLN (new benchmark, based on SEL)
-* GEMV: DLN
-* HST-L: D
+* GEMV: ADLN
+* HST-L: AD
 * HST-S: DLN
-* MLP: –
-* NW: –
+* MLP: A
+* NW: A
 * RED: DLN
 * SCAN-SSA: D
-* SCAN-RSS: DLN
+* SCAN-RSS: ADLN
 * SEL: DLN
-* SpMV: DL
-* TRNS: BDLN
-* TS: DLN
+* SpMV: ADL
+* TRNS: ABDLN
+* TS: ADLN
 * UNI: DL
-* VA: DLN
+* VA: ADLN
 
 GPU versions are un-changed.
 
 The original README follows.
+It contains minor adjustments to the directory structure;
+benchmark how-tos that do not apply to this fork have been removed.
 
 ---
 
 # PrIM (Processing-In-Memory Benchmarks)
 
-PrIM is the first benchmark suite for a real-world processing-in-memory (PIM) architecture. 
-PrIM is developed to evaluate, analyze, and characterize the first publicly-available real-world processing-in-memory (PIM) architecture, the [UPMEM](https://www.upmem.com/) PIM architecture. 
+PrIM is the first benchmark suite for a real-world processing-in-memory (PIM) architecture.
+PrIM is developed to evaluate, analyze, and characterize the first publicly-available real-world processing-in-memory (PIM) architecture, the [UPMEM](https://www.upmem.com/) PIM architecture.
 The UPMEM PIM architecture combines traditional DRAM memory arrays with general-purpose in-order cores, called DRAM Processing Units (DPUs), integrated in the same chip.
 
-PrIM provides a common set of workloads to evaluate the UPMEM PIM architecture with and can be useful for programming, architecture and system researchers all alike to improve multiple aspects of future PIM hardware and software. 
-The workloads have different characteristics, exhibiting heterogeneity in their memory access patterns, operations and data types, and communication patterns. 
-This repository also contains baseline CPU and GPU implementations of PrIM benchmarks for comparison purposes. 
+PrIM provides a common set of workloads to evaluate the UPMEM PIM architecture with and can be useful for programming, architecture and system researchers all alike to improve multiple aspects of future PIM hardware and software.
+The workloads have different characteristics, exhibiting heterogeneity in their memory access patterns, operations and data types, and communication patterns.
+This repository also contains baseline CPU and GPU implementations of PrIM benchmarks for comparison purposes.
 
 PrIM also includes a set of microbenchmarks can be used to assess various architecture limits such as compute throughput and memory bandwidth.
 
@@ -155,18 +158,15 @@ Bibtex entries for citation:
 
 ## Repository Structure and Installation
 
-We point out next the repository structure and some important folders and files. 
-All benchmark folders have similar structure to the one shown for BFS. 
-The microbenchmark folder contains eight different microbenchmarks, each with similar folder structure. 
+We point out next the repository structure and some important folders and files.
+All benchmark folders have similar structure to the one shown for BFS.
+The microbenchmark folder contains eight different microbenchmarks, each with similar folder structure.
 The repository also includes `run_*.py` scripts to run strong and weak scaling experiments for PrIM benchmarks.
 
 ```
 .
 +-- LICENSE
 +-- README.md
-+-- run_strong_full.py
-+-- run_strong_rank.py
-+-- run_weak.py
 +-- BFS/
 |   +-- baselines/
 |	|	+-- cpu/
@@ -174,7 +174,7 @@ The repository also includes `run_*.py` scripts to run strong and weak scaling e
 |   +-- data/
 |   +-- dpu/
 |   +-- host/
-|   +-- support/
+|   +-- include/
 |   +-- Makefile
 +-- BS/
 |   +-- ...
@@ -219,60 +219,12 @@ The repository also includes `run_*.py` scripts to run strong and weak scaling e
 
 ### Prerequisites
 
-Running PrIM requires installing the [UPMEM SDK](https://sdk.upmem.com). 
+Running PrIM requires installing the [UPMEM SDK](https://sdk.upmem.com).
 PrIM benchmarks and microbenchmarks are designed to run on a server with real UPMEM modules, but they also run on the functional simulator include in the UPMEM SDK.
 
-### Getting Started
-
-Clone the repository:
-```sh
-git clone https://github.com/CMU-SAFARI/prim-benchmarks
-
-cd prim-benchmarks
-./set-root-dir.sh
-```
-
 ## Running PrIM
 
-### PrIM Benchmarks 
-
-The repository includes scripts to run weak scaling and strong scaling experiments:
-* `run_weak.py`: Weak scaling experiments for 16 PrIM benchmarks using 1 rank of UPMEM DPUs (1 to 64 DPUs).
-* `run_strong_rank.py`: Strong scaling experiments for 16 PrIM benchmarks using 1 rank of UPMEM DPUs (1 to 64 DPUs).
-* `run_strong_full.py`: Strong scaling experiments for 16 PrIM benchmarks using 4 to 32 ranks of UPMEM DPUs (256 to 2048 DPUs).
-
-To run weak scaling experiments for BFS or SpMV, update the paths to input files in `run_weak.py`. 
-The scripts save the results in a folder called `profile` inside each benchmark folder.
-
-```sh
-cd prim-benchmarks
-
-# Weak scaling experiments for BFS
-python3 run_weak.py BFS
-```
-
-Inside each PrIM benchmark folder, one can compile and run each benchmark with different input parameters. 
-Choose a benchmark and compile. Every Makefile accepts several input parameters:
-```sh
-cd BFS
-
-# Compile BFS for 32 DPUs and 16 tasklets (i.e., software threads) per DPU
-NR_DPUS=32 NR_TASKLETS=16 make all
-```
-
-For help instructions:
-```sh
-./bin/host_code -h
-```
-
-Run the benchmark:
-```sh
-./bin/host_code -v 0 -f data/loc-gowalla_edges.txt
-```
-
-Several benchmark folders (HST-S, HST-L, RED, SCAN-SSA, SCAN-RSS) contain a script (`run.sh`) that compiles and runs the benchmark for the experiments in the appendix of the [paper](https://arxiv.org/pdf/2105.03814.pdf).
-
-### Microbenchmarks 
+### Microbenchmarks
 
 Each microbenchmark folder contains a script (`run.sh`) that compiles and runs the microbenchmark for the experiments in the [paper](https://arxiv.org/pdf/2105.03814.pdf):
 
@@ -284,10 +236,9 @@ cd Microbenchmarks/Arithmetic-Throughput
 
 ### Getting Help
 
-If you have any suggestions for improvement, please contact el1goluj at gmail dot com. 
+If you have any suggestions for improvement, please contact el1goluj at gmail dot com.
 If you find any bugs or have further questions or requests, please post an issue at the [issue page](https://github.com/CMU-SAFARI/prim-benchmarks/issues).
 
+## Acknowledgments
 
-## Acknowledgments 
-
-We thank UPMEM’s Fabrice Devaux, Rémy Cimadomo, Romaric Jodin, and Vincent Palatin for their valuable support. We acknowledge the support of SAFARI Research Group’s industrial partners, especially ASML, Facebook, Google, Huawei, Intel, Microsoft, VMware, and the Semiconductor Research Corporation. Izzat El Hajj acknowledges the support of the University Research Board of the American University of Beirut (URB-AUB-103951-25960). 
+We thank UPMEM’s Fabrice Devaux, Rémy Cimadomo, Romaric Jodin, and Vincent Palatin for their valuable support. We acknowledge the support of SAFARI Research Group’s industrial partners, especially ASML, Facebook, Google, Huawei, Intel, Microsoft, VMware, and the Semiconductor Research Corporation. Izzat El Hajj acknowledges the support of the University Research Board of the American University of Beirut (URB-AUB-103951-25960).
diff --git a/RED/Makefile b/RED/Makefile
index f20e1f7..c65df94 100644
--- a/RED/Makefile
+++ b/RED/Makefile
@@ -8,17 +8,34 @@ WITH_ALLOC_OVERHEAD ?= 0
 WITH_LOAD_OVERHEAD ?= 0
 WITH_FREE_OVERHEAD ?= 0
 
-COMMON_INCLUDES := support
 HOST_SOURCES := $(wildcard host/*.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY} -DPERF=${PERF} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY} -DPERF=${PERF} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing}
 DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE} -DPERF=${PERF}
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1)
 	QUIET =
 endif
 
@@ -27,10 +44,12 @@ all: bin/host_code bin/dpu_code
 bin:
 	${QUIET}mkdir -p bin
 
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+bin/host_code: ${HOST_SOURCES} include bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
 
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+bin/dpu_code: ${DPU_SOURCES} include bin
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
 
 clean:
diff --git a/RED/benchmark-scripts/ccmcc25-sim.sh b/RED/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..bc97344
--- /dev/null
+++ b/RED/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+# Build and run one configuration; every argument is a name=value pair that `local` turns into a function-local variable.
+run_benchmark_nmc() {
+	local "$@"
+	set -e  # stop at the first failing command so a broken build is never benchmarked
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -w 0 -e 5 -i ${nr_elements} 2>&1  # no warmup, 5 timed repetitions; stderr is merged into stdout
+}
+
+export -f run_benchmark_nmc
+
+# The simulator sweep uses one fixed SDK release; naming it once keeps the log file name and the sourced environment in sync.
+sdk=2025.1.0
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+source ~/lib/local/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh simulator
+# Log header: suite, benchmark (RED), git revision and start time.
+echo "prim-benchmarks  RED  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 nr_elements={nr_elements} \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+	::: nr_elements $((2**20)) $((2**21)) $((2**22)) \
+>> ${fn}.txt
diff --git a/RED/benchmark-scripts/ccmcc25.sh b/RED/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..074933d
--- /dev/null
+++ b/RED/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+# Build and run one configuration; every argument is a name=value pair that `local` turns into a function-local variable.
+run_benchmark_nmc() {
+	local "$@"
+	set -e  # stop at the first failing command so a broken build is never benchmarked
+	sudo limit_ranks_to_numa_node ${numa_rank}  # NOTE(review): external helper, presumably restricts usable DPU ranks -- confirm
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -w 0 -e 5 -i ${nr_elements} 2>&1  # no warmup, 5 timed repetitions; stderr is merged into stdout
+}
+
+export -f run_benchmark_nmc
+# Repeat the whole sweep once per listed UPMEM SDK release; each release gets its own log and joblog file.
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+	# Switch the shell environment to the selected SDK release.
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+	# Log header: suite, benchmark, git revision and start time.
+	echo "prim-benchmarks  RED  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+	# One job at a time over every nr_dpus x nr_elements combination; --joblog with --resume skips jobs already recorded.
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any nr_elements={nr_elements} \
+			::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+			::: nr_elements $((2**27)) $((2**28)) $((2**29)) \
+	>> ${fn}.txt
+
+done
diff --git a/RED/dpu/task.c b/RED/dpu/task.c
index 5536d4d..90386b2 100644
--- a/RED/dpu/task.c
+++ b/RED/dpu/task.c
@@ -11,8 +11,8 @@
 #include <handshake.h>
 #include <barrier.h>
 
-#include "../support/common.h"
-#include "../support/cyclecount.h"
+#include "common.h"
+#include "cyclecount.h"
 
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 __host dpu_results_t DPU_RESULTS[NR_TASKLETS];
diff --git a/RED/host/app.c b/RED/host/app.c
index 204f056..cb7a6ac 100644
--- a/RED/host/app.c
+++ b/RED/host/app.c
@@ -7,15 +7,31 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
+
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
+#include <dpu_management.h>
+#include <dpu_target_macros.h>
+
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
 
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
+#include "common.h"
+#include "timer.h"
+#include "params.h"
 
 // Define the DPU Binary path as DPU_BINARY here
 #ifndef DPU_BINARY
@@ -25,13 +41,6 @@
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
-#if ENERGY
-#include <dpu_probe.h>
-#endif
-
-#include <dpu_management.h>
-#include <dpu_target_macros.h>
-
 // Pointer declaration
 static T* A;
 
@@ -70,17 +79,17 @@ int main(int argc, char **argv) {
     // Allocate DPUs and load binary
 #if !WITH_ALLOC_OVERHEAD
     DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    timer.time[0] = 0; // alloc
+    zero(&timer, 0); // alloc
 #endif
 #if !WITH_LOAD_OVERHEAD
     DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
     DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
     DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
     assert(nr_of_dpus == NR_DPUS);
-    timer.time[1] = 0; // load
+    zero(&timer, 1); // load
 #endif
 #if !WITH_FREE_OVERHEAD
-    timer.time[6] = 0; // free
+    zero(&timer, 6); // free
 #endif
 
 #if ENERGY
@@ -102,7 +111,7 @@ int main(int argc, char **argv) {
         ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
 
     // Input/output allocation
-    A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+    A = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
     T *bufferA = A;
     T count = 0;
     T count_host = 0;
@@ -168,12 +177,12 @@ int main(int argc, char **argv) {
         // Input arguments
         unsigned int kernel = 0;
         dpu_arguments_t input_arguments[NR_DPUS];
-        for(i=0; i<NR_DPUS-1; i++) {
-            input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); 
-            input_arguments[i].kernel=kernel;
+        for(int j=0; j<NR_DPUS-1; j++) {
+            input_arguments[j].size=input_size_dpu_8bytes * sizeof(T); 
+            input_arguments[j].kernel=(enum kernels)kernel;
         }
         input_arguments[NR_DPUS-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); 
-        input_arguments[NR_DPUS-1].kernel=kernel;
+        input_arguments[NR_DPUS-1].kernel=(enum kernels)kernel;
         // Copy input arrays
         i = 0;
         DPU_FOREACH(dpu_set, dpu, i) {
@@ -218,7 +227,7 @@ int main(int argc, char **argv) {
 
         //printf("Retrieve results\n");
         dpu_results_t results[NR_DPUS];
-        T* results_count = malloc(NR_DPUS * sizeof(T));
+        T* results_count = (T*)malloc(NR_DPUS * sizeof(T));
         if(rep >= p.n_warmup)
             start(&timer, 5, 0);
         i = 0;
@@ -302,11 +311,11 @@ int main(int argc, char **argv) {
         if (status) {
             printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
             if (rep >= p.n_warmup) {
-                printf("[::] RED UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d",
+                dfatool_printf("[::] RED UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d",
                     NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size);
-                printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+                dfatool_printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
                     WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
-                printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+                dfatool_printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
                     timer.time[0],
                     timer.time[1],
                     timer.time[2],
@@ -314,19 +323,19 @@ int main(int argc, char **argv) {
                     timer.time[4],
                     timer.time[5],
                     timer.time[6]);
-                printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+                dfatool_printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
                     input_size * sizeof(T) / timer.time[2],
                     input_size * sizeof(T) / (timer.time[4]),
                     input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-                printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+                dfatool_printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
                     input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]),
                     input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
                     input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-                printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+                dfatool_printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
                     input_size / timer.time[2],
                     input_size / (timer.time[4]),
                     input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-                printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+                dfatool_printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
                     input_size / (timer.time[3] + timer.time[4] + timer.time[5]),
                     input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
                     input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
diff --git a/RED/support/common.h b/RED/include/common.h
index 121bf31..6cc1ae2 100755..100644
--- a/RED/support/common.h
+++ b/RED/include/common.h
@@ -38,19 +38,21 @@
 #define DIV 1 // Shift right to divide by sizeof(T)
 #endif
 
+enum kernels {
+	kernel1 = 0,
+	nr_kernels = 1,
+};
+
 // Structures used by both the host and the dpu to communicate information
 typedef struct {
-    uint32_t size;
-	enum kernels {
-	    kernel1 = 0,
-	    nr_kernels = 1,
-	} kernel;
-    T t_count;
+	uint32_t size;
+	enum kernels kernel;
+	T t_count;
 } dpu_arguments_t;
 
 typedef struct {
-    uint64_t cycles;
-    T t_count;
+	uint64_t cycles;
+	T t_count;
 } dpu_results_t;
 
 #ifndef PERF
diff --git a/RED/support/cyclecount.h b/RED/include/cyclecount.h
index c4247b5..c4247b5 100644
--- a/RED/support/cyclecount.h
+++ b/RED/include/cyclecount.h
diff --git a/RED/include/dfatool_host.ah b/RED/include/dfatool_host.ah
new file mode 100644
index 0000000..88dfbd8
--- /dev/null
+++ b/RED/include/dfatool_host.ah
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+	unsigned long n_elements;
+	unsigned int element_size;
+	// NOTE(review): getKernel() is virtual, presumably a hook declared by DfatoolHostDPUTiming -- confirm.
+	virtual int getKernel() { return 1; }
+	// The constructor sets element_size to sizeof(T), the benchmark's element type.
+	DfatoolHostTiming() {
+		element_size = sizeof(T);
+	}
+	// After each call to input_params(): store input_size from the returned Params as n_elements and print the "[>>]" line.
+	advice call("% input_params(...)") : after() {
+		Params* p = tjp->result();
+		n_elements = p->input_size;
+		printf("[>>] RED | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements);
+	}
+	// After each call to reduction_host(): print the "[--]" line. NOTE(review): uses n_dpus, not NR_DPUS like the other advice -- presumably a base-aspect member; confirm.
+	advice call("% reduction_host(...)") : after() {
+		printf("[--] RED | n_dpus=%u n_elements=%lu\n", n_dpus, n_elements);
+	}
+	// After main() has executed: print the "[<<]" line.
+	advice execution("% main(...)") : after() {
+		printf("[<<] RED | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements);
+	}
+};
diff --git a/RED/support/params.h b/RED/include/params.h
index 97bc50a..ee90908 100644
--- a/RED/support/params.h
+++ b/RED/include/params.h
@@ -18,7 +18,7 @@ static void usage() {
         "\n    -h        help"
         "\n    -w <W>    # of untimed warmup iterations (default=1)"
         "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
+        "\n    -x <X>    Weak (0) or strong (1) scaling (default=1)"
         "\n"
         "\nBenchmark-specific options:"
         "\n    -i <I>    input size (default=6553600 elements)"
@@ -30,7 +30,7 @@ struct Params input_params(int argc, char **argv) {
     p.input_size    = 6553600;
     p.n_warmup      = 1;
     p.n_reps        = 3;
-    p.exp           = 0;
+    p.exp           = 1;
 
     int opt;
     while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
diff --git a/RED/include/timer.h b/RED/include/timer.h
new file mode 100644
index 0000000..7b80823
--- /dev/null
+++ b/RED/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+// RED's host code uses timer slots 0..6, hence seven. NOTE(review): timer_base.h presumably sizes Timer from N_TIMERS -- confirm.
+#define N_TIMERS 7
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/RED/support/timer.h b/RED/support/timer.h
deleted file mode 100755
index 4d597b9..0000000
--- a/RED/support/timer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[7];
-    struct timeval stopTime[7];
-    double         time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
diff --git a/SCAN-RSS/Makefile b/SCAN-RSS/Makefile
index f1975e8..d55eb07 100644
--- a/SCAN-RSS/Makefile
+++ b/SCAN-RSS/Makefile
@@ -8,17 +8,34 @@ WITH_ALLOC_OVERHEAD ?= 0
 WITH_LOAD_OVERHEAD ?= 0
 WITH_FREE_OVERHEAD ?= 0
 
-COMMON_INCLUDES := support
 HOST_SOURCES := $(wildcard host/*.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DUNROLL=${UNROLL}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DUNROLL=${UNROLL}
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc}
 DPU_FLAGS := ${COMMON_FLAGS} -O2
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1)
 	QUIET =
 endif
 
@@ -27,10 +44,13 @@ all: bin/dpu_code bin/host_code
 bin:
 	${QUIET}mkdir -p bin
 
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+# cp/rm work around AspectC++ not liking symlinks; they touch include/, so directories are order-only prerequisites
+bin/host_code: ${HOST_SOURCES} $(wildcard include/*.h include/*.ah) | include bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
 
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+bin/dpu_code: ${DPU_SOURCES} $(wildcard include/*.h) | include bin
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
 
 clean:
diff --git a/SCAN-RSS/benchmark-scripts/ccmcc25-sim.sh b/SCAN-RSS/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..2715db7
--- /dev/null
+++ b/SCAN-RSS/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Simulator benchmark sweep for SCAN-RSS: rebuilds with AspectC++ timing for
+# each (nr_dpus, input_size) combination and appends the output to a log file.
+
+# SDK release whose simulator environment is sourced below; also part of the log name.
+sdk=2025.1.0
+
+mkdir -p log/$(hostname)
+
+# Build and run one configuration. The key=value arguments become local
+# variables (nr_dpus, nr_tasklets, input_size).
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -w 0 -e 5 -i ${input_size}
+}
+
+# Exported so that the parallel invocation below can call the function.
+export -f run_benchmark_nmc
+
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+
+source ~/lib/local/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks  SCAN-RSS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+	::: input_size $((2**22)) $((2**23)) $((2**24)) \
+>> ${fn}.txt
diff --git a/SCAN-RSS/benchmark-scripts/ccmcc25.sh b/SCAN-RSS/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..ff0a31e
--- /dev/null
+++ b/SCAN-RSS/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Benchmark sweep for SCAN-RSS: for each listed SDK release, rebuild and run every (nr_dpus, input_size) combination.
+mkdir -p log/$(hostname)
+# Build and run one configuration; the key=value arguments become local variables.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank} # external helper, not defined here; presumably restricts usable ranks -- confirm
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -w 0 -e 50 -i ${input_size}
+}
+
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+	# Output and job log are kept per host and per SDK release.
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+	echo "prim-benchmarks  SCAN-RSS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} numa_rank={numa_rank} \
+		::: numa_rank any \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+		::: input_size $((2**27)) $((2**28)) $((2**29)) \
+	>> ${fn}.txt
+
+done
diff --git a/SCAN-RSS/dpu/task.c b/SCAN-RSS/dpu/task.c
index 7a4b029..afc42c7 100644
--- a/SCAN-RSS/dpu/task.c
+++ b/SCAN-RSS/dpu/task.c
@@ -11,7 +11,7 @@
 #include <handshake.h>
 #include <barrier.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 __host dpu_results_t DPU_RESULTS[NR_TASKLETS];
diff --git a/SCAN-RSS/host/app.c b/SCAN-RSS/host/app.c
index 6771207..ffcc2cf 100644
--- a/SCAN-RSS/host/app.c
+++ b/SCAN-RSS/host/app.c
@@ -7,15 +7,31 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
+
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
+#include <dpu_management.h>
+#include <dpu_target_macros.h>
+
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
 
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
+#include "common.h"
+#include "timer.h"
+#include "params.h"
 
 // Define the DPU Binary path as DPU_BINARY here
 #ifndef DPU_BINARY
@@ -25,12 +41,7 @@
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
-#if ENERGY
-#include <dpu_probe.h>
-#endif
-
-#include <dpu_management.h>
-#include <dpu_target_macros.h>
+unsigned int kernel;
 
 // Pointer declaration
 static T* A;
@@ -78,17 +89,17 @@ int main(int argc, char **argv) {
     // Allocate DPUs and load binary
 #if !WITH_ALLOC_OVERHEAD
     DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    timer.time[0] = 0; // alloc
+    zero(&timer, 0);
 #endif
 #if !WITH_LOAD_OVERHEAD
     DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
     DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
     DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
     assert(nr_of_dpus == NR_DPUS);
-    timer.time[1] = 0; // load
+    zero(&timer, 1);
 #endif
 #if !WITH_FREE_OVERHEAD
-    timer.time[6] = 0; // free
+    zero(&timer, 6);
 #endif
 
     unsigned int i = 0;
@@ -100,8 +111,8 @@ int main(int argc, char **argv) {
         (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_; // Input size per DPU (max.), 8-byte aligned
 
     // Input/output allocation
-    A = malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
-    C = malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
+    A = (T*) malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
+    C = (T*) malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
     T *bufferA = A;
     T *bufferC = C;
 
@@ -167,8 +178,8 @@ int main(int argc, char **argv) {
         }
         // Input arguments
         const unsigned int input_size_dpu = input_size_dpu_round;
-        unsigned int kernel = 0;
-        dpu_arguments_t input_arguments = {input_size_dpu * sizeof(T), kernel, 0};
+        kernel = 0;
+        dpu_arguments_t input_arguments = {(uint32_t)(input_size_dpu * sizeof(T)), (enum kernels)kernel, 0};
         // Copy input arrays
         i = 0;
         DPU_FOREACH(dpu_set, dpu, i) {
@@ -214,7 +225,7 @@ int main(int argc, char **argv) {
 
         //printf("Retrieve results\n");
         dpu_results_t results[nr_of_dpus];
-        T* results_scan = malloc(nr_of_dpus * sizeof(T));
+        T* results_scan = (T*) malloc(nr_of_dpus * sizeof(T));
         i = 0;
         accum = 0;
 
@@ -251,7 +262,7 @@ int main(int argc, char **argv) {
         dpu_arguments_t input_arguments_2[NR_DPUS];
         for(i=0; i<nr_of_dpus; i++) {
             input_arguments_2[i].size=input_size_dpu * sizeof(T); 
-            input_arguments_2[i].kernel=kernel;
+            input_arguments_2[i].kernel=(enum kernels)kernel;
             input_arguments_2[i].t_count=results_scan[i];
         }
         DPU_FOREACH(dpu_set, dpu, i) {
@@ -332,11 +343,11 @@ int main(int argc, char **argv) {
         }
         if (status) {
             printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
-            printf("[::] SCAN-RSS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d b_unroll=%d n_elements=%d",
+            dfatool_printf("[::] SCAN-RSS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d b_unroll=%d n_elements=%d",
                 NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, UNROLL, input_size);
-            printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+            dfatool_printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
                 WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
-            printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_free_us=%f",
+            dfatool_printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_free_us=%f",
                 timer.time[0],
                 timer.time[1],
                 timer.time[2],
@@ -345,20 +356,20 @@ int main(int argc, char **argv) {
                 timer.time[5], // sync
                 timer.time[7], // read
                 timer.time[8]);
-            printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+            dfatool_printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
                 input_size * sizeof(T) / timer.time[2],
                 input_size * sizeof(T) / (timer.time[4] + timer.time[5] + timer.time[6]),
                 input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8]));
-            printf(" throughput_upmem_s_MBps=%f throughput_upmem_wxsxr_MBps=%f throughput_upmem_lwxsxr_MBps=%f throughput_upmem_alwxsxr_MBps=%f",
+            dfatool_printf(" throughput_upmem_s_MBps=%f throughput_upmem_wxsxr_MBps=%f throughput_upmem_lwxsxr_MBps=%f throughput_upmem_alwxsxr_MBps=%f",
                 input_size * sizeof(T) / (timer.time[5]),
                 input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7]),
                 input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7]),
                 input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7]));
-            printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+            dfatool_printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
                 input_size / timer.time[2],
                 input_size / (timer.time[4] + timer.time[5] + timer.time[6]),
                 input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8]));
-            printf(" throughput_upmem_s_MOpps=%f throughput_upmem_wxsxr_MOpps=%f throughput_upmem_lwxsxr_MOpps=%f throughput_upmem_alwxsxr_MOpps=%f\n",
+            dfatool_printf(" throughput_upmem_s_MOpps=%f throughput_upmem_wxsxr_MOpps=%f throughput_upmem_lwxsxr_MOpps=%f throughput_upmem_alwxsxr_MOpps=%f\n",
                 input_size / (timer.time[5]),
                 input_size / (timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7]),
                 input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7]),
diff --git a/SCAN-RSS/support/common.h b/SCAN-RSS/include/common.h
index be19a8c..859a3fe 100755..100644
--- a/SCAN-RSS/support/common.h
+++ b/SCAN-RSS/include/common.h
@@ -40,15 +40,17 @@
 
 #define REGS (BLOCK_SIZE >> DIV)
 
+enum kernels {
+	kernel1 = 0,
+	kernel2 = 1,
+	nr_kernels = 2,
+};
+
 // Structures used by both the host and the dpu to communicate information 
 typedef struct {
-    uint32_t size;
-	enum kernels {
-	    kernel1 = 0,
-	    kernel2 = 1,
-	    nr_kernels = 2,
-	} kernel;
-    T t_count;
+	uint32_t size;
+	enum kernels kernel;
+	T t_count;
 } dpu_arguments_t;
 
 typedef struct {
diff --git a/SCAN-RSS/include/dfatool_host.ah b/SCAN-RSS/include/dfatool_host.ah
new file mode 100644
index 0000000..6d2fad5
--- /dev/null
+++ b/SCAN-RSS/include/dfatool_host.ah
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming { // SCAN-RSS aspect; base is presumably declared in dfatool_host_dpu.ah
+	unsigned long n_elements; // copied from Params::input_size once input_params() has returned
+	unsigned int element_size; // sizeof(T), assigned in the constructor
+
+	virtual int getKernel() { return 1; } // NOTE(review): presumably overrides a hook of the base aspect -- confirm
+
+	DfatoolHostTiming() {
+		element_size = sizeof(T);
+	}
+
+	advice call("% input_params(...)") : after() { // runs after each call to input_params()
+		Params* p = tjp->result(); // pointer to the value returned by the call
+		n_elements = p->input_size;
+		printf("[>>] SCAN-RSS | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements);
+	}
+
+	advice call("% scan_host(...)") : after() { // runs after each call to scan_host()
+		printf("[--] SCAN-RSS | n_dpus=%u n_elements=%lu\n", n_dpus, n_elements); // n_dpus is not declared here; presumably a base-aspect member
+	}
+
+	advice execution("% main(...)") : after() { // runs after main() has executed
+		printf("[<<] SCAN-RSS | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements);
+	}
+};
diff --git a/SCAN-SSA/support/params.h b/SCAN-RSS/include/params.h
index 9f6aacc..a96b33f 100644
--- a/SCAN-SSA/support/params.h
+++ b/SCAN-RSS/include/params.h
@@ -18,7 +18,7 @@ static void usage() {
         "\n    -h        help"
         "\n    -w <W>    # of untimed warmup iterations (default=1)"
         "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
+        "\n    -x <X>    Weak (0) or strong (1) scaling (default=1)"
         "\n"
         "\nBenchmark-specific options:"
         "\n    -i <I>    input size (default=3932160 elements)"
@@ -30,7 +30,7 @@ struct Params input_params(int argc, char **argv) {
     p.input_size    = 3932160;
     p.n_warmup      = 1;
     p.n_reps        = 3;
-    p.exp           = 0;
+    p.exp           = 1;
 
     int opt;
     while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
diff --git a/SCAN-RSS/include/timer.h b/SCAN-RSS/include/timer.h
new file mode 100644
index 0000000..313151d
--- /dev/null
+++ b/SCAN-RSS/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#define N_TIMERS 9
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/SCAN-RSS/run-paper-strong-full.sh b/SCAN-RSS/run-paper-strong-full.sh
deleted file mode 100755
index a00e96d..0000000
--- a/SCAN-RSS/run-paper-strong-full.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SCAN-RSS strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >2048 is not part of upstream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i 251658240 -x 1 || true
-		fi
-	done
-done
-) | tee log-paper-strong-full.txt
diff --git a/SCAN-RSS/run-paper-strong-rank.sh b/SCAN-RSS/run-paper-strong-rank.sh
deleted file mode 100755
index 3391a1b..0000000
--- a/SCAN-RSS/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SCAN-RSS strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >64 are not part of upstream config space
-for nr_dpus in 128 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i 3932160 -x 1 || true
-		fi
-	done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/SCAN-RSS/run-paper-weak.sh b/SCAN-RSS/run-paper-weak.sh
deleted file mode 100755
index 053d9a6..0000000
--- a/SCAN-RSS/run-paper-weak.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SCAN-RSS weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# upstream does not include 256 and 512 in config space
-for nr_dpus in 512 256 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then
-			timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i 3932160 -x 0 || true
-		fi
-	done
-done
-) | tee log-paper-weak.txt
diff --git a/SCAN-RSS/run.sh b/SCAN-RSS/run.sh
deleted file mode 100755
index 1c39f7c..0000000
--- a/SCAN-RSS/run.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-for i in  2048 4096 8192 16384 65536 262144 1048576 3932160
-do
-		    NR_DPUS=1 NR_TASKLETS=16 BL=10 VERSION=SINGLE make all
-			wait
-            ./bin/host_code -w 10 -e 100 -i ${i} > profile/out${i}_tl16_bl10_dpu11
-			wait
-			make clean
-			wait
-done
diff --git a/SCAN-RSS/support/timer.h b/SCAN-RSS/support/timer.h
deleted file mode 100755
index 3ec6d87..0000000
--- a/SCAN-RSS/support/timer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[9];
-    struct timeval stopTime[9];
-    double         time[9];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
diff --git a/SCAN-SSA/Makefile b/SCAN-SSA/Makefile
index 319f2da..c741138 100644
--- a/SCAN-SSA/Makefile
+++ b/SCAN-SSA/Makefile
@@ -9,14 +9,31 @@ HOST_SOURCES := $(wildcard host/app.c)
 OMP_SOURCES := $(wildcard host/omp.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-COMMON_INCLUDES := support
-COMMON_FLAGS = -Wall -Wextra -O2 -I${COMMON_INCLUDES} -DNR_DPUS=${NR_DPUS} -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE} -DUNROLL=${UNROLL}
-HOST_FLAGS = ${COMMON_FLAGS} -std=c11 `dpu-pkg-config --cflags --libs dpu`
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS = -Wall -Wextra -O2 -Iinclude -DNR_DPUS=${NR_DPUS} -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE} -DUNROLL=${UNROLL}
+HOST_FLAGS = ${COMMON_FLAGS} `dpu-pkg-config --cflags --libs dpu` -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc}
 DPU_FLAGS = ${COMMON_FLAGS}
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1)
 	QUIET =
 endif
 
@@ -29,7 +46,9 @@ bin/dpu_code: ${DPU_SOURCES} bin
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} ${DPU_SOURCES} -o $@
 
 bin/host_code: ${HOST_SOURCES} bin
-	${QUIET}${CC} ${HOST_FLAGS} ${HOST_SOURCES} -o $@
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
 
 bin/omp_code: ${OMP_SOURCES}
 	${QUIET}${CC} ${HOST_FLAGS} -fopenmp ${OMP_SOURCES} -o $@
diff --git a/SCAN-SSA/benchmark-scripts/ccmcc25-sim.sh b/SCAN-SSA/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..2715db7
--- /dev/null
+++ b/SCAN-SSA/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Simulator benchmark sweep for SCAN-SSA: rebuilds with AspectC++ timing for
+# each (nr_dpus, input_size) combination and appends the output to a log file.
+
+# SDK release whose simulator environment is sourced below; also part of the log name.
+sdk=2025.1.0
+
+mkdir -p log/$(hostname)
+
+# Build and run one configuration. The key=value arguments become local
+# variables (nr_dpus, nr_tasklets, input_size).
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -w 0 -e 5 -i ${input_size}
+}
+
+# Exported so that the parallel invocation below can call the function.
+export -f run_benchmark_nmc
+
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+
+source ~/lib/local/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks  SCAN-SSA  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+	::: input_size $((2**22)) $((2**23)) $((2**24)) \
+>> ${fn}.txt
diff --git a/SCAN-SSA/benchmark-scripts/ccmcc25.sh b/SCAN-SSA/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..c9655c8
--- /dev/null
+++ b/SCAN-SSA/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Benchmark sweep for SCAN-SSA: for each listed SDK release, rebuild and run every (nr_dpus, input_size) combination.
+mkdir -p log/$(hostname)
+# Build and run one configuration; the key=value arguments become local variables.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank} # external helper, not defined here; presumably restricts usable ranks -- confirm
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -w 0 -e 50 -i ${input_size}
+}
+
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+	# Output and job log are kept per host and per SDK release.
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+	echo "prim-benchmarks  SCAN-SSA  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} numa_rank={numa_rank} \
+		::: numa_rank any \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+		::: input_size $((2**27)) $((2**28)) $((2**29)) \
+	>> ${fn}.txt
+
+done
diff --git a/SCAN-SSA/dpu/task.c b/SCAN-SSA/dpu/task.c
index 15411a4..76f393d 100644
--- a/SCAN-SSA/dpu/task.c
+++ b/SCAN-SSA/dpu/task.c
@@ -11,7 +11,7 @@
 #include <handshake.h>
 #include <barrier.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 __host dpu_results_t DPU_RESULTS[NR_TASKLETS];
diff --git a/SCAN-SSA/host/app.c b/SCAN-SSA/host/app.c
index 25471fe..8675f17 100644
--- a/SCAN-SSA/host/app.c
+++ b/SCAN-SSA/host/app.c
@@ -7,15 +7,29 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
+
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
+
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
 
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
+#include "common.h"
+#include "timer.h"
+#include "params.h"
 
 // Define the DPU Binary path as DPU_BINARY here
 #ifndef DPU_BINARY
@@ -25,15 +39,13 @@
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
-#if ENERGY
-#include <dpu_probe.h>
-#endif
-
 // Pointer declaration
 static T* A;
 static T* C;
 static T* C2;
 
+unsigned int kernel;
+
 // Create input arrays
 static void read_input(T* A, unsigned int nr_elements, unsigned int nr_elements_round) {
     srand(0);
@@ -95,9 +107,9 @@ int main(int argc, char **argv) {
         (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_; // Input size per DPU (max.), 8-byte aligned
 
     // Input/output allocation
-    A = malloc(input_size_dpu_round * nr_of_dpus * sizeof(T));
-    C = malloc(input_size_dpu_round * nr_of_dpus * sizeof(T));
-    C2 = malloc(input_size_dpu_round * nr_of_dpus * sizeof(T));
+    A = (T*) malloc(input_size_dpu_round * nr_of_dpus * sizeof(T));
+    C = (T*) malloc(input_size_dpu_round * nr_of_dpus * sizeof(T));
+    C2 = (T*) malloc(input_size_dpu_round * nr_of_dpus * sizeof(T));
     T *bufferA = A;
     T *bufferC = C2;
 
@@ -124,8 +136,8 @@ int main(int argc, char **argv) {
             start(&timer, 1, 0);
         // Input arguments
         const unsigned int input_size_dpu = input_size_dpu_round;
-        unsigned int kernel = 0;
-        dpu_arguments_t input_arguments = {input_size_dpu * sizeof(T), kernel, 0};
+        kernel = 0;
+        dpu_arguments_t input_arguments = {(uint32_t)(input_size_dpu * sizeof(T)), (enum kernels)kernel, 0};
         // Copy input arrays
         i = 0;
         DPU_FOREACH(dpu_set, dpu, i) {
@@ -170,7 +182,7 @@ int main(int argc, char **argv) {
 
         //printf("Retrieve results\n");
         dpu_results_t results[nr_of_dpus];
-        T* results_scan = malloc(nr_of_dpus * sizeof(T));
+        T* results_scan = (T*) malloc(nr_of_dpus * sizeof(T));
         i = 0;
         accum = 0;
 
@@ -207,7 +219,7 @@ int main(int argc, char **argv) {
         dpu_arguments_t input_arguments_2[NR_DPUS];
         for(i=0; i<nr_of_dpus; i++) {
             input_arguments_2[i].size=input_size_dpu * sizeof(T); 
-            input_arguments_2[i].kernel=kernel;
+            input_arguments_2[i].kernel=(enum kernels)kernel;
             input_arguments_2[i].t_count=results_scan[i];
         }
         DPU_FOREACH(dpu_set, dpu, i) {
@@ -272,17 +284,16 @@ int main(int argc, char **argv) {
         }
         if (status) {
             printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
-            printf("[::] SCAN-SSA NMC | n_dpus=%d n_tasklets=%d e_type=%s block_size_B=%d b_unroll=%d n_elements=%u "
-                "| throughput_cpu_MBps=%f throughput_pim_MBps=%f throughput_MBps=%f\n",
+            dfatool_printf("[::] SCAN-SSA NMC | n_dpus=%d n_tasklets=%d e_type=%s block_size_B=%d b_unroll=%d n_elements=%u "
+                "| throughput_cpu_MBps=%f throughput_pim_MBps=%f throughput_MBps=%f",
                 nr_of_dpus, NR_TASKLETS, XSTR(T), BLOCK_SIZE, UNROLL, input_size,
                 input_size * sizeof(T) / timer.time[0],
                 input_size * sizeof(T) / (timer.time[2] + timer.time[3] + timer.time[4]),
                 input_size * sizeof(T) / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5]));
-            printf(" throughput_cpu_MOpps=%f throughput_pim_MOpps=%f throughput_MOpps=%f\n",
+            dfatool_printf(" throughput_cpu_MOpps=%f throughput_pim_MOpps=%f throughput_MOpps=%f\n",
                 input_size / timer.time[0],
                 input_size / (timer.time[2] + timer.time[3] + timer.time[4]),
                 input_size / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5]));
-            printall(&timer, 5);
         } else {
             printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
         }
diff --git a/SCAN-SSA/host/omp.c b/SCAN-SSA/host/omp.c
index efa5360..3e722dc 100644
--- a/SCAN-SSA/host/omp.c
+++ b/SCAN-SSA/host/omp.c
@@ -12,9 +12,9 @@
 #include <getopt.h>
 #include <assert.h>
 
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
+#include "common.h"
+#include "timer.h"
+#include "params.h"
 
 #define XSTR(x) STR(x)
 #define STR(x) #x
diff --git a/SCAN-SSA/support/common.h b/SCAN-SSA/include/common.h
index 0bdf7ca..f395cc5 100644
--- a/SCAN-SSA/support/common.h
+++ b/SCAN-SSA/include/common.h
@@ -40,15 +40,17 @@
 
 #define REGS (BLOCK_SIZE >> DIV)
 
+enum kernels {
+	kernel1 = 0,
+	kernel2 = 1,
+	nr_kernels = 2,
+};
+
 // Structures used by both the host and the dpu to communicate information
 typedef struct {
-    uint32_t size;
-	enum kernels {
-	    kernel1 = 0,
-	    kernel2 = 1,
-	    nr_kernels = 2,
-	} kernel;
-    T t_count;
+	uint32_t size;
+	enum kernels kernel;
+	T t_count;
 } dpu_arguments_t;
 
 typedef struct {
diff --git a/SCAN-SSA/include/dfatool_host.ah b/SCAN-SSA/include/dfatool_host.ah
new file mode 100644
index 0000000..6d2fad5
--- /dev/null
+++ b/SCAN-SSA/include/dfatool_host.ah
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+/*
+ * Benchmark-specific timing aspect for SCAN-SSA. Extends DfatoolHostDPUTiming
+ * and prints one tagged line after input_params() returns, after each call to
+ * scan_host(), and after main() has executed.
+ */
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+	unsigned long n_elements;  // copied from Params::input_size
+	unsigned int element_size; // sizeof(T), assigned in the constructor
+
+	virtual int getKernel() { return 1; }
+
+	DfatoolHostTiming() {
+		element_size = sizeof(T);
+	}
+
+	// Record the input size once input_params() has returned.
+	advice call("% input_params(...)") : after() {
+		Params* p = tjp->result();
+		n_elements = p->input_size;
+		printf("[>>] SCAN-SSA | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements);
+	}
+
+	// n_dpus is not declared here; presumably a member of DfatoolHostDPUTiming.
+	advice call("% scan_host(...)") : after() {
+		printf("[--] SCAN-SSA | n_dpus=%u n_elements=%lu\n", n_dpus, n_elements);
+	}
+
+	advice execution("% main(...)") : after() {
+		printf("[<<] SCAN-SSA | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements);
+	}
+};
diff --git a/SCAN-RSS/support/params.h b/SCAN-SSA/include/params.h
index 9f6aacc..a96b33f 100644
--- a/SCAN-RSS/support/params.h
+++ b/SCAN-SSA/include/params.h
@@ -18,7 +18,7 @@ static void usage() {
         "\n    -h        help"
         "\n    -w <W>    # of untimed warmup iterations (default=1)"
         "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
+        "\n    -x <X>    Weak (0) or strong (1) scaling (default=1)"
         "\n"
         "\nBenchmark-specific options:"
         "\n    -i <I>    input size (default=3932160 elements)"
@@ -30,7 +30,7 @@ struct Params input_params(int argc, char **argv) {
     p.input_size    = 3932160;
     p.n_warmup      = 1;
     p.n_reps        = 3;
-    p.exp           = 0;
+    p.exp           = 1;
 
     int opt;
     while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
diff --git a/SCAN-SSA/include/timer.h b/SCAN-SSA/include/timer.h
new file mode 100644
index 0000000..5b8eba3
--- /dev/null
+++ b/SCAN-SSA/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#define N_TIMERS 7
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/SCAN-SSA/run-omp.sh b/SCAN-SSA/run-omp.sh
deleted file mode 100755
index ccbb1bd..0000000
--- a/SCAN-SSA/run-omp.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i: input size (number of elements, not number of bytes!)
-
-echo "prim-benchmarks SCAN-SSA (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_threads in 1 2 4 6 8 12 16 20 24 32; do
-	for i in 2048 4096 8192 16384 65536 262144 1048576 3932160 15728640 31457280; do
-		for dt in UINT32 UINT64 INT32 INT64 FLOAT DOUBLE; do
-			echo
-			if make -B TYPE=${dt} bin/omp_code; then
-				OMP_NUM_THREADS=$nr_threads timeout -k 1m 30m bin/omp_code -w 0 -e 100 -i ${i} || true
-			fi
-		done
-	done
-done
diff --git a/SCAN-SSA/run.sh b/SCAN-SSA/run.sh
deleted file mode 100755
index 54d5f93..0000000
--- a/SCAN-SSA/run.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i: input size (number of elements, not number of bytes!)
-
-echo "prim-benchmarks SCAN-SSA (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 2 4 8 16 32 64 128 256 512; do
-	for nr_tasklets in 1 2 3 4 6 8 10 12 16 20 24; do
-		for i in  2048 4096 8192 16384 65536 262144 1048576 3932160; do
-			for dt in UINT32 UINT64 INT32 INT64 FLOAT DOUBLE; do
-				echo
-				if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 TYPE=${dt} UNROLL=1 \
-					|| make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 TYPE=${dt} UNROLL=0; then
-					timeout -k 1m 30m bin/host_code -w 0 -e 100 -i ${i} || true
-				fi
-			done
-		done
-	done
-done
diff --git a/SCAN-SSA/support/timer.h b/SCAN-SSA/support/timer.h
deleted file mode 100644
index 5411254..0000000
--- a/SCAN-SSA/support/timer.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[7];
-    struct timeval stopTime[7];
-    double         time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
diff --git a/SpMV/Makefile b/SpMV/Makefile
index 0e7a70c..c2d9d50 100644
--- a/SpMV/Makefile
+++ b/SpMV/Makefile
@@ -1,21 +1,31 @@
 NR_TASKLETS ?= 16
 NR_DPUS ?= 1
 
-COMMON_INCLUDES := support
-HOST_SOURCES := $(wildcard host/*.c)
-DPU_SOURCES := $(wildcard dpu/*.c)
-CPU_BASE_SOURCES := $(wildcard baselines/cpu/*.c)
-GPU_BASE_SOURCES := $(wildcard baselines/gpu/*.cu)
-
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+# Knobs above: aspectc=1 builds the host with ag++; aspectc_timing=1 adds the dfatool aspect headers; dfatool_timing becomes -DDFATOOL_TIMING.
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing}
 DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
-CPU_BASE_FLAGS := -O3 -fopenmp
-GPU_BASE_FLAGS := -O3
+# The two aspect headers are handed to ag++ only when aspectc_timing=1.
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+# Fallback when ASPECTC_HOST_FLAGS is still unset (presumably "-a0" tells ag++ to use no aspect header -- confirm in the ag++ manual).
+ASPECTC_HOST_FLAGS ?= -a0
+# aspectc=1: compile the host through ag++ with clang++ from UPMEM_HOME as back end; otherwise keep the default compiler and add -std=c11.
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
 
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1)
 	QUIET =
 endif
 
@@ -24,19 +34,13 @@ all: bin/host_code bin/dpu_code
 bin:
 	${QUIET}mkdir -p bin
 
-gpu: bin/gpu_baseline
-
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
-
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
-
-bin/cpu_baseline: ${CPU_BASE_SOURCES}
-	${QUIET}${CC} -o $@ ${CPU_BASE_SOURCES} ${CPU_BASE_FLAGS}
+# Stage the shared aspect header in include/ for the compile and remove it afterwards, also when the compile fails.
+bin/host_code: host/app.c include bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ host/app.c ${HOST_FLAGS}; rc=$$?; rm -f include/dfatool_host_dpu.ah; exit $$rc
 
-bin/gpu_baseline: ${GPU_BASE_SOURCES}
-	${QUIET}nvcc -o $@ ${GPU_BASE_SOURCES} ${GPU_BASE_FLAGS}
+bin/dpu_code: dpu/task.c include bin
+	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ dpu/task.c
 
 clean:
 	${QUIET}rm -rf bin
diff --git a/SpMV/baselines/cpu/Makefile b/SpMV/baselines/cpu/Makefile
index 5b2367b..a24b764 100644
--- a/SpMV/baselines/cpu/Makefile
+++ b/SpMV/baselines/cpu/Makefile
@@ -1,7 +1,15 @@
+native ?= 1
+# native=1 (the default) adds -march=native to CFLAGS; any other value leaves it out.
+CFLAGS =
+
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
+
 all: spmv
 
 spmv: app.c
-	gcc -Wall -Wextra -pedantic -march=native -O2 -o spmv -fopenmp app.c
+	gcc -Wall -Wextra -pedantic ${CFLAGS} -O3 -o spmv -fopenmp app.c
 
 spmv_O0: app.c
 	gcc -o spmv_O0 -fopenmp app.c
diff --git a/SpMV/baselines/cpu/app.c b/SpMV/baselines/cpu/app.c
index 8d360ee..e33761f 100644
--- a/SpMV/baselines/cpu/app.c
+++ b/SpMV/baselines/cpu/app.c
@@ -13,60 +13,63 @@
 #include "../../support/timer.h"
 #include "../../support/utils.h"
 
-int main(int argc, char** argv) {
+int main(int argc, char **argv)
+{
 
-    // Process parameters
-    struct Params p = input_params(argc, argv);
+	// Process parameters
+	struct Params p = input_params(argc, argv);
 
-    // Initialize SpMV data structures
-    PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
-    struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
-    PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
-    struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
-    float* inVector = malloc(csrMatrix.numCols*sizeof(float));
-    float* outVector = malloc(csrMatrix.numRows*sizeof(float));
-    initVector(inVector, csrMatrix.numCols);
+	// Initialize SpMV data structures
+	PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+	struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+	PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros",
+		   cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+	struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+	float *inVector = malloc(csrMatrix.numCols * sizeof(float));
+	float *outVector = malloc(csrMatrix.numRows * sizeof(float));
+	initVector(inVector, csrMatrix.numCols);
 
-    // Calculating result on CPU
-    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
-    //omp_set_num_threads(4);
-    Timer timer;
-    startTimer(&timer);
-    #pragma omp parallel for
-    for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
-        float sum = 0.0f;
-        for(uint32_t i = csrMatrix.rowPtrs[rowIdx]; i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
-            uint32_t colIdx = csrMatrix.nonzeros[i].col;
-            float value = csrMatrix.nonzeros[i].value;
-            sum += inVector[colIdx]*value;
-        }
-        outVector[rowIdx] = sum;
-    }
-    stopTimer(&timer);
+	// Calculating result on CPU
+	PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+	//omp_set_num_threads(4);
+	Timer timer;
+	startTimer(&timer);
+#pragma omp parallel for
+	for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+		float sum = 0.0f;
+		for (uint32_t i = csrMatrix.rowPtrs[rowIdx];
+		     i < csrMatrix.rowPtrs[rowIdx + 1]; ++i) {
+			uint32_t colIdx = csrMatrix.nonzeros[i].col;
+			float value = csrMatrix.nonzeros[i].value;
+			sum += inVector[colIdx] * value;
+		}
+		outVector[rowIdx] = sum;
+	}
+	stopTimer(&timer);
 
-
-    unsigned int nr_threads = 0;
+	unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-    nr_threads++;
-
+	nr_threads++;
 
-    // coomatrix / csrmatrix use uint32_t indexes and float values
-    printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
-        " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
-        nr_threads, csrMatrix.numNonzeros,
-        csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer)*1e6),
-        csrMatrix.numNonzeros / (getElapsedTime(timer)*1e6),
-        getElapsedTime(timer)*1e6);
-    //if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
-    PRINT_INFO(p.verbosity >= 1, "    Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+	// coomatrix / csrmatrix use uint32_t indexes and float values
+	printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |"
+	       " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n",
+	       nr_threads, csrMatrix.numNonzeros,
+	       csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer) *
+							1e6),
+	       csrMatrix.numNonzeros / (getElapsedTime(timer) * 1e6),
+	       getElapsedTime(timer) * 1e6);
+	//if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+	PRINT_INFO(p.verbosity >= 1, "    Elapsed time: %f ms",
+		   getElapsedTime(timer) * 1e3);
 
-    // Deallocate data structures
-    freeCOOMatrix(cooMatrix);
-    freeCSRMatrix(csrMatrix);
-    free(inVector);
-    free(outVector);
+	// Deallocate data structures
+	freeCOOMatrix(cooMatrix);
+	freeCSRMatrix(csrMatrix);
+	free(inVector);
+	free(outVector);
 
-    return 0;
+	return 0;
 
 }
diff --git a/SpMV/baselines/cpu/run-perf.sh b/SpMV/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..714498d
--- /dev/null
+++ b/SpMV/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+# Rebuild, then record perf counters for "make run" with 1 and 4 OpenMP threads; events are the first field of each non-comment line in perf-events.txt.
+make -B || exit 1	# do not profile when the build failed
+
+OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run
+OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run
diff --git a/SpMV/benchmark-scripts/ccmcc25-sim.sh b/SpMV/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..9d1af4e
--- /dev/null
+++ b/SpMV/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"	# every argument is a name=value pair and becomes a local variable
+	set -e	# stop at the first failing command, so a failed build is never executed
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -v 0 -f data/${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+cd data/generate
+for i in 4 8 16; do
+	./replicate ../bcsstk30.mtx $i ../bcsstk30.${i}.mtx
+done
+cd ../..
+
+sdk=${sdk:-2025.1.0}	# SDK release used for the log name and the environment script; was unset before, giving "ccmcc25-sdk-sim"
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+source ~/lib/local/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks  SpMV  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} \
+	::: data bcsstk30.mtx bcsstk30.4.mtx bcsstk30.8.mtx bcsstk30.16.mtx \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+>> ${fn}.txt
+
+rm -f data/bcsstk30.*.mtx
diff --git a/SpMV/benchmark-scripts/ccmcc25.sh b/SpMV/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..176ea99
--- /dev/null
+++ b/SpMV/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"	# every argument is a name=value pair and becomes a local variable
+	set -e	# stop at the first failing command, so a failed build is never executed
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -v 0 -f data/${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+cd data/generate
+for i in 8 32 64; do
+	./replicate ../bcsstk30.mtx $i ../bcsstk30.${i}.mtx
+done
+cd ../..
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+	echo "prim-benchmarks  SpMV  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} numa_rank={numa_rank} \
+		::: i $(seq 0 10) \
+		::: data bcsstk30.mtx bcsstk30.8.mtx bcsstk30.32.mtx bcsstk30.64.mtx \
+		::: numa_rank any \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+	>> ${fn}.txt
+
+done
+
+rm -f data/bcsstk30.*.mtx
diff --git a/SpMV/dpu/task.c b/SpMV/dpu/task.c
index 589b6f4..305a645 100644
--- a/SpMV/dpu/task.c
+++ b/SpMV/dpu/task.c
@@ -11,7 +11,7 @@
 #include <perfcounter.h>
 #include <seqread.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 #define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m   "fmt"\n", ##__VA_ARGS__)
 
@@ -20,120 +20,164 @@
 BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 // main
-int main() {
-
-    if(me() == 0) {
-        mem_reset(); // Reset the heap
-    }
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    // Load parameters
-    uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
-    struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-    mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-    uint32_t numRows = params_w->dpuNumRows;
-
-    // Sanity check
-    if(me() == 0) {
-        if(numRows%2 != 0) {
-            // The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
-            PRINT_ERROR("The number of rows is not a multiple of two!");
-        }
-    }
-
-    // Identify tasklet's rows
-    uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/NR_TASKLETS + 1); // Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
-    uint32_t taskletRowsStart = me()*numRowsPerTasklet;
-    uint32_t taskletNumRows;
-    if(taskletRowsStart > numRows) {
-        taskletNumRows = 0;
-    } else if(taskletRowsStart + numRowsPerTasklet > numRows) {
-        taskletNumRows = numRows - taskletRowsStart;
-    } else {
-        taskletNumRows = numRowsPerTasklet;
-    }
-
-    // Only process tasklets with nonzero number of rows
-    if(taskletNumRows > 0) {
-
-        // Extract parameters
-        uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
-        uint32_t rowPtrs_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
-        uint32_t nonzeros_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuNonzeros_m;
-        uint32_t inVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuInVector_m;
-        uint32_t outVector_m = ((uint32_t)DPU_MRAM_HEAP_POINTER) + params_w->dpuOutVector_m;
-
-        // Initialize row pointer sequential reader
-        uint32_t taskletRowPtrs_m = rowPtrs_m + taskletRowsStart*sizeof(uint32_t);
-        seqreader_t rowPtrReader;
-        uint32_t* taskletRowPtrs_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletRowPtrs_m, &rowPtrReader);
-        uint32_t firstRowPtr = *taskletRowPtrs_w;
-
-        // Initialize nonzeros sequential reader
-        uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
-        uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart*sizeof(struct Nonzero); // 8-byte aligned because Nonzero is 8 bytes
-        seqreader_t nonzerosReader;
-        struct Nonzero* taskletNonzeros_w = seqread_init(seqread_alloc(), (__mram_ptr void*)taskletNonzeros_m, &nonzerosReader);
-
-        // Initialize input vector cache
-        uint32_t inVectorTileSize = 64;
-        float* inVectorTile_w = mem_alloc(inVectorTileSize*sizeof(float));
-        mram_read((__mram_ptr void const*)inVector_m, inVectorTile_w, 256);
-        uint32_t currInVectorTileIdx = 0;
-
-        // Initialize output vector cache
-        uint32_t taskletOutVector_m = outVector_m + taskletRowsStart*sizeof(float);
-        uint32_t outVectorTileSize = 64;
-        float* outVectorTile_w = mem_alloc(outVectorTileSize*sizeof(float));
-
-        // SpMV
-        uint32_t nextRowPtr = firstRowPtr;
-        for(uint32_t row = 0; row < taskletNumRows; ++row) {
-
-            // Find row nonzeros
-            taskletRowPtrs_w = seqread_get(taskletRowPtrs_w, sizeof(uint32_t), &rowPtrReader);
-            uint32_t rowPtr = nextRowPtr;
-            nextRowPtr = *taskletRowPtrs_w;
-            uint32_t taskletNNZ = nextRowPtr - rowPtr;
-
-            // Multiply row with vector
-            float outValue = 0.0f;
-            for(uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
-
-                // Get matrix value
-                float matValue = taskletNonzeros_w->value;
-
-                // Get input vector value
-                uint32_t col = taskletNonzeros_w->col;
-                uint32_t inVectorTileIdx = col/inVectorTileSize;
-                uint32_t inVectorTileOffset = col%inVectorTileSize;
-                if(inVectorTileIdx != currInVectorTileIdx) {
-                    mram_read((__mram_ptr void const*)(inVector_m + inVectorTileIdx*inVectorTileSize*sizeof(float)), inVectorTile_w, 256);
-                    currInVectorTileIdx = inVectorTileIdx;
-                }
-                float inValue = inVectorTile_w[inVectorTileOffset];
-
-                // Multiply and add
-                outValue += matValue*inValue;
-
-                // Read next nonzero
-                taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader); // Last read will be out of bounds and unused
-
-            }
-
-            // Store output
-            uint32_t outVectorTileIdx = row/outVectorTileSize;
-            uint32_t outVectorTileOffset = row%outVectorTileSize;
-            outVectorTile_w[outVectorTileOffset] = outValue;
-            if(outVectorTileOffset == outVectorTileSize - 1) { // Last element in tile
-                mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), 256);
-            } else if(row == taskletNumRows - 1) { // Last row for tasklet
-                mram_write(outVectorTile_w, (__mram_ptr void*)(taskletOutVector_m + outVectorTileIdx*outVectorTileSize*sizeof(float)), (taskletNumRows%outVectorTileSize)*sizeof(float));
-            }
-
-        }
-    }
-
-    return 0;
+int main()
+{
+	// Each tasklet multiplies its share of this DPU's rows with the input vector; tasklet 0 resets the heap first.
+	if (me() == 0) {
+		mem_reset();	// Reset the heap
+	}
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	// Load parameters
+	uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+	struct DPUParams *params_w =
+	    (struct DPUParams *)
+	    mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+	mram_read((__mram_ptr void const *)params_m, params_w,
+		  ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+	uint32_t numRows = params_w->dpuNumRows;
+
+	// Sanity check
+	if (me() == 0) {
+		if (numRows % 2 != 0) {
+			// The number of rows assigned to the DPU must be a multiple of two to ensure that writes to the output vector are aligned to 8 bytes
+			PRINT_ERROR
+			    ("The number of rows is not a multiple of two!");
+		}
+	}
+	// Identify tasklet's rows
+	uint32_t numRowsPerTasklet = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / NR_TASKLETS + 1);	// Multiple of two to ensure that access to rowPtrs and outVector is 8-byte aligned
+	uint32_t taskletRowsStart = me() * numRowsPerTasklet;
+	uint32_t taskletNumRows;
+	if (taskletRowsStart > numRows) {
+		taskletNumRows = 0;
+	} else if (taskletRowsStart + numRowsPerTasklet > numRows) {
+		taskletNumRows = numRows - taskletRowsStart;
+	} else {
+		taskletNumRows = numRowsPerTasklet;
+	}
+
+	// Only process tasklets with nonzero number of rows
+	if (taskletNumRows > 0) {
+
+		// Extract parameters
+		uint32_t rowPtrsOffset = params_w->dpuRowPtrsOffset;
+		uint32_t rowPtrs_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) + params_w->dpuRowPtrs_m;
+		uint32_t nonzeros_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+		    params_w->dpuNonzeros_m;
+		uint32_t inVector_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+		    params_w->dpuInVector_m;
+		uint32_t outVector_m =
+		    ((uint32_t) DPU_MRAM_HEAP_POINTER) +
+		    params_w->dpuOutVector_m;
+
+		// Initialize row pointer sequential reader
+		uint32_t taskletRowPtrs_m =
+		    rowPtrs_m + taskletRowsStart * sizeof(uint32_t);
+		seqreader_t rowPtrReader;
+		uint32_t *taskletRowPtrs_w =
+		    seqread_init(seqread_alloc(),
+				 (__mram_ptr void *)taskletRowPtrs_m,
+				 &rowPtrReader);
+		uint32_t firstRowPtr = *taskletRowPtrs_w;
+
+		// Initialize nonzeros sequential reader
+		uint32_t taskletNonzerosStart = firstRowPtr - rowPtrsOffset;
+		uint32_t taskletNonzeros_m = nonzeros_m + taskletNonzerosStart * sizeof(struct Nonzero);	// 8-byte aligned because Nonzero is 8 bytes
+		seqreader_t nonzerosReader;
+		struct Nonzero *taskletNonzeros_w =
+		    seqread_init(seqread_alloc(),
+				 (__mram_ptr void *)taskletNonzeros_m,
+				 &nonzerosReader);
+
+		// Initialize input vector cache
+		uint32_t inVectorTileSize = 64;
+		float *inVectorTile_w =
+		    mem_alloc(inVectorTileSize * sizeof(float));
+		mram_read((__mram_ptr void const *)inVector_m, inVectorTile_w,
+			  256);	// 256 bytes = inVectorTileSize floats, assuming a 4-byte float
+		uint32_t currInVectorTileIdx = 0;	// tile 0 is the one just read from the start of inVector_m
+
+		// Initialize output vector cache
+		uint32_t taskletOutVector_m =
+		    outVector_m + taskletRowsStart * sizeof(float);
+		uint32_t outVectorTileSize = 64;
+		float *outVectorTile_w =
+		    mem_alloc(outVectorTileSize * sizeof(float));
+
+		// SpMV
+		uint32_t nextRowPtr = firstRowPtr;
+		for (uint32_t row = 0; row < taskletNumRows; ++row) {
+
+			// Find row nonzeros
+			taskletRowPtrs_w =
+			    seqread_get(taskletRowPtrs_w, sizeof(uint32_t),
+					&rowPtrReader);
+			uint32_t rowPtr = nextRowPtr;
+			nextRowPtr = *taskletRowPtrs_w;
+			uint32_t taskletNNZ = nextRowPtr - rowPtr;
+
+			// Multiply row with vector
+			float outValue = 0.0f;
+			for (uint32_t nzIdx = 0; nzIdx < taskletNNZ; ++nzIdx) {
+
+				// Get matrix value
+				float matValue = taskletNonzeros_w->value;
+
+				// Get input vector value
+				uint32_t col = taskletNonzeros_w->col;
+				uint32_t inVectorTileIdx =
+				    col / inVectorTileSize;
+				uint32_t inVectorTileOffset =
+				    col % inVectorTileSize;
+				if (inVectorTileIdx != currInVectorTileIdx) {
+					mram_read((__mram_ptr void const
+						   *)(inVector_m +
+						      inVectorTileIdx *
+						      inVectorTileSize *
+						      sizeof(float)),
+						  inVectorTile_w, 256);
+					currInVectorTileIdx = inVectorTileIdx;
+				}
+				float inValue =
+				    inVectorTile_w[inVectorTileOffset];
+
+				// Multiply and add
+				outValue += matValue * inValue;
+
+				// Read next nonzero
+				taskletNonzeros_w = seqread_get(taskletNonzeros_w, sizeof(struct Nonzero), &nonzerosReader);	// Last read will be out of bounds and unused
+
+			}
+
+			// Store output
+			uint32_t outVectorTileIdx = row / outVectorTileSize;
+			uint32_t outVectorTileOffset = row % outVectorTileSize;
+			outVectorTile_w[outVectorTileOffset] = outValue;
+			if (outVectorTileOffset == outVectorTileSize - 1) {	// Last element in tile
+				mram_write(outVectorTile_w,
+					   (__mram_ptr void
+					    *)(taskletOutVector_m +
+					       outVectorTileIdx *
+					       outVectorTileSize *
+					       sizeof(float)), 256);
+			} else if (row == taskletNumRows - 1) {	// Last row for tasklet
+				mram_write(outVectorTile_w,
+					   (__mram_ptr void
+					    *)(taskletOutVector_m +
+					       outVectorTileIdx *
+					       outVectorTileSize *
+					       sizeof(float)),
+					   (taskletNumRows %
+					    outVectorTileSize) * sizeof(float));	// only the rows held by the final, partial tile
+			}
+
+		}
+	}
+
+	return 0;
 }
diff --git a/SpMV/host/app.c b/SpMV/host/app.c
index fe9c751..6cf2861 100644
--- a/SpMV/host/app.c
+++ b/SpMV/host/app.c
@@ -3,9 +3,24 @@
 * SpMV Host Application Source File
 *
 */
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
 
+#ifndef ENERGY
+#define ENERGY 0
+#endif
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #include <assert.h>
 #include <getopt.h>
 #include <stdio.h>
@@ -14,247 +29,301 @@
 #include <unistd.h>
 
 #include "mram-management.h"
-#include "../support/common.h"
-#include "../support/matrix.h"
-#include "../support/params.h"
-#include "../support/timer.h"
-#include "../support/utils.h"
+#include "common.h"
+#include "matrix.h"
+#include "params.h"
+#include "timer.h"
+#include "utils.h"
 
 #define DPU_BINARY "./bin/dpu_code"
 
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
-#ifndef ENERGY
-#define ENERGY 0
-#endif
+// Main of the Host Application
+int main(int argc, char **argv)
+{
+
+	// Process parameters
+	struct Params p = input_params(argc, argv);
+
+	// Timing and profiling
+	Timer timer;
+	double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime =
+	    0.0f, readTime = 0.0f, freeTime = 0.0f;
 #if ENERGY
-#include <dpu_probe.h>
+	struct dpu_probe_t probe;
+	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
 #endif
 
-// Main of the Host Application
-int main(int argc, char** argv) {
-
-    // Process parameters
-    struct Params p = input_params(argc, argv);
-
-    // Timing and profiling
-    Timer timer;
-    double allocTime = 0.0f, loadTime = 0.0f, writeTime = 0.0f, dpuTime = 0.0f, readTime = 0.0f, freeTime = 0.0f;
-    #if ENERGY
-    struct dpu_probe_t probe;
-    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
-    #endif
-
-    // Allocate DPUs and load binary
-    struct dpu_set_t dpu_set, dpu;
-    uint32_t numDPUs, numRanks;
-
-    startTimer(&timer);
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    stopTimer(&timer);
-    allocTime += getElapsedTime(timer);
-
-    startTimer(&timer);
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    stopTimer(&timer);
-    loadTime += getElapsedTime(timer);
-
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
-    assert(numDPUs == NR_DPUS);
-    PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
-    // Initialize SpMV data structures
-    PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
-    struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
-    PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros", cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
-    struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
-    uint32_t numRows = csrMatrix.numRows;
-    uint32_t numCols = csrMatrix.numCols;
-    uint32_t* rowPtrs = csrMatrix.rowPtrs;
-    struct Nonzero* nonzeros = csrMatrix.nonzeros;
-    float* inVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols*sizeof(float)));
-    initVector(inVector, numCols);
-    float* outVector = malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows*sizeof(float)));
-
-    // Partition data structure across DPUs
-    uint32_t numRowsPerDPU = ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1)/numDPUs + 1);
-    PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU", numRowsPerDPU);
-    struct DPUParams dpuParams[numDPUs];
-    unsigned int dpuIdx = 0;
-    PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
-    DPU_FOREACH (dpu_set, dpu) {
-
-        // Allocate parameters
-        struct mram_heap_allocator_t allocator;
-        init_allocator(&allocator);
-        uint32_t dpuParams_m = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
-        // Find DPU's rows
-        uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
-        uint32_t dpuNumRows;
-        if(dpuStartRowIdx > numRows) {
-            dpuNumRows = 0;
-        } else if(dpuStartRowIdx + numRowsPerDPU > numRows) {
-            dpuNumRows = numRows - dpuStartRowIdx;
-        } else {
-            dpuNumRows = numRowsPerDPU;
-        }
-        dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
-        PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
-        PRINT_INFO(p.verbosity >= 2, "        Receives %u rows", dpuNumRows);
-
-        // Partition nonzeros and copy data
-        if(dpuNumRows > 0) {
-
-            // Find DPU's CSR matrix partition
-            uint32_t* dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
-            uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
-            struct Nonzero* dpuNonzeros_h = &nonzeros[dpuRowPtrsOffset];
-            uint32_t dpuNumNonzeros = dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
-
-            // Allocate MRAM
-            uint32_t dpuRowPtrs_m = mram_heap_alloc(&allocator, (dpuNumRows + 1)*sizeof(uint32_t));
-            uint32_t dpuNonzeros_m = mram_heap_alloc(&allocator, dpuNumNonzeros*sizeof(struct Nonzero));
-            uint32_t dpuInVector_m = mram_heap_alloc(&allocator, numCols*sizeof(float));
-            uint32_t dpuOutVector_m = mram_heap_alloc(&allocator, dpuNumRows*sizeof(float));
-            assert((dpuNumRows*sizeof(float))%8 == 0 && "Output sub-vector must be a multiple of 8 bytes!");
-            PRINT_INFO(p.verbosity >= 2, "        Total memory allocated is %d bytes", allocator.totalAllocated);
-
-            // Set up DPU parameters
-            dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
-            dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
-            dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
-            dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
-            dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
-
-            // Send data to DPU
-            PRINT_INFO(p.verbosity >= 2, "        Copying data to DPU");
-            startTimer(&timer);
-            copyToDPU(dpu, (uint8_t*)dpuRowPtrs_h, dpuRowPtrs_m, (dpuNumRows + 1)*sizeof(uint32_t));
-            copyToDPU(dpu, (uint8_t*)dpuNonzeros_h, dpuNonzeros_m, dpuNumNonzeros*sizeof(struct Nonzero));
-            copyToDPU(dpu, (uint8_t*)inVector, dpuInVector_m, numCols*sizeof(float));
-            stopTimer(&timer);
-            writeTime += getElapsedTime(timer);
-
-        }
-
-        // Send parameters to DPU
-        PRINT_INFO(p.verbosity >= 2, "        Copying parameters to DPU");
-        startTimer(&timer);
-        copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m, sizeof(struct DPUParams));
-        stopTimer(&timer);
-        writeTime += getElapsedTime(timer);
-
-        ++dpuIdx;
-
-    }
-    PRINT_INFO(p.verbosity >= 1, "    CPU-DPU Time: %f ms", writeTime*1e3);
-
-    // Run all DPUs
-    PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
-    startTimer(&timer);
-    #if ENERGY
-    DPU_ASSERT(dpu_probe_start(&probe));
-    #endif
-    DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
-    #if ENERGY
-    DPU_ASSERT(dpu_probe_stop(&probe));
-    double energy;
-    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
-    PRINT_INFO(p.verbosity >= 1, "    DPU Energy: %f J", energy);
-    #endif
-    stopTimer(&timer);
-    dpuTime += getElapsedTime(timer);
-    PRINT_INFO(p.verbosity >= 1, "    DPU Time: %f ms", dpuTime*1e3);
-
-    // Copy back result
-    PRINT_INFO(p.verbosity >= 1, "Copying back the result");
-    startTimer(&timer);
-    dpuIdx = 0;
-    DPU_FOREACH (dpu_set, dpu) {
-        unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
-        if(dpuNumRows > 0) {
-            uint32_t dpuStartRowIdx = dpuIdx*numRowsPerDPU;
-            copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m, (uint8_t*)(outVector + dpuStartRowIdx), dpuNumRows*sizeof(float));
-        }
-        ++dpuIdx;
-    }
-    stopTimer(&timer);
-    readTime += getElapsedTime(timer);
-    PRINT_INFO(p.verbosity >= 1, "    DPU-CPU Time: %f ms", readTime*1e3);
-
-    // Calculating result on CPU
-    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
-    float* outVectorReference = malloc(numRows*sizeof(float));
-    for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
-        float sum = 0.0f;
-        for(uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
-            uint32_t colIdx = nonzeros[i].col;
-            float value = nonzeros[i].value;
-            sum += inVector[colIdx]*value;
-        }
-        outVectorReference[rowIdx] = sum;
-    }
-
-    // Verify the result
-    PRINT_INFO(p.verbosity >= 1, "Verifying the result");
-    int status = 1;
-    for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
-        float diff = (outVectorReference[rowIdx] - outVector[rowIdx])/outVectorReference[rowIdx];
-        const float tolerance = 0.00001;
-        if(diff > tolerance || diff < -tolerance) {
-            status = 0;
-            PRINT_ERROR("Mismatch at index %u (CPU result = %f, DPU result = %f)", rowIdx, outVectorReference[rowIdx], outVector[rowIdx]);
-        }
-    }
-
-    startTimer(&timer);
-    DPU_ASSERT(dpu_free(dpu_set));
-    stopTimer(&timer);
-    freeTime += getElapsedTime(timer);
-
-    if (status) {
-        printf("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
-            numDPUs, numRanks, NR_TASKLETS, "float", csrMatrix.numNonzeros);
-        printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
-            allocTime, loadTime, writeTime, dpuTime, readTime, freeTime);
-        printf(" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-            // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
-            csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
-            csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
-        printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-            csrMatrix.numNonzeros * sizeof(float) / ((writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros * sizeof(float) / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
-        printf(" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-            // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
-            csrMatrix.numNonzeros / (dpuTime * 1e6),
-            csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6));
-        printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f",
-            csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros / ((loadTime + writeTime + dpuTime + readTime) * 1e6),
-            csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6));
-    }
-
-    // Display DPU Logs
-    if(p.verbosity >= 2) {
-        PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
-        dpuIdx = 0;
-        DPU_FOREACH (dpu_set, dpu) {
-            PRINT("DPU %u:", dpuIdx);
-            DPU_ASSERT(dpu_log_read(dpu, stdout));
-            ++dpuIdx;
-        }
-    }
-
-    // Deallocate data structures
-    freeCOOMatrix(cooMatrix);
-    freeCSRMatrix(csrMatrix);
-    free(inVector);
-    free(outVector);
-    free(outVectorReference);
-
-    return 0;
+	// Allocate DPUs and load binary
+	struct dpu_set_t dpu_set, dpu;
+	uint32_t numDPUs, numRanks;
+
+	startTimer(&timer);
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+	stopTimer(&timer);
+	allocTime += getElapsedTime(timer);
+
+	startTimer(&timer);
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	stopTimer(&timer);
+	loadTime += getElapsedTime(timer);
+
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+	assert(numDPUs == NR_DPUS);
+	PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+	// Initialize SpMV data structures
+	PRINT_INFO(p.verbosity >= 1, "Reading matrix %s", p.fileName);
+	struct COOMatrix cooMatrix = readCOOMatrix(p.fileName);
+	PRINT_INFO(p.verbosity >= 1, "    %u rows, %u columns, %u nonzeros",
+		   cooMatrix.numRows, cooMatrix.numCols, cooMatrix.numNonzeros);
+	struct CSRMatrix csrMatrix = coo2csr(cooMatrix);
+	uint32_t numRows = csrMatrix.numRows;
+	uint32_t numCols = csrMatrix.numCols;
+	uint32_t *rowPtrs = csrMatrix.rowPtrs;
+	struct Nonzero *nonzeros = csrMatrix.nonzeros;
+	float *inVector =
+	    (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float)));
+	initVector(inVector, numCols);
+	float *outVector =
+	    (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows * sizeof(float)));
+
+	// Partition data structure across DPUs
+	uint32_t numRowsPerDPU =
+	    ROUND_UP_TO_MULTIPLE_OF_2((numRows - 1) / numDPUs + 1);
+	PRINT_INFO(p.verbosity >= 1, "Assigning %u rows per DPU",
+		   numRowsPerDPU);
+	struct DPUParams dpuParams[numDPUs];
+	unsigned int dpuIdx = 0;
+	PRINT_INFO(p.verbosity == 1, "Copying data to DPUs");
+	DPU_FOREACH(dpu_set, dpu) {
+
+		// Allocate parameters
+		struct mram_heap_allocator_t allocator;
+		init_allocator(&allocator);
+		uint32_t dpuParams_m =
+		    mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+		// Find DPU's rows
+		uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+		uint32_t dpuNumRows;
+		if (dpuStartRowIdx > numRows) {
+			dpuNumRows = 0;
+		} else if (dpuStartRowIdx + numRowsPerDPU > numRows) {
+			dpuNumRows = numRows - dpuStartRowIdx;
+		} else {
+			dpuNumRows = numRowsPerDPU;
+		}
+		dpuParams[dpuIdx].dpuNumRows = dpuNumRows;
+		PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
+		PRINT_INFO(p.verbosity >= 2, "        Receives %u rows",
+			   dpuNumRows);
+
+		// Partition nonzeros and copy data
+		if (dpuNumRows > 0) {
+
+			// Find DPU's CSR matrix partition
+			uint32_t *dpuRowPtrs_h = &rowPtrs[dpuStartRowIdx];
+			uint32_t dpuRowPtrsOffset = dpuRowPtrs_h[0];
+			struct Nonzero *dpuNonzeros_h =
+			    &nonzeros[dpuRowPtrsOffset];
+			uint32_t dpuNumNonzeros =
+			    dpuRowPtrs_h[dpuNumRows] - dpuRowPtrsOffset;
+
+			// Allocate MRAM
+			uint32_t dpuRowPtrs_m =
+			    mram_heap_alloc(&allocator,
+					    (dpuNumRows +
+					     1) * sizeof(uint32_t));
+			uint32_t dpuNonzeros_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumNonzeros *
+					    sizeof(struct Nonzero));
+			uint32_t dpuInVector_m =
+			    mram_heap_alloc(&allocator,
+					    numCols * sizeof(float));
+			uint32_t dpuOutVector_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumRows * sizeof(float));
+			assert((dpuNumRows * sizeof(float)) % 8 == 0
+			       &&
+			       "Output sub-vector must be a multiple of 8 bytes!");
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Total memory allocated is %d bytes",
+				   allocator.totalAllocated);
+
+			// Set up DPU parameters
+			dpuParams[dpuIdx].dpuRowPtrsOffset = dpuRowPtrsOffset;
+			dpuParams[dpuIdx].dpuRowPtrs_m = dpuRowPtrs_m;
+			dpuParams[dpuIdx].dpuNonzeros_m = dpuNonzeros_m;
+			dpuParams[dpuIdx].dpuInVector_m = dpuInVector_m;
+			dpuParams[dpuIdx].dpuOutVector_m = dpuOutVector_m;
+
+			// Send data to DPU
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Copying data to DPU");
+			startTimer(&timer);
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuRowPtrs_m, (uint8_t *) dpuRowPtrs_h,
+						ROUND_UP_TO_MULTIPLE_OF_8((dpuNumRows + 1) * sizeof(uint32_t))));
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuNonzeros_m, (uint8_t *) dpuNonzeros_h,
+						ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNonzeros * sizeof(struct Nonzero))));
+			DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuInVector_m, (uint8_t *) inVector,
+						ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float))));
+			stopTimer(&timer);
+			writeTime += getElapsedTime(timer);
+		}
+		// Send parameters to DPU
+		PRINT_INFO(p.verbosity >= 2,
+			   "        Copying parameters to DPU");
+		startTimer(&timer);
+		DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+					dpuParams_m, (uint8_t *) & dpuParams[dpuIdx],
+					ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))));
+		stopTimer(&timer);
+		writeTime += getElapsedTime(timer);
+
+		++dpuIdx;
+
+	}
+	PRINT_INFO(p.verbosity >= 1, "    CPU-DPU Time: %f ms",
+		   writeTime * 1e3);
+
+	// Run all DPUs
+	PRINT_INFO(p.verbosity >= 1, "Booting DPUs");
+	startTimer(&timer);
+#if ENERGY
+	DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+	DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+#if ENERGY
+	DPU_ASSERT(dpu_probe_stop(&probe));
+	double energy;
+	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+	PRINT_INFO(p.verbosity >= 1, "    DPU Energy: %f J", energy);
+#endif
+	stopTimer(&timer);
+	dpuTime += getElapsedTime(timer);
+	PRINT_INFO(p.verbosity >= 1, "    DPU Time: %f ms", dpuTime * 1e3);
+
+	// Copy back result
+	PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+	startTimer(&timer);
+	dpuIdx = 0;
+
+	DPU_FOREACH(dpu_set, dpu) {
+		unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows;
+		if (dpuNumRows > 0) {
+			uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU;
+			DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+						dpuParams[dpuIdx].dpuOutVector_m,
+						(uint8_t *) (outVector + dpuStartRowIdx),
+						ROUND_UP_TO_MULTIPLE_OF_8(dpuNumRows * sizeof(float))));
+		}
+		++dpuIdx;
+	}
+	stopTimer(&timer);
+	readTime += getElapsedTime(timer);
+	PRINT_INFO(p.verbosity >= 1, "    DPU-CPU Time: %f ms", readTime * 1e3);
+
+	// Calculating result on CPU
+	PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+	float *outVectorReference = (float*)malloc(numRows * sizeof(float));
+	for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+		float sum = 0.0f;
+		for (uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) {
+			uint32_t colIdx = nonzeros[i].col;
+			float value = nonzeros[i].value;
+			sum += inVector[colIdx] * value;
+		}
+		outVectorReference[rowIdx] = sum;
+	}
+
+	// Verify the result
+	PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+	int status = 1;
+	for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+		float diff =
+		    (outVectorReference[rowIdx] -
+		     outVector[rowIdx]) / outVectorReference[rowIdx];
+		const float tolerance = 0.00001;
+		if (diff > tolerance || diff < -tolerance) {
+			status = 0;
+			PRINT_ERROR
+			    ("Mismatch at index %u (CPU result = %f, DPU result = %f)",
+			     rowIdx, outVectorReference[rowIdx],
+			     outVector[rowIdx]);
+		}
+	}
+
+	startTimer(&timer);
+	DPU_ASSERT(dpu_free(dpu_set));
+	stopTimer(&timer);
+	freeTime += getElapsedTime(timer);
+
+	if (status) {
+		// The timers accumulate seconds (cf. the "* 1e3 ... ms" messages earlier in main),
+		// so the latency_*_us fields are scaled by 1e6 to match the unit in their names.
+		dfatool_printf
+		    ("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ",
+		     numDPUs, numRanks, NR_TASKLETS, "float", csrMatrix.numNonzeros);
+		dfatool_printf
+		    ("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+		     allocTime * 1e6, loadTime * 1e6, writeTime * 1e6,
+		     dpuTime * 1e6, readTime * 1e6, freeTime * 1e6);
+		dfatool_printf
+		    (" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+		     // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+		     csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6),
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((allocTime + loadTime + writeTime + dpuTime + readTime +
+		       freeTime) * 1e6));
+		dfatool_printf
+		    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros * sizeof(float) /
+		     ((allocTime + loadTime + writeTime + dpuTime +
+		       readTime) * 1e6));
+		dfatool_printf
+		    (" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+		     // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit
+		     csrMatrix.numNonzeros / (dpuTime * 1e6),
+		     csrMatrix.numNonzeros /
+		     ((allocTime + loadTime + writeTime + dpuTime + readTime +
+		       freeTime) * 1e6));
+		dfatool_printf
+		    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+		     csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros /
+		     ((loadTime + writeTime + dpuTime + readTime) * 1e6),
+		     csrMatrix.numNonzeros /
+		     ((allocTime + loadTime + writeTime + dpuTime +
+		       readTime) * 1e6));
+	}
+	// Display DPU Logs
+	if (p.verbosity >= 2) {
+		PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+		dpuIdx = 0;
+		DPU_FOREACH(dpu_set, dpu) {
+			PRINT("DPU %u:", dpuIdx);
+			DPU_ASSERT(dpu_log_read(dpu, stdout));
+			++dpuIdx;
+		}
+	}
+	// Deallocate data structures
+	freeCOOMatrix(cooMatrix);
+	freeCSRMatrix(csrMatrix);
+	free(inVector);
+	free(outVector);
+	free(outVectorReference);
+
+	return 0;
 }
diff --git a/SpMV/host/mram-management.h b/SpMV/host/mram-management.h
index 627dfde..a953d6a 100644
--- a/SpMV/host/mram-management.h
+++ b/SpMV/host/mram-management.h
@@ -1,37 +1,29 @@
+#pragma once
 
-#ifndef _MRAM_MANAGEMENT_H_
-#define _MRAM_MANAGEMENT_H_
+#include "common.h"
+#include "utils.h"
 
-#include "../support/common.h"
-#include "../support/utils.h"
-
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20)	// A DPU's capacity is 64 MiB
 
 struct mram_heap_allocator_t {
-    uint32_t totalAllocated;
+	uint32_t totalAllocated;	// Bytes handed out so far; mram_heap_alloc() returns this as the next offset
 };
 
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
-    allocator->totalAllocated = 0;
-}
-
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
-    uint32_t ret = allocator->totalAllocated;
-    allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
-    if(allocator->totalAllocated > DPU_CAPACITY) {
-        PRINT_ERROR("        Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
-        exit(0);
-    }
-    return ret;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+	allocator->totalAllocated = 0;	// Reset the running total, so the first allocation starts at offset 0
 }
 
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
-    DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+/* Reserves `size` bytes (rounded up to a multiple of 8) and returns the offset of the new region. */
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
+				uint32_t size)
+{
+	uint32_t ret = allocator->totalAllocated;
+	allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
+	if (allocator->totalAllocated > DPU_CAPACITY) {
+		PRINT_ERROR("        Total memory allocated is %u bytes which exceeds the DPU capacity (%d bytes)!",
+			    allocator->totalAllocated, DPU_CAPACITY);
+		exit(1);	// Running out of DPU memory is a failure, so report a nonzero status
+	}
+	return ret;
 }
-
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
-    DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
-}
-
-#endif
-
diff --git a/SpMV/include/common.h b/SpMV/include/common.h
new file mode 100644
index 0000000..6118814
--- /dev/null
+++ b/SpMV/include/common.h
@@ -0,0 +1,24 @@
+
+/* Common data structures between host and DPUs */
+
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#define ROUND_UP_TO_MULTIPLE_OF_2(x)    ((((x) + 1)/2)*2)
+#define ROUND_UP_TO_MULTIPLE_OF_8(x)    ((((x) + 7)/8)*8)
+
+struct DPUParams {
+	uint32_t dpuNumRows;	/* Number of rows assigned to the DPU */
+	uint32_t dpuRowPtrsOffset;	/* Value of the DPU's first row pointer, i.e. index of its first nonzero in the full matrix */
+	uint32_t dpuRowPtrs_m;	/* MRAM heap offset of the DPU's row pointers */
+	uint32_t dpuNonzeros_m;	/* MRAM heap offset of the DPU's nonzeros */
+	uint32_t dpuInVector_m;	/* MRAM heap offset of the input vector */
+	uint32_t dpuOutVector_m;	/* MRAM heap offset of the DPU's output sub-vector */
+};
+
+struct Nonzero {
+	uint32_t col;	/* Column index of the entry (0-based) */
+	float value;	/* Numeric value of the entry */
+};
+
+#endif
diff --git a/SpMV/include/dfatool_host.ah b/SpMV/include/dfatool_host.ah
new file mode 100644
index 0000000..91d44bd
--- /dev/null
+++ b/SpMV/include/dfatool_host.ah
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+	unsigned long n_rows, n_cols, n_nonzero;	// Matrix dimensions captured from the readCOOMatrix() result
+	unsigned int element_size;	// Set to sizeof(float) in the constructor
+
+	virtual int getKernel() { return 1; }
+
+	DfatoolHostTiming() {
+		element_size = sizeof(float);
+	}
+
+	advice call("% input_params(...)"): after() {	// After argument parsing: print the "[>>]" line
+		printf("[>>] SpMV | n_dpus=%u\n", NR_DPUS);
+	}
+
+	advice call("% readCOOMatrix(...)") : after() {	// After the matrix is read: record and print its dimensions
+		struct COOMatrix* c = tjp->result();
+		n_rows = c->numRows;
+		n_cols = c->numCols;
+		n_nonzero = c->numNonzeros;
+		printf("[--] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero);
+	}
+
+	advice execution("% main(...)") : after() {	// After main() has run: print the "[<<]" line with the recorded dimensions
+		printf("[<<] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero);
+	}
+};
diff --git a/SpMV/include/matrix.h b/SpMV/include/matrix.h
new file mode 100644
index 0000000..ce8745e
--- /dev/null
+++ b/SpMV/include/matrix.h
@@ -0,0 +1,138 @@
+
+#ifndef _MATRIX_H_
+#define _MATRIX_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "common.h"
+#include "utils.h"
+
+struct COOMatrix {
+	uint32_t numRows;	// Row count (readCOOMatrix pads an odd count up to the next even number)
+	uint32_t numCols;
+	uint32_t numNonzeros;
+	uint32_t *rowIdxs;	// Row index (0-based) of each nonzero, numNonzeros entries
+	struct Nonzero *nonzeros;	// Column index and value of each nonzero, parallel to rowIdxs
+};
+
+struct CSRMatrix {
+	uint32_t numRows;
+	uint32_t numCols;
+	uint32_t numNonzeros;
+	uint32_t *rowPtrs;	// numRows + 1 entries; row r owns nonzeros[rowPtrs[r]] .. nonzeros[rowPtrs[r + 1] - 1]
+	struct Nonzero *nonzeros;	// Nonzeros grouped by row
+};
+
+static struct COOMatrix readCOOMatrix(const char *fileName)
+{
+	// Parses "<rows> <cols> <nonzeros>" followed by one 1-based "<row> <col>" pair per nonzero; every value is set to 1.0f.
+	struct COOMatrix cooMatrix;
+	// Reads stay outside assert() (they must survive -DNDEBUG); fscanf results are compared with the item count because EOF is nonzero.
+	FILE *fp = fopen(fileName, "r");
+	if (fp == NULL) {
+		PRINT_ERROR("Reading matrix %s: cannot open file!", fileName);
+		exit(1);
+	}
+	if (fscanf(fp, "%u %u %u", &cooMatrix.numRows, &cooMatrix.numCols,
+		   &cooMatrix.numNonzeros) != 3) {
+		PRINT_ERROR("Reading matrix %s: malformed size header!", fileName);
+		exit(1);
+	}
+	if (cooMatrix.numRows % 2 == 1) {
+		PRINT_WARNING
+		    ("Reading matrix %s: number of rows must be even. Padding with an extra row.",
+		     fileName);
+		cooMatrix.numRows++;
+	}
+	cooMatrix.rowIdxs = (uint32_t *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros * sizeof(uint32_t)));
+	cooMatrix.nonzeros = (struct Nonzero *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros * sizeof(struct Nonzero)));
+	for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+		uint32_t rowIdx, colIdx;
+		if (fscanf(fp, "%u %u", &rowIdx, &colIdx) != 2) {
+			PRINT_ERROR("Reading matrix %s: malformed nonzero entry!", fileName);
+			exit(1);
+		}
+		cooMatrix.rowIdxs[i] = rowIdx - 1;	// File format indexes begin at 1
+		cooMatrix.nonzeros[i].col = colIdx - 1;	// File format indexes begin at 1
+		cooMatrix.nonzeros[i].value = 1.0f;
+	}
+	fclose(fp);	// Release the file handle once all entries are read
+
+	return cooMatrix;
+}
+
+static void freeCOOMatrix(struct COOMatrix cooMatrix)
+{
+	free(cooMatrix.rowIdxs);	// Releases both heap arrays; the struct is passed by value,
+	free(cooMatrix.nonzeros);	// so the caller's copy still holds the (now dangling) pointers
+}
+
+static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix)
+{
+	// Builds a newly allocated CSR copy of cooMatrix by grouping its nonzeros by row; the input arrays are only read.
+	struct CSRMatrix csrMatrix;
+
+	// Copy the dimensions and allocate the CSR arrays (sizes rounded up to a multiple of 8 bytes)
+	csrMatrix.numRows = cooMatrix.numRows;
+	csrMatrix.numCols = cooMatrix.numCols;
+	csrMatrix.numNonzeros = cooMatrix.numNonzeros;
+	csrMatrix.rowPtrs =
+	    (uint32_t *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   ((csrMatrix.numRows + 1) * sizeof(uint32_t)));
+	csrMatrix.nonzeros =
+	    (struct Nonzero *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   (csrMatrix.numNonzeros * sizeof(struct Nonzero)));
+
+	// Histogram rowIdxs: afterwards rowPtrs[r] is the number of nonzeros in row r
+	memset(csrMatrix.rowPtrs, 0,
+	       (csrMatrix.numRows + 1) * sizeof(uint32_t));
+	for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+		uint32_t rowIdx = cooMatrix.rowIdxs[i];
+		csrMatrix.rowPtrs[rowIdx]++;
+	}
+
+	// Prefix sum rowPtrs: afterwards rowPtrs[r] is the index of the first nonzero of row r
+	uint32_t sumBeforeNextRow = 0;
+	for (uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
+		uint32_t sumBeforeRow = sumBeforeNextRow;
+		sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
+		csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
+	}
+	csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
+
+	// Bin the nonzeros: each goes to its row's next free slot, which advances rowPtrs[r] to the end of row r
+	for (uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
+		uint32_t rowIdx = cooMatrix.rowIdxs[i];
+		uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
+		csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
+	}
+
+	// Restore rowPtrs: after binning rowPtrs[r] holds the start of row r + 1, so shift every entry up by one
+	for (uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
+		csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
+	}
+	csrMatrix.rowPtrs[0] = 0;
+
+	return csrMatrix;
+
+}
+
+static void freeCSRMatrix(struct CSRMatrix csrMatrix)
+{
+	free(csrMatrix.rowPtrs);	// Releases the two arrays allocated by coo2csr();
+	free(csrMatrix.nonzeros);	// the by-value struct copy itself needs no cleanup
+}
+
+static void initVector(float *vec, uint32_t size)
+{
+	uint32_t remaining = size;	// Fill back to front; every one of the `size` entries ends up as 1.0f
+	while (remaining-- > 0)
+		vec[remaining] = 1.0f;
+}
+
+#endif
diff --git a/SpMV/include/params.h b/SpMV/include/params.h
new file mode 100644
index 0000000..bf60e79
--- /dev/null
+++ b/SpMV/include/params.h
@@ -0,0 +1,51 @@
+
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+#include "utils.h"
+
+static void usage()	/* Prints the command-line help text through PRINT */
+{
+	PRINT("\nUsage:  ./program [options]"
+	      "\n"
+	      "\nBenchmark-specific options:"
+	      "\n    -f <F>    input matrix file name (default=data/bcsstk30.mtx)"
+	      "\n"
+	      "\nGeneral options:"
+	      "\n    -v <V>    verbosity" "\n    -h        help" "\n\n");
+}
+
+typedef struct Params {
+	const char *fileName;	// Input matrix file (-f); input_params defaults it to data/bcsstk30.mtx
+	unsigned int verbosity;	// Verbosity level (-v); input_params defaults it to 1
+} Params;
+
+/* Parses -f <file>, -v <level> and -h; exits with 0 after -h and with 1 on an unrecognized option. */
+static struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.fileName = "data/bcsstk30.mtx";
+	p.verbosity = 1;
+	int opt;
+	while ((opt = getopt(argc, argv, "f:v:h")) >= 0) {
+		switch (opt) {
+		case 'f':
+			p.fileName = optarg;
+			break;
+		case 'v':
+			p.verbosity = atoi(optarg);
+			break;
+		case 'h':
+			usage();
+			exit(0);
+		default:
+			PRINT_ERROR("Unrecognized option!");
+			usage();
+			exit(1);	// Invalid usage is an error, unlike an explicit -h
+		}
+	}
+	return p;
+}
+
+#endif
diff --git a/SpMV/include/timer.h b/SpMV/include/timer.h
new file mode 100644
index 0000000..cb513cb
--- /dev/null
+++ b/SpMV/include/timer.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <stdio.h>
+#include <sys/time.h>
+
+#if DFATOOL_TIMING
+
+#define dfatool_printf(fmt, ...) do { printf(fmt, ##__VA_ARGS__); } while (0)	/* ## permits a bare format string, as PRINT in utils.h does */
+
+typedef struct Timer {
+	struct timeval startTime;	// Written by startTimer()
+	struct timeval endTime;	// Written by stopTimer()
+} Timer;
+
+static void startTimer(Timer *timer)
+{
+	gettimeofday(&(timer->startTime), NULL);	// Record the current time of day as the start mark
+}
+
+static void stopTimer(Timer *timer)
+{
+	gettimeofday(&(timer->endTime), NULL);	// Record the current time of day as the end mark
+}
+
+static double getElapsedTime(Timer timer)
+{
+	// Seconds between the start and end marks: whole seconds plus the microsecond difference.
+	double wholeSeconds = (double)(timer.endTime.tv_sec - timer.startTime.tv_sec);
+	return wholeSeconds + (timer.endTime.tv_usec - timer.startTime.tv_usec) / 1.0e6;
+}
+
+#else
+
+#define dfatool_printf(fmt, ...) do {} while (0)
+
+typedef int Timer;
+
+static void startTimer(Timer* timer)
+{
+	(void)timer;	// Timing disabled (DFATOOL_TIMING is 0): nothing to record
+}
+
+static void stopTimer(Timer* timer)
+{
+	(void)timer;	// Timing disabled (DFATOOL_TIMING is 0): nothing to record
+}
+
+static double getElapsedTime(Timer timer)
+{
+	(void)timer;	// Timing disabled (DFATOOL_TIMING is 0): the argument is ignored
+	return 0.0;	// Always reports zero elapsed time in this configuration
+}
+
+#endif
diff --git a/SpMV/support/utils.h b/SpMV/include/utils.h
index ddb1e2c..ccd8fbd 100644
--- a/SpMV/support/utils.h
+++ b/SpMV/include/utils.h
@@ -8,4 +8,3 @@
 #define PRINT(fmt, ...)             printf(fmt "\n", ##__VA_ARGS__)
 
 #endif
-
diff --git a/SpMV/run-paper-strong-full.sh b/SpMV/run-paper-strong-full.sh
deleted file mode 100755
index 09b7085..0000000
--- a/SpMV/run-paper-strong-full.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SpMV strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-cd data/generate
-./replicate ../bcsstk30.mtx 64 ../bcsstk30.mtx.64.mtx
-cd ../..
-
-# >2048 is not in upstream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
-			# repetition is not part of upstream setup
-			for i in `seq 1 50`; do
-				timeout --foreground -k 1m 3m bin/host_code -v 0 -f data/bcsstk30.mtx.64.mtx || true
-			done
-		fi
-	done
-done
-) | tee log-paper-strong-full.txt
-
-rm -f data/bcsstk30.mtx.64.mtx
diff --git a/SpMV/run-paper-strong-rank.sh b/SpMV/run-paper-strong-rank.sh
deleted file mode 100755
index c73a6a0..0000000
--- a/SpMV/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SpMV strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >64 are not part of upstream
-for nr_dpus in 128 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
-			# repetition is not part of upstream setup
-			for i in `seq 1 50`; do
-				timeout --foreground -k 1m 3m bin/host_code -v 0 || true
-			done
-		fi
-	done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/SpMV/run-paper-weak.sh b/SpMV/run-paper-weak.sh
deleted file mode 100755
index 74683cc..0000000
--- a/SpMV/run-paper-weak.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks SpMV weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 4 16 64; do
-	cd data/generate
-	make
-	./replicate ../bcsstk30.mtx ${nr_dpus} /tmp/bcsstk30.mtx.${nr_dpus}.mtx
-	cd ../..
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
-			# repetition is not part of upstream setup
-			for i in `seq 1 50`; do
-				timeout --foreground -k 1m 3m bin/host_code -v 0 -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx || true
-			done
-		fi
-	done
-	rm -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx
-done |
-) tee log-paper-weak.txt
diff --git a/SpMV/support/common.h b/SpMV/support/common.h
deleted file mode 100644
index 58fede8..0000000
--- a/SpMV/support/common.h
+++ /dev/null
@@ -1,25 +0,0 @@
-
-/* Common data structures between host and DPUs */
-
-#ifndef _COMMON_H_
-#define _COMMON_H_
-
-#define ROUND_UP_TO_MULTIPLE_OF_2(x)    ((((x) + 1)/2)*2)
-#define ROUND_UP_TO_MULTIPLE_OF_8(x)    ((((x) + 7)/8)*8)
-
-struct DPUParams {
-    uint32_t dpuNumRows; /* Number of rows assigned to the DPU */
-    uint32_t dpuRowPtrsOffset; /* Offset of the row pointers */
-    uint32_t dpuRowPtrs_m;
-    uint32_t dpuNonzeros_m;
-    uint32_t dpuInVector_m;
-    uint32_t dpuOutVector_m;
-};
-
-struct Nonzero {
-    uint32_t col;
-    float value;
-};
-
-#endif
-
diff --git a/SpMV/support/matrix.h b/SpMV/support/matrix.h
deleted file mode 100644
index d25da1b..0000000
--- a/SpMV/support/matrix.h
+++ /dev/null
@@ -1,119 +0,0 @@
-
-#ifndef _MATRIX_H_
-#define _MATRIX_H_
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "common.h"
-#include "utils.h"
-
-struct COOMatrix {
-    uint32_t numRows;
-    uint32_t numCols;
-    uint32_t numNonzeros;
-    uint32_t* rowIdxs;
-    struct Nonzero* nonzeros;
-};
-
-struct CSRMatrix {
-    uint32_t numRows;
-    uint32_t numCols;
-    uint32_t numNonzeros;
-    uint32_t* rowPtrs;
-    struct Nonzero* nonzeros;
-};
-
-static struct COOMatrix readCOOMatrix(const char* fileName) {
-
-    struct COOMatrix cooMatrix;
-
-    // Initialize fields
-    FILE* fp = fopen(fileName, "r");
-    assert(fscanf(fp, "%u", &cooMatrix.numRows));
-    if(cooMatrix.numRows%2 == 1) {
-        PRINT_WARNING("Reading matrix %s: number of rows must be even. Padding with an extra row.", fileName);
-        cooMatrix.numRows++;
-    }
-    assert(fscanf(fp, "%u", &cooMatrix.numCols));
-    assert(fscanf(fp, "%u", &cooMatrix.numNonzeros));
-    cooMatrix.rowIdxs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(uint32_t)));
-    cooMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(cooMatrix.numNonzeros*sizeof(struct Nonzero)));
-
-    // Read the nonzeros
-    for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
-        uint32_t rowIdx;
-        assert(fscanf(fp, "%u", &rowIdx));
-        cooMatrix.rowIdxs[i] = rowIdx - 1; // File format indexes begin at 1
-        uint32_t colIdx;
-        assert(fscanf(fp, "%u", &colIdx));
-        cooMatrix.nonzeros[i].col = colIdx - 1; // File format indexes begin at 1
-        cooMatrix.nonzeros[i].value = 1.0f;
-    }
-
-    return cooMatrix;
-
-}
-
-static void freeCOOMatrix(struct COOMatrix cooMatrix) {
-    free(cooMatrix.rowIdxs);
-    free(cooMatrix.nonzeros);
-}
-
-static struct CSRMatrix coo2csr(struct COOMatrix cooMatrix) {
-
-    struct CSRMatrix csrMatrix;
-
-    // Initialize fields
-    csrMatrix.numRows = cooMatrix.numRows;
-    csrMatrix.numCols = cooMatrix.numCols;
-    csrMatrix.numNonzeros = cooMatrix.numNonzeros;
-    csrMatrix.rowPtrs = (uint32_t*) malloc(ROUND_UP_TO_MULTIPLE_OF_8((csrMatrix.numRows + 1)*sizeof(uint32_t)));
-    csrMatrix.nonzeros = (struct Nonzero*) malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrMatrix.numNonzeros*sizeof(struct Nonzero)));
-
-    // Histogram rowIdxs
-    memset(csrMatrix.rowPtrs, 0, (csrMatrix.numRows + 1)*sizeof(uint32_t));
-    for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
-        uint32_t rowIdx = cooMatrix.rowIdxs[i];
-        csrMatrix.rowPtrs[rowIdx]++;
-    }
-
-    // Prefix sum rowPtrs
-    uint32_t sumBeforeNextRow = 0;
-    for(uint32_t rowIdx = 0; rowIdx < csrMatrix.numRows; ++rowIdx) {
-        uint32_t sumBeforeRow = sumBeforeNextRow;
-        sumBeforeNextRow += csrMatrix.rowPtrs[rowIdx];
-        csrMatrix.rowPtrs[rowIdx] = sumBeforeRow;
-    }
-    csrMatrix.rowPtrs[csrMatrix.numRows] = sumBeforeNextRow;
-
-    // Bin the nonzeros
-    for(uint32_t i = 0; i < cooMatrix.numNonzeros; ++i) {
-        uint32_t rowIdx = cooMatrix.rowIdxs[i];
-        uint32_t nnzIdx = csrMatrix.rowPtrs[rowIdx]++;
-        csrMatrix.nonzeros[nnzIdx] = cooMatrix.nonzeros[i];
-    }
-
-    // Restore rowPtrs
-    for(uint32_t rowIdx = csrMatrix.numRows - 1; rowIdx > 0; --rowIdx) {
-        csrMatrix.rowPtrs[rowIdx] = csrMatrix.rowPtrs[rowIdx - 1];
-    }
-    csrMatrix.rowPtrs[0] = 0;
-
-    return csrMatrix;
-
-}
-
-static void freeCSRMatrix(struct CSRMatrix csrMatrix) {
-    free(csrMatrix.rowPtrs);
-    free(csrMatrix.nonzeros);
-}
-
-static void initVector(float* vec, uint32_t size) {
-    for(uint32_t i = 0; i < size; ++i) {
-        vec[i] = 1.0f;
-    }
-}
-
-#endif
-
diff --git a/SpMV/support/params.h b/SpMV/support/params.h
deleted file mode 100644
index b4b696c..0000000
--- a/SpMV/support/params.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-#include "utils.h"
-
-static void usage() {
-    PRINT(  "\nUsage:  ./program [options]"
-            "\n"
-            "\nBenchmark-specific options:"
-            "\n    -f <F>    input matrix file name (default=data/bcsstk30.mtx)"
-            "\n"
-            "\nGeneral options:"
-            "\n    -v <V>    verbosity"
-            "\n    -h        help"
-            "\n\n");
-}
-
-typedef struct Params {
-  const char* fileName;
-  unsigned int verbosity;
-} Params;
-
-static struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.fileName      = "data/bcsstk30.mtx";
-    p.verbosity     = 1;
-    int opt;
-    while((opt = getopt(argc, argv, "f:v:h")) >= 0) {
-        switch(opt) {
-            case 'f': p.fileName    = optarg;       break;
-            case 'v': p.verbosity   = atoi(optarg); break;
-            case 'h': usage(); exit(0);
-            default:
-                      PRINT_ERROR("Unrecognized option!");
-                      usage();
-                      exit(0);
-        }
-    }
-
-    return p;
-}
-
-#endif
-
diff --git a/SpMV/support/timer.h b/SpMV/support/timer.h
deleted file mode 100644
index 66e9842..0000000
--- a/SpMV/support/timer.h
+++ /dev/null
@@ -1,27 +0,0 @@
-
-#ifndef _TIMER_H_
-#define _TIMER_H_
-
-#include <stdio.h>
-#include <sys/time.h>
-
-typedef struct Timer {
-    struct timeval startTime;
-    struct timeval endTime;
-} Timer;
-
-static void startTimer(Timer* timer) {
-    gettimeofday(&(timer->startTime), NULL);
-}
-
-static void stopTimer(Timer* timer) {
-    gettimeofday(&(timer->endTime), NULL);
-}
-
-static double getElapsedTime(Timer timer) {
-    return ((double) ((timer.endTime.tv_sec - timer.startTime.tv_sec)
-                   + (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
-}
-
-#endif
-
diff --git a/TRNS/Makefile b/TRNS/Makefile
index fd3f493..302fcd6 100644
--- a/TRNS/Makefile
+++ b/TRNS/Makefile
@@ -1,18 +1,32 @@
 NR_DPUS ?= 1
 NR_TASKLETS ?= 16
 ENERGY ?= 0
-WITH_ALLOC_OVERHEAD ?= 0
-WITH_LOAD_OVERHEAD ?= 0
-WITH_FREE_OVERHEAD ?= 0
 
-COMMON_INCLUDES := support
 HOST_SOURCES := $(wildcard host/*.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 -march=native `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DENERGY=${ENERGY} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc}
 DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
 ifdef verbose
@@ -24,8 +38,11 @@ all: bin/host_code bin/dpu_code
 bin:
 	${QUIET}mkdir -p bin
 
+# cp/rm are needed to work around AspectC++ not liking symlinks
 bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
 
 bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
diff --git a/TRNS/baselines/cpu/Makefile b/TRNS/baselines/cpu/Makefile
index 236f7bb..2f28738 100644
--- a/TRNS/baselines/cpu/Makefile
+++ b/TRNS/baselines/cpu/Makefile
@@ -32,16 +32,30 @@
 #  THE SOFTWARE.
 # 
 
-NUMA ?= 0
-NUMA_MEMCPY ?= 0
-FLAGS =
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+numa_memcpy ?= 0
 
-ifeq (${NUMA}, 1)
-	FLAGS += -lnuma
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+	CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+	LDFLAGS += -lnuma
 endif
 
 CXX=g++
-CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY}
+CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark}
 
 LIB=-L/usr/lib/ -lm -pthread
 
@@ -52,7 +66,7 @@ EXE=trns
 all: trns
 
 trns: ${SRC}
-	$(CXX) -O2 $(CXX_FLAGS) $(SRC) $(LIB) -o $(EXE) $(FLAGS)
+	$(CXX) -O3 $(CXX_FLAGS) ${CFLAGS} $(SRC) $(LIB) -o $(EXE) ${LDFLAGS}
 
 trns_O0: ${SRC}
 	$(CXX) $(CXX_FLAGS) $(SRC) $(LIB) -o $(EXE)_O0
diff --git a/TRNS/baselines/cpu/main.cpp b/TRNS/baselines/cpu/main.cpp
index c8cccaf..b4cd149 100644
--- a/TRNS/baselines/cpu/main.cpp
+++ b/TRNS/baselines/cpu/main.cpp
@@ -36,9 +36,18 @@
 #include "support/setup.h"
 #include "kernel.h"
 #include "support/common.h"
-#include "support/timer.h"
 #include "support/verify.h"
 
+#if WITH_BENCHMARK
+#include "support/timer.h"
+#else
+#include <string>
+struct Timer {
+    inline void start(std::string name) {(void)name;}
+    inline void stop(std::string name) {(void)name;}
+};
+#endif
+
 #include <unistd.h>
 #include <thread>
 #include <string.h>
@@ -362,6 +371,7 @@ int main(int argc, char **argv) {
             timer.stop("free");
 #endif
 
+#if WITH_BENCHMARK
         if (rep >= p.n_warmup) {
 #if NUMA_MEMCPY
             printf("[::] TRNS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d"
@@ -396,10 +406,8 @@ int main(int argc, char **argv) {
                 timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3"));
 #endif // NUMA_MEMCPY
         }
+#endif // WITH_BENCHMARK
     }
-    //timer.print("Step 1", p.n_reps);
-    //timer.print("Step 2", p.n_reps);
-    //timer.print("Step 3", p.n_reps);
 
     // Verify answer
     //verify(h_local, h_in_backup, M_ * m, N_ * n, 1);
diff --git a/TRNS/baselines/cpu/run-perf.sh b/TRNS/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..f16a3b1
--- /dev/null
+++ b/TRNS/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+
+make -B numa=1
+
+perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./trns -w 0 -r 20 -p 2048 -o 2048 -m 16 -n 8 -t 1 -a 4 -c 4
+perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./trns -w 0 -r 20 -p 2048 -o 2048 -m 16 -n 8 -t 4 -a 4 -c 4
diff --git a/TRNS/benchmark-scripts/ccmcc25-sim.sh b/TRNS/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..a7aa79c
--- /dev/null
+++ b/TRNS/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+# Args: -m m -n n -o M_ -p N_
+#
+# Input: (M_ * m) × (N_ * n) matrix
+# Output: (N_ * n) × (M_ * m) matrix
+# Step 1: transpose (M_ * m) × N_ matrix that consists of tiles of size n
+#   CPU version: explicit
+#   DPU version: implicit (M_ * m write operations of #DPUs * n elements to DPUs)
+# Step 2: transpose m × n matrix; this happens N_ * M_ times.
+#   DPU version: Each tasklet transposes a single m × n matrix / tile.
+#   (16 × 8 tile takes up 1 KiB WRAM)
+# Step 3: Transpose M_ × n matrix that consists of tiles of size m.
+#
+# Note for DPU version: if M_ > #DPUs, steps 1 through 3 are repeated.
+# Number of repetitions == ceil(M_ / #DPUS)
+# For Hetsim benchmarks, we set M_ == #DPUs to simplify the task graph (no repetitions that depend on the number of available DPUs).
+# Just in case, there is also a configuration with M_ == 2048 independent of #DPUs
+#
+# input size: uint64(DPU)/double(CPU) * M_ * m * N_ * n
+# output size: uint64(DPU)/double(CPU) * M_ * m * N_ * n -- on DPU only; CPU version operates in-place
+# Upstream DPU version uses int64_t, -p 2048 -o 12288 -x 1 [implicit -m 16 -n 8]
+# Upstream CPU version uses double,  -p 2556 -o 4096 -m 16 -n 8 and fails with -o 12288 (allocation error)
+#
+# -p         2048 -o 2048 -m 16 -n 8 -> matrix size: 4 GiB
+# -p [64 .. 2304] -o 2048 -m 16 -n 8 -> matrix size: 128 MiB .. 4.5 GiB
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}  \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -w 0 -e 2 -p ${cols} -o ${rows} -m ${tile_rows} -n ${tile_cols}
+}
+
+export -f run_benchmark_nmc
+
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+
+source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks  TRNS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 cols={cols} rows={rows} tile_cols={tile_cols} tile_rows={tile_rows} \
+	::: nr_dpus 1 4 16 32 64 \
+	::: rows 64 128 256 512 \
+	::: cols 64 128 256 512 \
+	::: tile_rows 16 \
+	::: tile_cols 8 \
+>> ${fn}.txt
diff --git a/TRNS/benchmark-scripts/ccmcc25.sh b/TRNS/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..7c66306
--- /dev/null
+++ b/TRNS/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} dfatool_timing=0 aspectc=1 aspectc_timing=1
+	bin/host_code -w 0 -e 4 -p ${cols} -o ${rows} -m ${tile_rows} -n ${tile_cols}
+}
+
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+	echo "prim-benchmarks  TRNS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any cols={cols} rows={rows} tile_cols={tile_cols} tile_rows={tile_rows} \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+		::: rows 1024 2048 4096 \
+		::: cols 64 128 256 512 768 1024 1536 2048 2304 \
+		::: tile_rows 16 \
+		::: tile_cols 8 \
+	>> ${fn}.txt
+
+done
diff --git a/TRNS/dimes-hetsim-hbm.sh b/TRNS/dimes-hetsim-hbm.sh
index e2efaee..cc5dc68 100755
--- a/TRNS/dimes-hetsim-hbm.sh
+++ b/TRNS/dimes-hetsim-hbm.sh
@@ -32,7 +32,7 @@ fn=log/$(hostname)/dimes-hetsim-hbm
 
 (
 
-make -B NUMA=1 NUMA_MEMCPY=1
+make -B numa=1 numa_memcpy=1
 
 echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/3)" >&2
 
@@ -43,10 +43,9 @@ parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
 	:::      ram_in $(seq 0 15) \
 	:::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \
 	:::   ram_local $(seq 0 15) \
-	:::+        cpu $(seq 0 7) $(seq 0 7) \
-	::: input_size 167772160
+	:::+        cpu $(seq 0 7) $(seq 0 7)
 
-make -B NUMA=1
+make -B numa=1
 
 echo "CPU single-node operation (2/3)" >&2
 
diff --git a/TRNS/dimes-hetsim-nmc.sh b/TRNS/dimes-hetsim-nmc.sh
index b5f6f13..80987e7 100755
--- a/TRNS/dimes-hetsim-nmc.sh
+++ b/TRNS/dimes-hetsim-nmc.sh
@@ -73,7 +73,7 @@ parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \
 ) >> ${fn}.txt
 
 cd baselines/cpu
-make -B NUMA=1
+make -B numa=1
 
 (
 
@@ -97,7 +97,7 @@ parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
 
 ) >> ${fn}.txt
 
-make -B NUMA=1 NUMA_MEMCPY=1
+make -B numa=1 numa_memcpy=1
 
 (
 
diff --git a/TRNS/dpu/task.c b/TRNS/dpu/task.c
index 0f5e4be..9c0e0a8 100644
--- a/TRNS/dpu/task.c
+++ b/TRNS/dpu/task.c
@@ -12,7 +12,7 @@
 #include <mutex.h>
 #include <barrier.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 
diff --git a/TRNS/host/app.c b/TRNS/host/app.c
index 452b894..c178a19 100644
--- a/TRNS/host/app.c
+++ b/TRNS/host/app.c
@@ -7,16 +7,32 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
+
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
+#include <dpu_management.h>
+#include <dpu_target_macros.h>
+
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
 #include <math.h>
 
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
+#include "common.h"
+#include "timer.h"
+#include "params.h"
 
 #define XSTR(x) STR(x)
 #define STR(x) #x
@@ -26,18 +42,13 @@
 #define DPU_BINARY "./bin/dpu_code"
 #endif
 
-#if ENERGY
-#include <dpu_probe.h>
-#endif
-
-#include <dpu_management.h>
-#include <dpu_target_macros.h>
-
 // Pointer declaration
 static T* A_host;
 static T* A_backup;
 static T* A_result;
 
+unsigned int kernel = 0;
+
 // Create input arrays
 static void read_input(T* A, unsigned int nr_elements) {
     srand(0);
@@ -84,10 +95,10 @@ int main(int argc, char **argv) {
     N_ = p.exp == 0 ? N_ * NR_DPUS : N_;
 
     // Input/output allocation
-    A_host = malloc(M_ * m * N_ * n * sizeof(T));
-    A_backup = malloc(M_ * m * N_ * n * sizeof(T));
-    A_result = malloc(M_ * m * N_ * n * sizeof(T));
-    T* done_host = malloc(M_ * n); // Host array to reset done array of step 3
+    A_host = (T*)malloc(M_ * m * N_ * n * sizeof(T));
+    A_backup = (T*)malloc(M_ * m * N_ * n * sizeof(T));
+    A_result = (T*)malloc(M_ * m * N_ * n * sizeof(T));
+    T* done_host = (T*)malloc(M_ * n); // Host array to reset done array of step 3
     memset(done_host, 0, M_ * n);
 
     // Create an input file with arbitrary data
@@ -131,6 +142,7 @@ int main(int argc, char **argv) {
                 DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
                 stop(&timer, 2);
                 DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+                DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
             } else if (first_round){
                 start(&timer, 1, 0);
                 DPU_ASSERT(dpu_alloc(active_dpus, NULL, &dpu_set));
@@ -174,8 +186,8 @@ int main(int argc, char **argv) {
                 start(&timer, 5, !first_round);
             }
 
-            unsigned int kernel = 0;
-            dpu_arguments_t input_arguments = {m, n, M_, kernel};
+            kernel = 0;
+            dpu_arguments_t input_arguments = {m, n, M_, (enum kernels)kernel};
             // transfer control instructions to DPUs (run first program part)
             DPU_FOREACH(dpu_set, dpu, i) {
                 DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments));
@@ -215,7 +227,7 @@ int main(int argc, char **argv) {
                 start(&timer, 7, !first_round);
             }
             kernel = 1;
-            dpu_arguments_t input_arguments2 = {m, n, M_, kernel};
+            dpu_arguments_t input_arguments2 = {m, n, M_, (enum kernels)kernel};
             DPU_FOREACH(dpu_set, dpu, i) {
                 DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments2));
             }
@@ -299,7 +311,9 @@ int main(int argc, char **argv) {
         }
         if (status) {
             printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+#if DFATOOL_TIMING
             unsigned long input_size = M_ * m * N_ * n;
+#endif
             if (rep >= p.n_warmup) {
                 /*
                  * timer 0: CPU version
@@ -313,35 +327,35 @@ int main(int argc, char **argv) {
                  * timer 8: run DPU program (second kernel)
                  * timer 9: read transposed matrix
                  */
-                printf("[::] TRNS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%lu numa_node_rank=%d ",
+                dfatool_printf("[::] TRNS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%lu numa_node_rank=%d ",
                     NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), input_size, numa_node_rank);
-                printf("| latency_cpu_us=%f latency_realloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f",
+                dfatool_printf("| latency_cpu_us=%f latency_realloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f",
                     timer.time[0], // CPU
                     timer.time[1], // free + alloc
                     timer.time[2], // load
                     timer.time[3] + timer.time[4] + timer.time[5] + timer.time[7], // write
                     timer.time[6] + timer.time[8], // kernel
                     timer.time[9]); // read
-                printf(" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f latency_write4_us=%f latency_kernel1_us=%f latency_kernel2_us=%f",
+                dfatool_printf(" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f latency_write4_us=%f latency_kernel1_us=%f latency_kernel2_us=%f",
                     timer.time[3],
                     timer.time[4],
                     timer.time[5],
                     timer.time[7],
                     timer.time[6],
                     timer.time[8]);
-                printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+                dfatool_printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
                     input_size * sizeof(T) / timer.time[0],
                     input_size * sizeof(T) / (timer.time[6] + timer.time[8]),
                     input_size * sizeof(T) / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8] + timer.time[9]));
-                printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+                dfatool_printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
                     input_size *  sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8] + timer.time[9]),
                     input_size *  sizeof(T) / (timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8] + timer.time[9]),
                     input_size *  sizeof(T) / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8] + timer.time[9]));
-                printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+                dfatool_printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
                     input_size / timer.time[0],
                     input_size / (timer.time[6] + timer.time[8]),
                     input_size / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8] + timer.time[9]));
-                printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+                dfatool_printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
                     input_size / (timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8] + timer.time[9]),
                     input_size / (timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8] + timer.time[9]),
                     input_size / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8] + timer.time[9]));
diff --git a/TRNS/support/common.h b/TRNS/include/common.h
index 2ba56c5..6a94c62 100755..100644
--- a/TRNS/support/common.h
+++ b/TRNS/include/common.h
@@ -14,16 +14,18 @@
 // Data type
 #define T int64_t
 
+enum kernels {
+	kernel1 = 0,
+	kernel2 = 1,
+	nr_kernels = 2,
+};
+
 // Structures used by both the host and the dpu to communicate information 
 typedef struct {
     uint32_t m;
     uint32_t n;
     uint32_t M_;
-	enum kernels {
-	    kernel1 = 0,
-	    kernel2 = 1,
-	    nr_kernels = 2,
-	} kernel;
+	enum kernels kernel;
 } dpu_arguments_t;
 
 #ifndef ENERGY
diff --git a/TRNS/include/dfatool_host.ah b/TRNS/include/dfatool_host.ah
new file mode 100644
index 0000000..72978cc
--- /dev/null
+++ b/TRNS/include/dfatool_host.ah
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+
+	unsigned int n_rows_outer, n_rows_tile, n_cols_outer, n_cols_tile;
+	unsigned int element_size;
+
+	virtual int getKernel() { return kernel; }
+
+	DfatoolHostTiming() {
+		element_size = sizeof(T);
+	}
+
+	advice call("% input_params(...)") : after() {
+		Params* p = tjp->result();
+		/*
+		 * Input: (M_ * m) × (N_ * n) matrix
+		 */
+		n_rows_outer = p->M_;
+		n_rows_tile = p->m;
+		n_cols_outer = p->N_;
+		n_cols_tile = p->n;
+		printf("[>>] TRNS | n_dpus=%u n_rows_outer=%u n_rows_tile=%u n_cols_outer=%u n_cols_tile=%u\n", NR_DPUS, n_rows_outer, n_rows_tile, n_cols_outer, n_cols_tile);
+	}
+
+	advice call("% trns_host(...)") : after() {
+		printf("[--] TRNS | n_dpus=%u n_rows_outer=%u n_rows_tile=%u n_cols_outer=%u n_cols_tile=%u\n", NR_DPUS, n_rows_outer, n_rows_tile, n_cols_outer, n_cols_tile);
+	}
+
+	advice execution("% main(...)") : after() {
+		printf("[<<] TRNS | n_dpus=%u n_rows_outer=%u n_rows_tile=%u n_cols_outer=%u n_cols_tile=%u\n", NR_DPUS, n_rows_outer, n_rows_tile, n_cols_outer, n_cols_tile);
+	}
+};
diff --git a/TRNS/support/params.h b/TRNS/include/params.h
index 6b7e6f2..385490e 100644
--- a/TRNS/support/params.h
+++ b/TRNS/include/params.h
@@ -21,7 +21,7 @@ static void usage() {
         "\n    -h        help"
         "\n    -w <W>    # of untimed warmup iterations (default=1)"
         "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
+        "\n    -x <X>    Weak (0) or strong (1) scaling (default=1)"
         "\n"
         "\nBenchmark-specific options:"
         "\n    -m <I>    m (default=16 elements)"
@@ -39,7 +39,7 @@ struct Params input_params(int argc, char **argv) {
     p.n             = 8;
     p.n_warmup      = 1;
     p.n_reps        = 3;
-    p.exp           = 0;
+    p.exp           = 1;
 
     int opt;
     while((opt = getopt(argc, argv, "hw:e:x:m:n:o:p:")) >= 0) {
diff --git a/TRNS/include/timer.h b/TRNS/include/timer.h
new file mode 100644
index 0000000..8d5c3d5
--- /dev/null
+++ b/TRNS/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#define N_TIMERS 10
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/TRNS/run-fgbs24a.sh b/TRNS/run-fgbs24a.sh
deleted file mode 100755
index 6ba8993..0000000
--- a/TRNS/run-fgbs24a.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-set -e
-
-mkdir -p $(hostname)
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks TRNS strong-full (dfatool fgbs24a edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 2304 2048 2543; do
-	for nr_tasklets in 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then
-			# upstream uses -p 2048, but then the number of DPUs is always constant...
-			timeout --foreground -k 1m 180m bin/host_code -w 0 -e 100 -p $nr_dpus -o 12288 -x 1 || true
-		fi
-	done
-done
-echo "Completed at $(date)"
-) | tee "$(hostname)/fgbs24a.txt"
diff --git a/TRNS/run-paper-strong-full.sh b/TRNS/run-paper-strong-full.sh
deleted file mode 100755
index 9d3792c..0000000
--- a/TRNS/run-paper-strong-full.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks TRNS strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >2048 is not in upstream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then
-			# upstream uses -p 2048, but then the number of DPUs is always constant...
-			timeout --foreground -k 1m 90m bin/host_code -w 0 -e 40 -p $nr_dpus -o 12288 -x 1 || true
-		fi
-	done
-done
-
-echo "Completed at $(date)"
-
-) | tee "log-$(hostname)-prim-strong-full.txt"
diff --git a/TRNS/run-paper-strong-rank.sh b/TRNS/run-paper-strong-rank.sh
deleted file mode 100755
index f5f00cb..0000000
--- a/TRNS/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks TRNS strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then
-			# upstream uses -p 64, but then the number of DPUs is always constant...
-			timeout --foreground -k 1m 60m bin/host_code -w 0 -e 40 -p $nr_dpus -o 12288 -x 1 || true
-		fi
-	done
-done
-
-echo "Completed at $(date)"
-
-) | tee "log-$(hostname)-prim-strong-rank.txt"
diff --git a/TRNS/run-paper-weak.sh b/TRNS/run-paper-weak.sh
deleted file mode 100755
index f02d7d6..0000000
--- a/TRNS/run-paper-weak.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks TRNS weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then
-			timeout --foreground -k 1m 60m bin/host_code -w 0 -e 40 -p 1 -o 12288 -x 0 || true
-		fi
-	done
-done | tee log-paper-weak.txt
-
-echo "Completed at $(date)"
-
-) | tee "log-$(hostname)-prim-weak.txt"
diff --git a/TRNS/run-rank.sh b/TRNS/run-rank.sh
deleted file mode 100755
index 00f6898..0000000
--- a/TRNS/run-rank.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i: input size (number of elements, not number of bytes!)
-
-(
-
-echo "prim-benchmarks TRNS (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 4 8 16 32 48 64; do
-	for nr_tasklets in 8 12 16; do
-		# 12288 run-paper-weak, run-paper-strong-full
-		for i in 12288; do
-			echo
-			if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then
-				# upstream uses -p 2048 in strong-full, but then the number of DPUs is always constant...
-				timeout --foreground -k 1m 90m bin/host_code -w 0 -e 40 -p 1 -o 12288 -x 0 || true
-			fi
-		done
-	done
-done
-
-echo "Completed at $(date)"
-
-) | tee "log-$(hostname)-rank.txt"
diff --git a/TRNS/run.sh b/TRNS/run.sh
deleted file mode 100755
index 8d574a9..0000000
--- a/TRNS/run.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i: input size (number of elements, not number of bytes!)
-
-(
-
-echo "prim-benchmarks TRNS (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 2542 2304 1 4 8 16 32 64 128 256 512 768 1024 1536 2048; do
-	for nr_tasklets in 8 12 16; do
-		# 12288 run-paper-weak, run-paper-strong-full
-		for i in 12288; do
-			echo
-			if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then
-				# upstream uses -p 2048 in strong-full, but then the number of DPUs is always constant...
-				timeout --foreground -k 1m 90m bin/host_code -w 0 -e 40 -p $nr_dpus -o 12288 -x 1 || true
-			fi
-		done
-	done
-done
-
-echo "Completed at $(date)"
-
-) | tee "log-$(hostname).txt"
diff --git a/TRNS/support/timer.h b/TRNS/support/timer.h
deleted file mode 100755
index 786c687..0000000
--- a/TRNS/support/timer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[10];
-    struct timeval stopTime[10];
-    double         time[10];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
diff --git a/TS/Makefile b/TS/Makefile
index ac081bd..2fce611 100644
--- a/TS/Makefile
+++ b/TS/Makefile
@@ -5,14 +5,31 @@ WITH_ALLOC_OVERHEAD ?= 0
 WITH_LOAD_OVERHEAD ?= 0
 WITH_FREE_OVERHEAD ?= 0
 
-COMMON_INCLUDES := support
 HOST_SOURCES := $(wildcard host/*.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-COMMON_FLAGS := -Wall -Wextra  -g -I${COMMON_INCLUDES} -DBL=${BL}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -lm
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+# Host compiler: ${CC} unless aspectc=1 selects the ag++ wrapper further down
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra  -g -Iinclude -DBL=${BL}
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc} -lm
 DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+# Still unset (aspectc_timing != 1 and not provided by the caller): use -a0
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1)
@@ -25,7 +42,12 @@ bin:
 	${QUIET}mkdir -p bin
 
-bin/ts_host: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+# Depend on the files in include/, not on the directory itself: the host recipe
+# copies dfatool_host_dpu.ah into include/ and removes it again, which would
+# change the directory's timestamp on every build.
+bin/ts_host: ${HOST_SOURCES} $(wildcard include/*) bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
 
-bin/ts_dpu: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+bin/ts_dpu: ${DPU_SOURCES} $(wildcard include/*) bin
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
diff --git a/TS/baselines/cpu/mprofile.h b/TS/baselines/cpu/mprofile.h
index 120c225..bfaf052 100644
--- a/TS/baselines/cpu/mprofile.h
+++ b/TS/baselines/cpu/mprofile.h
@@ -10,5 +10,7 @@
 //#define HBM_ALOC
 //#define RANDOM_DIAGS
 
-int loadTimeSeriesFromFile (std::string infilename, std::vector<DTYPE> &A, int &timeSeriesLength);
-int saveProfileToFile(std::string outfilename, DTYPE * profile, int * profileIndex, int timeSeriesLength, int windowSize);
+int loadTimeSeriesFromFile(std::string infilename, std::vector < DTYPE > &A,
+			   int &timeSeriesLength);
+int saveProfileToFile(std::string outfilename, DTYPE * profile,
+		      int *profileIndex, int timeSeriesLength, int windowSize);
diff --git a/TS/baselines/cpu/run-perf.sh b/TS/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..453b64b
--- /dev/null
+++ b/TS/baselines/cpu/run-perf.sh
@@ -0,0 +1,14 @@
+#!/bin/zsh
+
+# Records perf counters for ./streamp_openmp: 20 rounds, each with one run at
+# OMP_NUM_THREADS=1 and one at OMP_NUM_THREADS=4. The event names are the first
+# space-separated field of every line in ../../../perf-events.txt that does not
+# start with '#'; the zsh (j:,:) flag is there to join them with commas.
+
+# Stop here if the build fails instead of measuring a stale or missing binary.
+make -B NUMA=1 || exit 1
+
+for i in $(seq 1 20); do
+	OMP_NUM_THREADS=1 perf stat record -o t1.${i}.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./streamp_openmp inputs/randomlist10M.txt 256 4 4
+	OMP_NUM_THREADS=4 perf stat record -o t4.${i}.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./streamp_openmp inputs/randomlist10M.txt 256 4 4
+done
diff --git a/TS/benchmark-scripts/ccmcc25-sim.sh b/TS/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..0df03d9
--- /dev/null
+++ b/TS/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# TS benchmark sweep with upmem_env.sh sourced in "simulator" mode: one build
+# and run per combination of DPU count and time-series size, one job at a time.
+
+log_dir="log/$(hostname)"
+log_base="${log_dir}/ccmcc25-sim"
+mkdir -p "${log_dir}"
+
+# Rebuild and run one configuration. Every argument is a name=value pair
+# (nr_dpus, nr_tasklets, ts_size) that "local" turns into a function-local
+# variable; stderr of bin/ts_host is merged into stdout.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=$nr_dpus NR_TASKLETS=$nr_tasklets BL=8 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/ts_host -w 0 -e 5 -n $ts_size 2>&1
+}
+
+# Exported so that the shells started by parallel can call the function
+export -f run_benchmark_nmc
+
+source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator
+
+# Header line (git revision and date) followed by the benchmark output,
+# both appended to the same log file.
+{
+	echo "prim-benchmarks  TS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)"
+	parallel -j1 --eta --joblog "${log_base}.joblog" --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 ts_size={ts_size} \
+		::: nr_dpus 1 2 4 8 16 32 48 64 \
+		::: ts_size 2048 4096 8192 16384 32768
+} >> "${log_base}.txt"
diff --git a/TS/benchmark-scripts/ccmcc25.sh b/TS/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..74c8371
--- /dev/null
+++ b/TS/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Runs the TS benchmark sweep once per UPMEM SDK release listed in the loop
+# below; every release gets its own log file and GNU parallel job log.
+
+log_dir="log/$(hostname)"
+mkdir -p "${log_dir}"
+
+# Rebuild and run one configuration. Every argument is a name=value pair
+# (nr_dpus, nr_tasklets, numa_rank, ts_size) that "local" turns into a
+# function-local variable; stderr of bin/ts_host is merged into stdout.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node $numa_rank
+	make -B NR_DPUS=$nr_dpus NR_TASKLETS=$nr_tasklets BL=8 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/ts_host -w 0 -e 50 -n $ts_size 2>&1
+}
+
+# Exported so that the shells started by parallel can call the function
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	log_base="${log_dir}/ccmcc25-sdk${sdk}"
+
+	source "/opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh"
+
+	# Header line (git revision and date), then the benchmark output
+	{
+		echo "prim-benchmarks  TS  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)"
+		parallel -j1 --eta --joblog "${log_base}.joblog" --resume --header : \
+			run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 numa_rank=any ts_size={ts_size} \
+			::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+			::: ts_size 8388608 16777216 33554432 67108864
+	} >> "${log_base}.txt"
+
+done
diff --git a/TS/dpu/task.c b/TS/dpu/task.c
index d704160..5a756aa 100644
--- a/TS/dpu/task.c
+++ b/TS/dpu/task.c
@@ -18,18 +18,18 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 __host dpu_result_t DPU_RESULTS[NR_TASKLETS];
 
 // Dot product
-static void dot_product(DTYPE *vectorA, DTYPE *vectorA_aux, DTYPE *vectorB, DTYPE * result) {
-
-	for(uint32_t i = 0; i <  BLOCK_SIZE / sizeof(DTYPE); i++)
-	{
-		for(uint32_t j = 0; j < DOTPIP; j++)
-		{
-			if((j + i) > BLOCK_SIZE / sizeof(DTYPE) - 1)
-			{
-				result[j] += vectorA_aux[(j + i) - BLOCK_SIZE / sizeof(DTYPE)]  * vectorB[i];
-			}
-			else
-			{
+static void dot_product(DTYPE *vectorA, DTYPE *vectorA_aux, DTYPE *vectorB,
+			DTYPE *result)
+{
+
+	for (uint32_t i = 0; i < BLOCK_SIZE / sizeof(DTYPE); i++) {
+		for (uint32_t j = 0; j < DOTPIP; j++) {
+			if ((j + i) > BLOCK_SIZE / sizeof(DTYPE) - 1) {
+				result[j] +=
+				    vectorA_aux[(j + i) -
+						BLOCK_SIZE / sizeof(DTYPE)] *
+				    vectorB[i];
+			} else {
 				result[j] += vectorA[j + i] * vectorB[i];
 			}
 		}
@@ -40,43 +40,46 @@ BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 extern int main_kernel1(void);
 
-int(*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
 
-int main(void){
+int main(void)
+{
 	// Kernel
-	return kernels[DPU_INPUT_ARGUMENTS.kernel]();
+	return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
 }
 
 // main_kernel1
-int main_kernel1() {
+int main_kernel1()
+{
 	unsigned int tasklet_id = me();
 #if PRINT
 	printf("tasklet_id = %u\n", tasklet_id);
 #endif
-	if(tasklet_id == 0){
-		mem_reset(); // Reset the heap
+	if (tasklet_id == 0) {
+		mem_reset();	// Reset the heap
 	}
 	// Barrier
 	barrier_wait(&my_barrier);
 
 	// Input arguments
-	uint32_t query_length  = DPU_INPUT_ARGUMENTS.query_length;
-	DTYPE query_mean       = DPU_INPUT_ARGUMENTS.query_mean;
-	DTYPE query_std        = DPU_INPUT_ARGUMENTS.query_std;
+	uint32_t query_length = DPU_INPUT_ARGUMENTS.query_length;
+	DTYPE query_mean = DPU_INPUT_ARGUMENTS.query_mean;
+	DTYPE query_std = DPU_INPUT_ARGUMENTS.query_std;
 	uint32_t slice_per_dpu = DPU_INPUT_ARGUMENTS.slice_per_dpu;
 
 	// Boundaries for current tasklet
-	uint32_t myStartElem = tasklet_id  * (slice_per_dpu / (NR_TASKLETS));
-	uint32_t myEndElem   = myStartElem + (slice_per_dpu / (NR_TASKLETS)) - 1;
+	uint32_t myStartElem = tasklet_id * (slice_per_dpu / (NR_TASKLETS));
+	uint32_t myEndElem = myStartElem + (slice_per_dpu / (NR_TASKLETS)) - 1;
 
 	// Check time series limit
-	if(myEndElem > slice_per_dpu - query_length) myEndElem = slice_per_dpu - query_length;
+	if (myEndElem > slice_per_dpu - query_length)
+		myEndElem = slice_per_dpu - query_length;
 
 	// Starting address of the current processing block in MRAM
 	uint32_t mem_offset = (uint32_t) DPU_MRAM_HEAP_POINTER;
 
 	// Starting address of the query subsequence
-	uint32_t current_mram_block_addr_query = (uint32_t)(mem_offset);
+	uint32_t current_mram_block_addr_query = (uint32_t) (mem_offset);
 	mem_offset += query_length * sizeof(DTYPE);
 
 	// Starting address of the time series slice
@@ -86,18 +89,18 @@ int main_kernel1() {
 
 	// Starting address of the time series means
 	mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE);
-	uint32_t current_mram_block_addr_TSMean = (uint32_t)(mem_offset);
+	uint32_t current_mram_block_addr_TSMean = (uint32_t) (mem_offset);
 
 	// Starting address of the time series standard deviations
 	mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE);
-	uint32_t current_mram_block_addr_TSSigma = (uint32_t)(mem_offset);
+	uint32_t current_mram_block_addr_TSSigma = (uint32_t) (mem_offset);
 
 	// Initialize local caches to store the MRAM blocks
-	DTYPE *cache_TS       = (DTYPE *) mem_alloc(BLOCK_SIZE);
-	DTYPE *cache_TS_aux   = (DTYPE *) mem_alloc(BLOCK_SIZE);
-	DTYPE *cache_query    = (DTYPE *) mem_alloc(BLOCK_SIZE);
-	DTYPE *cache_TSMean   = (DTYPE *) mem_alloc(BLOCK_SIZE);
-	DTYPE *cache_TSSigma  = (DTYPE *) mem_alloc(BLOCK_SIZE);
+	DTYPE *cache_TS = (DTYPE *) mem_alloc(BLOCK_SIZE);
+	DTYPE *cache_TS_aux = (DTYPE *) mem_alloc(BLOCK_SIZE);
+	DTYPE *cache_query = (DTYPE *) mem_alloc(BLOCK_SIZE);
+	DTYPE *cache_TSMean = (DTYPE *) mem_alloc(BLOCK_SIZE);
+	DTYPE *cache_TSSigma = (DTYPE *) mem_alloc(BLOCK_SIZE);
 	DTYPE *cache_dotprods = (DTYPE *) mem_alloc(BLOCK_SIZE);
 
 	// Create result structure pointer
@@ -108,41 +111,56 @@ int main_kernel1() {
 	DTYPE min_distance = DTYPE_MAX;
 	uint32_t min_index = 0;
 
-
-	for(uint32_t i = myStartElem; i < myEndElem; i+= (BLOCK_SIZE / sizeof(DTYPE)))
-	{
-		for(uint32_t d = 0; d < DOTPIP; d++)
+	for (uint32_t i = myStartElem; i < myEndElem;
+	     i += (BLOCK_SIZE / sizeof(DTYPE))) {
+		for (uint32_t d = 0; d < DOTPIP; d++)
 			cache_dotprods[d] = 0;
 
-		current_mram_block_addr_TS    = (uint32_t) starting_offset_ts + (i - myStartElem) * sizeof(DTYPE);
-		current_mram_block_addr_query = (uint32_t) DPU_MRAM_HEAP_POINTER;
-
-		for(uint32_t j = 0; j < (query_length) / (BLOCK_SIZE / sizeof(DTYPE)); j++)
-		{
-			mram_read((__mram_ptr void const *) current_mram_block_addr_TS, cache_TS, BLOCK_SIZE);
-			mram_read((__mram_ptr void const *) current_mram_block_addr_TS + BLOCK_SIZE, cache_TS_aux, BLOCK_SIZE);
-			mram_read((__mram_ptr void const *) current_mram_block_addr_query, cache_query, BLOCK_SIZE);
-
-			current_mram_block_addr_TS    += BLOCK_SIZE;
+		current_mram_block_addr_TS =
+		    (uint32_t) starting_offset_ts + (i -
+						     myStartElem) *
+		    sizeof(DTYPE);
+		current_mram_block_addr_query =
+		    (uint32_t) DPU_MRAM_HEAP_POINTER;
+
+		for (uint32_t j = 0;
+		     j < (query_length) / (BLOCK_SIZE / sizeof(DTYPE)); j++) {
+			mram_read((__mram_ptr void const *)
+				  current_mram_block_addr_TS, cache_TS,
+				  BLOCK_SIZE);
+			mram_read((__mram_ptr void const *)
+				  current_mram_block_addr_TS + BLOCK_SIZE,
+				  cache_TS_aux, BLOCK_SIZE);
+			mram_read((__mram_ptr void const *)
+				  current_mram_block_addr_query, cache_query,
+				  BLOCK_SIZE);
+
+			current_mram_block_addr_TS += BLOCK_SIZE;
 			current_mram_block_addr_query += BLOCK_SIZE;
-			dot_product(cache_TS, cache_TS_aux, cache_query, cache_dotprods);
+			dot_product(cache_TS, cache_TS_aux, cache_query,
+				    cache_dotprods);
 		}
 
-
-		mram_read((__mram_ptr void const *) current_mram_block_addr_TSMean, cache_TSMean, BLOCK_SIZE);
-		mram_read((__mram_ptr void const *) current_mram_block_addr_TSSigma, cache_TSSigma, BLOCK_SIZE);
-		current_mram_block_addr_TSMean  += BLOCK_SIZE;
+		mram_read((__mram_ptr void const *)
+			  current_mram_block_addr_TSMean, cache_TSMean,
+			  BLOCK_SIZE);
+		mram_read((__mram_ptr void const *)
+			  current_mram_block_addr_TSSigma, cache_TSSigma,
+			  BLOCK_SIZE);
+		current_mram_block_addr_TSMean += BLOCK_SIZE;
 		current_mram_block_addr_TSSigma += BLOCK_SIZE;
 
-		for (uint32_t k = 0; k < (BLOCK_SIZE / sizeof(DTYPE)); k++)
-		{
-			distance = 2 * ((DTYPE) query_length - (cache_dotprods[k] - (DTYPE) query_length * cache_TSMean[k]
-						* query_mean) / (cache_TSSigma[k] * query_std));
-
-			if(distance < min_distance)
-			{
-				min_distance =  distance;
-				min_index    =  i + k;
+		for (uint32_t k = 0; k < (BLOCK_SIZE / sizeof(DTYPE)); k++) {
+			distance =
+			    2 * ((DTYPE) query_length -
+				 (cache_dotprods[k] -
+				  (DTYPE) query_length * cache_TSMean[k]
+				  * query_mean) / (cache_TSSigma[k] *
+						   query_std));
+
+			if (distance < min_distance) {
+				min_distance = distance;
+				min_index = i + k;
 			}
 		}
 	}
diff --git a/TS/host/app.c b/TS/host/app.c
index b9faa9c..bfa14df 100644
--- a/TS/host/app.c
+++ b/TS/host/app.c
@@ -7,8 +7,18 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
+
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
+
+#if ASPECTC
+}
+#endif
+
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
@@ -31,23 +41,23 @@
 #define MAX_DATA_VAL 127
 
 static DTYPE tSeries[1 << 26];
-static DTYPE query  [1 << 15];
-static DTYPE AMean  [1 << 26];
-static DTYPE ASigma [1 << 26];
+static DTYPE query[1 << 15];
+static DTYPE AMean[1 << 26];
+static DTYPE ASigma[1 << 26];
 static DTYPE minHost;
 static DTYPE minHostIdx;
 
 // Create input arrays
-static DTYPE *create_test_file(unsigned int ts_elements, unsigned int query_elements) {
+static DTYPE *create_test_file(unsigned int ts_elements,
+			       unsigned int query_elements)
+{
 	srand(0);
 
-	for (uint64_t i = 0; i < ts_elements; i++)
-	{
+	for (uint64_t i = 0; i < ts_elements; i++) {
 		tSeries[i] = i % MAX_DATA_VAL;
 	}
 
-	for (uint64_t i = 0; i < query_elements; i++)
-	{
+	for (uint64_t i = 0; i < query_elements; i++) {
 		query[i] = i % MAX_DATA_VAL;
 	}
 
@@ -55,61 +65,62 @@ static DTYPE *create_test_file(unsigned int ts_elements, unsigned int query_elem
 }
 
 // Compute output in the host
-static void streamp(DTYPE* tSeries, DTYPE* AMean, DTYPE* ASigma, int ProfileLength,
-		DTYPE* query, int queryLength, DTYPE queryMean, DTYPE queryStdDeviation)
+static void streamp(DTYPE *tSeries, DTYPE *AMean, DTYPE *ASigma,
+		    int ProfileLength, DTYPE *query, int queryLength,
+		    DTYPE queryMean, DTYPE queryStdDeviation)
 {
 	DTYPE distance;
 	DTYPE dotprod;
-	minHost    = INT32_MAX;
+	minHost = INT32_MAX;
 	minHostIdx = 0;
 
-	for (int subseq = 0; subseq < ProfileLength; subseq++)
-	{
+	for (int subseq = 0; subseq < ProfileLength; subseq++) {
 		dotprod = 0;
-		for(int j = 0; j < queryLength; j++)
-		{
+		for (int j = 0; j < queryLength; j++) {
 			dotprod += tSeries[j + subseq] * query[j];
 		}
 
-		distance = 2 * (queryLength - (dotprod - queryLength * AMean[subseq]
-					* queryMean) / (ASigma[subseq] * queryStdDeviation));
+		distance =
+		    2 * (queryLength - (dotprod - queryLength * AMean[subseq]
+					* queryMean) / (ASigma[subseq] *
+							queryStdDeviation));
 
-		if(distance < minHost)
-		{
+		if (distance < minHost) {
 			minHost = distance;
 			minHostIdx = subseq;
 		}
 	}
 }
 
-static void compute_ts_statistics(unsigned int timeSeriesLength, unsigned int ProfileLength, unsigned int queryLength)
+static void compute_ts_statistics(unsigned int timeSeriesLength,
+				  unsigned int ProfileLength,
+				  unsigned int queryLength)
 {
-	double* ACumSum = malloc(sizeof(double) * timeSeriesLength);
+	double *ACumSum = (double*)malloc(sizeof(double) * timeSeriesLength);
 	ACumSum[0] = tSeries[0];
 	for (uint64_t i = 1; i < timeSeriesLength; i++)
 		ACumSum[i] = tSeries[i] + ACumSum[i - 1];
-	double* ASqCumSum = malloc(sizeof(double) * timeSeriesLength);
+	double *ASqCumSum = (double*)malloc(sizeof(double) * timeSeriesLength);
 	ASqCumSum[0] = tSeries[0] * tSeries[0];
 	for (uint64_t i = 1; i < timeSeriesLength; i++)
 		ASqCumSum[i] = tSeries[i] * tSeries[i] + ASqCumSum[i - 1];
-	double* ASum = malloc(sizeof(double) * ProfileLength);
+	double *ASum = (double*)malloc(sizeof(double) * ProfileLength);
 	ASum[0] = ACumSum[queryLength - 1];
 	for (uint64_t i = 0; i < timeSeriesLength - queryLength; i++)
 		ASum[i + 1] = ACumSum[queryLength + i] - ACumSum[i];
-	double* ASumSq = malloc(sizeof(double) * ProfileLength);
+	double *ASumSq = (double*)malloc(sizeof(double) * ProfileLength);
 	ASumSq[0] = ASqCumSum[queryLength - 1];
 	for (uint64_t i = 0; i < timeSeriesLength - queryLength; i++)
 		ASumSq[i + 1] = ASqCumSum[queryLength + i] - ASqCumSum[i];
-	double * AMean_tmp = malloc(sizeof(double) * ProfileLength);
+	double *AMean_tmp = (double*)malloc(sizeof(double) * ProfileLength);
 	for (uint64_t i = 0; i < ProfileLength; i++)
 		AMean_tmp[i] = ASum[i] / queryLength;
-	double* ASigmaSq = malloc(sizeof(double) * ProfileLength);
+	double *ASigmaSq = (double*)malloc(sizeof(double) * ProfileLength);
 	for (uint64_t i = 0; i < ProfileLength; i++)
 		ASigmaSq[i] = ASumSq[i] / queryLength - AMean[i] * AMean[i];
-	for (uint64_t i = 0; i < ProfileLength; i++)
-	{
+	for (uint64_t i = 0; i < ProfileLength; i++) {
 		ASigma[i] = sqrt(ASigmaSq[i]);
-		AMean[i]  = (DTYPE) AMean_tmp[i];
+		AMean[i] = (DTYPE) AMean_tmp[i];
 	}
 
 	free(ACumSum);
@@ -121,7 +132,8 @@ static void compute_ts_statistics(unsigned int timeSeriesLength, unsigned int Pr
 }
 
 // Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
 	// Timer declaration
 	Timer timer;
@@ -129,22 +141,28 @@ int main(int argc, char **argv) {
 	struct Params p = input_params(argc, argv);
 	struct dpu_set_t dpu_set, dpu;
 	uint32_t nr_of_dpus;
-    uint32_t nr_of_ranks;
+	uint32_t nr_of_ranks;
 
-    // Allocate DPUs and load binary
+	// Allocate DPUs and load binary
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    timer.time[0] = 0; // alloc
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+#if DFATOOL_TIMING
+	timer.time[0] = 0;	// alloc
+#endif
 #endif
 #if !WITH_LOAD_OVERHEAD
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-    assert(nr_of_dpus == NR_DPUS);
-    timer.time[1] = 0; // load
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+	assert(nr_of_dpus == NR_DPUS);
+#if DFATOOL_TIMING
+	timer.time[1] = 0;	// load
+#endif
 #endif
 #if !WITH_FREE_OVERHEAD
-    timer.time[6] = 0; // free
+#if DFATOOL_TIMING
+	timer.time[6] = 0;	// free
+#endif
 #endif
 
 #if ENERGY
@@ -152,12 +170,15 @@ int main(int argc, char **argv) {
 	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
 #endif
 
-	unsigned long int ts_size =  p.input_size_n;
+	unsigned long int ts_size = p.input_size_n;
 	const unsigned int query_length = p.input_size_m;
 
 	// Size adjustment
-	if(ts_size % (NR_DPUS * NR_TASKLETS*query_length))
-		ts_size = ts_size +  (NR_DPUS * NR_TASKLETS * query_length - ts_size % (NR_DPUS * NR_TASKLETS*query_length));
+	if (ts_size % (NR_DPUS * NR_TASKLETS * query_length))
+		ts_size =
+		    ts_size + (NR_DPUS * NR_TASKLETS * query_length -
+			       ts_size % (NR_DPUS * NR_TASKLETS *
+					  query_length));
 
 	// Create an input file with arbitrary data
 	create_test_file(ts_size, query_length);
@@ -165,30 +186,34 @@ int main(int argc, char **argv) {
 
 	DTYPE query_mean;
 	double queryMean = 0;
-	for(unsigned i = 0; i < query_length; i++) queryMean += query[i];
-	queryMean /= (double) query_length;
+	for (unsigned i = 0; i < query_length; i++)
+		queryMean += query[i];
+	queryMean /= (double)query_length;
 	query_mean = (DTYPE) queryMean;
 
 	DTYPE query_std;
 	double queryStdDeviation;
 	double queryVariance = 0;
-	for(unsigned i = 0; i < query_length; i++)
-	{
-		queryVariance += (query[i] - queryMean) * (query[i] - queryMean);
+	for (unsigned i = 0; i < query_length; i++) {
+		queryVariance +=
+		    (query[i] - queryMean) * (query[i] - queryMean);
 	}
-	queryVariance /= (double) query_length;
+	queryVariance /= (double)query_length;
 	queryStdDeviation = sqrt(queryVariance);
 	query_std = (DTYPE) queryStdDeviation;
 
-	DTYPE *bufferTS     = tSeries;
-	DTYPE *bufferQ      = query;
-	DTYPE *bufferAMean  = AMean;
+	DTYPE *bufferTS = tSeries;
+	DTYPE *bufferQ = query;
+	DTYPE *bufferAMean = AMean;
 	DTYPE *bufferASigma = ASigma;
 
 	uint32_t slice_per_dpu = ts_size / NR_DPUS;
 
 	unsigned int kernel = 0;
-	dpu_arguments_t input_arguments = {ts_size, query_length, query_mean, query_std, slice_per_dpu, 0, kernel};
+	dpu_arguments_t input_arguments =
+	    { (uint32_t)ts_size, query_length, query_mean, query_std, slice_per_dpu, 0,
+		(enum kernels) kernel
+	};
 	uint32_t mem_offset;
 
 	dpu_result_t result;
@@ -199,21 +224,30 @@ int main(int argc, char **argv) {
 
 	for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 
+		if (rep >= p.n_warmup) {
+			start(&timer, 6, 0);
+		}
+		streamp(tSeries, AMean, ASigma, ts_size - query_length - 1,
+			query, query_length, query_mean, query_std);
+		if (rep >= p.n_warmup) {
+			stop(&timer, 6);
+		}
+
 #if WITH_ALLOC_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 0, 0);
 		}
 		DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 0);
 		}
 #endif
 #if WITH_LOAD_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 1, 0);
 		}
 		DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 1);
 		}
 		DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
@@ -225,58 +259,70 @@ int main(int argc, char **argv) {
 			start(&timer, 2, 0);
 		}
 		uint32_t i = 0;
-
-		DPU_FOREACH(dpu_set, dpu) {
-			input_arguments.exclusion_zone = 0;
-
-			DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGUMENTS", 0, (const void *) &input_arguments, sizeof(input_arguments)));
-			i++;
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments));
 		}
+		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT));
 
 		i = 0;
 		mem_offset = 0;
-		DPU_FOREACH(dpu_set, dpu, i)
-		{
+		DPU_FOREACH(dpu_set, dpu, i) {
 			DPU_ASSERT(dpu_prepare_xfer(dpu, bufferQ));
 		}
 
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, query_length * sizeof(DTYPE), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, 0,
+			    query_length * sizeof(DTYPE), DPU_XFER_DEFAULT));
 
 		i = 0;
 
 		mem_offset += query_length * sizeof(DTYPE);
 		DPU_FOREACH(dpu_set, dpu, i) {
-			DPU_ASSERT(dpu_prepare_xfer(dpu, bufferTS + slice_per_dpu * i));
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferTS + slice_per_dpu * i));
 		}
 
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mem_offset,(slice_per_dpu + query_length)*sizeof(DTYPE), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, mem_offset,
+			    (slice_per_dpu + query_length) * sizeof(DTYPE),
+			    DPU_XFER_DEFAULT));
 
 		mem_offset += ((slice_per_dpu + query_length) * sizeof(DTYPE));
 
 		i = 0;
 		DPU_FOREACH(dpu_set, dpu, i) {
-			DPU_ASSERT(dpu_prepare_xfer(dpu, bufferAMean + slice_per_dpu * i));
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferAMean + slice_per_dpu * i));
 		}
 
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mem_offset, (slice_per_dpu + query_length)*sizeof(DTYPE), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, mem_offset,
+			    (slice_per_dpu + query_length) * sizeof(DTYPE),
+			    DPU_XFER_DEFAULT));
 
 		i = 0;
 
 		mem_offset += ((slice_per_dpu + query_length) * sizeof(DTYPE));
 
 		DPU_FOREACH(dpu_set, dpu, i) {
-			DPU_ASSERT(dpu_prepare_xfer(dpu, bufferASigma + slice_per_dpu * i));
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferASigma + slice_per_dpu * i));
 		}
 
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mem_offset, (slice_per_dpu + query_length)*sizeof(DTYPE), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, mem_offset,
+			    (slice_per_dpu + query_length) * sizeof(DTYPE),
+			    DPU_XFER_DEFAULT));
 
 		if (rep >= p.n_warmup) {
 			stop(&timer, 2);
 		}
-
 		// Run kernel on DPUs
-		if (rep >= p.n_warmup)
-		{
+		if (rep >= p.n_warmup) {
 			start(&timer, 3, 0);
 #if ENERGY
 			DPU_ASSERT(dpu_probe_start(&probe));
@@ -285,37 +331,49 @@ int main(int argc, char **argv) {
 
 		DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
 
-		if (rep >= p.n_warmup)
-		{
+		if (rep >= p.n_warmup) {
 			stop(&timer, 3);
 #if ENERGY
 			DPU_ASSERT(dpu_probe_stop(&probe));
 #endif
 		}
 
-		dpu_result_t* results_retrieve[NR_DPUS];
+		dpu_result_t *results_retrieve[NR_DPUS];
 
 		if (rep >= p.n_warmup) {
 			start(&timer, 4, 0);
 		}
 
 		DPU_FOREACH(dpu_set, dpu, i) {
-			results_retrieve[i] = (dpu_result_t*)malloc(NR_TASKLETS * sizeof(dpu_result_t));
+			results_retrieve[i] =
+			    (dpu_result_t *) malloc(NR_TASKLETS *
+						    sizeof(dpu_result_t));
 		}
 
-
 		DPU_FOREACH(dpu_set, dpu, i) {
 			DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i]));
 		}
-		DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_result_t), DPU_XFER_DEFAULT));
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0,
+			    NR_TASKLETS * sizeof(dpu_result_t),
+			    DPU_XFER_DEFAULT));
 
 		i = 0;
 		DPU_FOREACH(dpu_set, dpu, i) {
-			for (unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++) {
-				if(results_retrieve[i][each_tasklet].minValue < result.minValue && results_retrieve[i][each_tasklet].minValue > 0)
-				{
-					result.minValue = results_retrieve[i][each_tasklet].minValue;
-					result.minIndex = (DTYPE)results_retrieve[i][each_tasklet].minIndex + (i * slice_per_dpu);
+			for (unsigned int each_tasklet = 0;
+			     each_tasklet < NR_TASKLETS; each_tasklet++) {
+				if (results_retrieve[i][each_tasklet].minValue <
+				    result.minValue
+				    &&
+				    results_retrieve[i][each_tasklet].minValue >
+				    0) {
+					result.minValue =
+					    results_retrieve[i]
+					    [each_tasklet].minValue;
+					result.minIndex = (DTYPE)
+					    results_retrieve[i]
+					    [each_tasklet].minIndex +
+					    (i * slice_per_dpu);
 				}
 
 			}
@@ -323,11 +381,9 @@ int main(int argc, char **argv) {
 			i++;
 		}
 
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 4);
 		}
-
-
 #if PRINT
 		printf("LOGS\n");
 		DPU_FOREACH(dpu_set, dpu) {
@@ -337,67 +393,89 @@ int main(int argc, char **argv) {
 
 #if WITH_ALLOC_OVERHEAD
 #if WITH_FREE_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			start(&timer, 5, 0);
 		}
 #endif
 		DPU_ASSERT(dpu_free(dpu_set));
 #if WITH_FREE_OVERHEAD
-		if(rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 			stop(&timer, 5);
 		}
 #endif
 #endif
 
-		if (rep >= p.n_warmup) {
-			start(&timer, 6, 0);
-		}
-		streamp(tSeries, AMean, ASigma, ts_size - query_length - 1, query, query_length, query_mean, query_std);
-		if(rep >= p.n_warmup) {
-			stop(&timer, 6);
-		}
-
 		int status = (minHost == result.minValue);
 		if (status) {
-			printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n");
+			printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+			       "] results are equal\n");
 			if (rep >= p.n_warmup) {
-				printf("[::] TS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu",
-					NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(DTYPE), BLOCK_SIZE, ts_size);
-				printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d ",
-					WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD);
-				printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f latency_cpu_us=%f ",
-					timer.time[0], // alloc
-					timer.time[1], // load
-					timer.time[2], // write
-					timer.time[3], // kernel
-					timer.time[4], // read
-					timer.time[5], // free
-					timer.time[6]); // CPU
-				printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-					ts_size * sizeof(DTYPE) / timer.time[6],
-					ts_size * sizeof(DTYPE) / (timer.time[3]),
-					ts_size * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5]));
-				printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-					ts_size * sizeof(DTYPE) / (timer.time[2] + timer.time[3] + timer.time[4]),
-					ts_size * sizeof(DTYPE) / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]),
-					ts_size * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
-				printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-					ts_size / timer.time[6],
-					ts_size / (timer.time[3]),
-					ts_size / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5]));
-				printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
-					ts_size / (timer.time[2] + timer.time[3] + timer.time[4]),
-					ts_size / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]),
-					ts_size / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
+				dfatool_printf
+				    ("[::] TS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu",
+				     NR_DPUS, nr_of_ranks, NR_TASKLETS,
+				     XSTR(DTYPE), BLOCK_SIZE, ts_size);
+				dfatool_printf
+				    (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d ",
+				     WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+				     WITH_FREE_OVERHEAD);
+				dfatool_printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f latency_cpu_us=%f ", timer.time[0],	// alloc
+				       timer.time[1],	// load
+				       timer.time[2],	// write
+				       timer.time[3],	// kernel
+				       timer.time[4],	// read
+				       timer.time[5],	// free
+				       timer.time[6]);	// CPU
+				dfatool_printf
+				    (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+				     ts_size * sizeof(DTYPE) / timer.time[6],
+				     ts_size * sizeof(DTYPE) / (timer.time[3]),
+				     ts_size * sizeof(DTYPE) / (timer.time[0] +
+								timer.time[1] +
+								timer.time[2] +
+								timer.time[3] +
+								timer.time[4] +
+								timer.time[5]));
+				dfatool_printf
+				    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+				     ts_size * sizeof(DTYPE) / (timer.time[2] +
+								timer.time[3] +
+								timer.time[4]),
+				     ts_size * sizeof(DTYPE) / (timer.time[1] +
+								timer.time[2] +
+								timer.time[3] +
+								timer.time[4]),
+				     ts_size * sizeof(DTYPE) / (timer.time[0] +
+								timer.time[1] +
+								timer.time[2] +
+								timer.time[3] +
+								timer.time[4]));
+				dfatool_printf
+				    (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+				     ts_size / timer.time[6],
+				     ts_size / (timer.time[3]),
+				     ts_size / (timer.time[0] + timer.time[1] +
+						timer.time[2] + timer.time[3] +
+						timer.time[4] + timer.time[5]));
+				dfatool_printf
+				    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+				     ts_size / (timer.time[2] + timer.time[3] +
+						timer.time[4]),
+				     ts_size / (timer.time[1] + timer.time[2] +
+						timer.time[3] + timer.time[4]),
+				     ts_size / (timer.time[0] + timer.time[1] +
+						timer.time[2] + timer.time[3] +
+						timer.time[4]));
 			}
 		} else {
-			printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] results differ!\n");
+			printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+			       "] results differ!\n");
 		}
 	}
 
 #if ENERGY
 	double acc_energy, avg_energy, acc_time, avg_time;
-	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+	DPU_ASSERT(dpu_probe_get
+		   (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
 	DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
@@ -407,7 +485,6 @@ int main(int argc, char **argv) {
 	printf("Energy (J): %f J\t", avg_energy);
 #endif
 
-
 #if !WITH_ALLOC_OVERHEAD
 	DPU_ASSERT(dpu_free(dpu_set));
 #endif
diff --git a/TS/support/common.h b/TS/include/common.h
index b120bb1..6d37bdc 100755..100644
--- a/TS/support/common.h
+++ b/TS/include/common.h
@@ -14,30 +14,32 @@
 #define DTYPE int32_t
 #define DTYPE_MAX INT32_MAX
 
-typedef struct  {
+enum kernels {
+	kernel1 = 0,
+	nr_kernels = 1,
+} kernel;
+
+typedef struct {
 	uint32_t ts_length;
-    uint32_t query_length;
-    DTYPE query_mean;
-    DTYPE query_std;
-    uint32_t slice_per_dpu;
-    int32_t exclusion_zone;
-    enum kernels {
-		kernel1 = 0,
-		nr_kernels = 1,
-	} kernel;
-}dpu_arguments_t;
+	uint32_t query_length;
+	DTYPE query_mean;
+	DTYPE query_std;
+	uint32_t slice_per_dpu;
+	int32_t exclusion_zone;
+	enum kernels kernel;
+} dpu_arguments_t;
 
-typedef struct  {
-    DTYPE minValue;
-    uint32_t minIndex;
-    DTYPE maxValue;
-    uint32_t maxIndex;
-}dpu_result_t;
+typedef struct {
+	DTYPE minValue;
+	uint32_t minIndex;
+	DTYPE maxValue;
+	uint32_t maxIndex;
+} dpu_result_t;
 
 #ifndef ENERGY
 #define ENERGY 0
 #endif
-#define PRINT 0 
+#define PRINT 0
 
 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
diff --git a/TS/include/dfatool_host.ah b/TS/include/dfatool_host.ah
new file mode 100644
index 0000000..4192c73
--- /dev/null
+++ b/TS/include/dfatool_host.ah
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+
+	unsigned long ts_size, query_length;
+	unsigned int element_size;
+
+	virtual int getKernel() { return kernel; }
+
+	DfatoolHostTiming() {
+		element_size = sizeof(DTYPE);
+	}
+
+	advice call("% input_params(...)") : after() {
+		Params* p = tjp->result();
+		ts_size = p->input_size_n;
+		query_length = p->input_size_m;
+		printf("[>>] TS | n_dpus=%u n_elements_ts=%lu n_elements_query=%lu\n", NR_DPUS, ts_size, query_length);
+	}
+
+	advice call("% streamp(...)") : before() {
+		printf("[--] TS | n_dpus=%u n_elements_ts=%lu n_elements_query=%lu\n", NR_DPUS, ts_size, query_length);
+	}
+
+	advice execution("% main(...)") : after() {
+		printf("[<<] TS | n_dpus=%u n_elements_ts=%lu n_elements_query=%lu\n", NR_DPUS, ts_size, query_length);
+	}
+};
diff --git a/TS/include/params.h b/TS/include/params.h
new file mode 100644
index 0000000..b7d9763
--- /dev/null
+++ b/TS/include/params.h
@@ -0,0 +1,67 @@
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+
+// Params ---------------------------------------------------------------------
+typedef struct Params {
+	unsigned long input_size_n;
+	unsigned long input_size_m;
+	int n_warmup;
+	int n_reps;
+} Params;
+
+void usage()
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -n <n>    n (TS length. Default=64K elements)"
+		"\n    -m <m>    m (Query length. Default=256 elements)" "\n");
+}
+// Parse the command line into a Params struct; options not given keep the defaults assigned below.
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.input_size_n = 1 << 16;	// default TS length (-n): 64K elements
+	p.input_size_m = 1 << 8;	// default query length (-m): 256 elements
+
+	p.n_warmup = 1;
+	p.n_reps = 3;
+
+	int opt;
+	while ((opt = getopt(argc, argv, "hw:e:n:m:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 'n':
+			p.input_size_n = atol(optarg);
+			break;
+		case 'm':
+			p.input_size_m = atol(optarg);
+			break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(EXIT_FAILURE);	// invalid usage must not report success
+		}
+	}
+	assert(NR_DPUS > 0 && "Invalid # of dpus!");
+
+	return p;
+}
+#endif
diff --git a/TS/include/timer.h b/TS/include/timer.h
new file mode 100644
index 0000000..7b80823
--- /dev/null
+++ b/TS/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#define N_TIMERS 7
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/TS/run-paper-strong-full.sh b/TS/run-paper-strong-full.sh
deleted file mode 100755
index 5b7656d..0000000
--- a/TS/run-paper-strong-full.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks TS strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >2048 is not part of upstream
-# 12 tasklets are not part of upstream (code does not work with 16…)
-for nr_dpus in 2543 2304 256 512 1024 2048; do
-	for nr_tasklets in 1 2 4 8 12 16; do
-		echo
-		# upstream code did not respect $BL in the makefile and used 256B (BL=8) instead.
-		# This appears to be faster than BL=10.
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8; then
-			timeout --foreground -k 1m 30m bin/ts_host -w 0 -e 100 -n 33554432 || true
-		fi
-	done
-done
-) | tee log-paper-strong-full.txt
diff --git a/TS/run-paper-strong-rank.sh b/TS/run-paper-strong-rank.sh
deleted file mode 100755
index 58ad641..0000000
--- a/TS/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks TS strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >64 are not part of upstream config space
-for nr_dpus in 128 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		# upstream code did not respect $BL in the makefile and used 256B (BL=8) instead.
-		# BL=10 appears to be slightly faster.
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
-			timeout --foreground -k 1m 60m bin/ts_host -w 0 -e 50 -n 524288 || true
-		fi
-	done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/TS/run-paper-weak.sh b/TS/run-paper-weak.sh
deleted file mode 100755
index 64892f4..0000000
--- a/TS/run-paper-weak.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
-# T: data type
-# -w: number of un-timed warmup iterations
-# -e: number of timed iterations
-# -i; ignored, always uses 262144 elements
-
-(
-
-echo "prim-benchmarks TS weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# 256 and 512 are not part of upstream
-for nr_dpus in 1 4 16 64; do
-	for nr_tasklets in 1 2 4 8 16; do
-		echo
-		# upstream code did not respect $BL in the makefile and used 256B (BL=8) instead.
-		# BL=10 appears to be slightly faster.
-		if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8; then
-			i=$(( nr_dpus * 524288 ))
-			timeout --foreground -k 1m 30m bin/ts_host -w 0 -e 100 -n $i || true
-		fi
-	done
-done
-) | tee log-paper-weak.txt
diff --git a/TS/support/params.h b/TS/support/params.h
deleted file mode 100644
index 4668604..0000000
--- a/TS/support/params.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-
-// Params ---------------------------------------------------------------------
-typedef struct Params {
-  unsigned long  input_size_n;
-  unsigned long  input_size_m;
-  int  n_warmup;
-  int  n_reps;
-}Params;
-
-void usage() {
-  fprintf(stderr,
-    "\nUsage:  ./program [options]"
-    "\n"
-    "\nGeneral options:"
-    "\n    -h        help"
-    "\n    -w <W>    # of untimed warmup iterations (default=1)"
-    "\n    -e <E>    # of timed repetition iterations (default=3)"
-    "\n"
-    "\nBenchmark-specific options:"
-    "\n    -n <n>    n (TS length. Default=64K elements)"
-    "\n    -m <m>    m (Query length. Default=256 elements)"
-    "\n");
-  }
-
-  struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.input_size_n  = 1 << 16;
-    p.input_size_m  = 1 << 8;
-
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-
-    int opt;
-    while((opt = getopt(argc, argv, "hw:e:n:m:")) >= 0) {
-      switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break;
-        case 'n': p.input_size_n  = atol(optarg); break;
-        case 'm': p.input_size_m  = atol(optarg); break;
-        default:
-        fprintf(stderr, "\nUnrecognized option!\n");
-        usage();
-        exit(0);
-      }
-    }
-    assert(NR_DPUS > 0 && "Invalid # of dpus!");
-
-    return p;
-  }
-#endif
diff --git a/TS/support/timer.h b/TS/support/timer.h
deleted file mode 100755
index ff1ae1b..0000000
--- a/TS/support/timer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[7];
-    struct timeval stopTime[7];
-    double         time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
diff --git a/VA/Makefile b/VA/Makefile
index 040dd4a..a67c600 100644
--- a/VA/Makefile
+++ b/VA/Makefile
@@ -8,17 +8,34 @@ WITH_LOAD_OVERHEAD ?= 0
 WITH_FREE_OVERHEAD ?= 0
 WITH_DPUINFO ?= 0
 
-COMMON_INCLUDES := support
 HOST_SOURCES := $(wildcard host/*.c)
 DPU_SOURCES := $(wildcard dpu/*.c)
 
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc}
 DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE}
 
+ifeq (${aspectc_timing}, 1)
+	ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+	HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+	HOST_FLAGS += -std=c11
+endif
+
 QUIET = @
 
-ifdef verbose
+ifeq (${verbose}, 1)
 	QUIET =
 endif
 
@@ -27,10 +44,13 @@ all: bin/host_code bin/dpu_code
 bin:
 	${QUIET}mkdir -p bin
 
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
-	${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+# cp/rm work around AspectC++ not liking symlinks; they touch include/, so depend on its files (not the dir) and keep bin order-only
+bin/host_code: ${HOST_SOURCES} $(wildcard include/*.h include/*.ah) | bin
+	${QUIET}cp ../include/dfatool_host_dpu.ah include
+	${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+	${QUIET}rm -f include/dfatool_host_dpu.ah
 
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+bin/dpu_code: ${DPU_SOURCES} $(wildcard include/*.h) | bin
 	${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
 
 clean:
diff --git a/VA/baselines/cpu/Makefile b/VA/baselines/cpu/Makefile
index 76a82e1..04aacb6 100644
--- a/VA/baselines/cpu/Makefile
+++ b/VA/baselines/cpu/Makefile
@@ -1,9 +1,23 @@
-NUMA ?= 0
-NUMA_MEMCPY ?= 0
-FLAGS =
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+numa_memcpy ?= 0
+
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+	CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
 
-ifeq (${NUMA}, 1)
-	FLAGS += -lnuma
+ifeq (${numa}, 1)
+	LDFLAGS += -lnuma
 endif
 
 .PHONY: all
@@ -12,7 +26,7 @@ all: va
 TYPE ?= int32_t
 
 va: app_baseline.c
-	gcc -Wall -Wextra -pedantic -march=native -O2 -o va -fopenmp -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} -DT=${TYPE} app_baseline.c ${FLAGS}
+	gcc -Wall -Wextra -pedantic ${CFLAGS} -O3 -o va -fopenmp -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -DT=${TYPE} app_baseline.c ${LDFLAGS}
 
 va_O0: app_baseline.c
 	gcc -o va_O0 -fopenmp app_baseline.c
diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c
index 4c8610a..7975200 100644
--- a/VA/baselines/cpu/app_baseline.c
+++ b/VA/baselines/cpu/app_baseline.c
@@ -13,13 +13,19 @@
 #include <stdint.h>
 
 #include <omp.h>
+
+#if WITH_BENCHMARK
 #include "../../support/timer.h"
+#else
+#define start(...)
+#define stop(...)
+#endif
 
 #if NUMA
 #include <numaif.h>
 #include <numa.h>
 
-void* mp_pages[1];
+void *mp_pages[1];
 int mp_status[1];
 int mp_nodes[1];
 int numa_node_in = -1;
@@ -49,301 +55,345 @@ static T *B_local;
 /**
 * @brief compute output in the host
 */
-static void vector_addition_host(unsigned int nr_elements, int t) {
-    omp_set_num_threads(t);
-    #pragma omp parallel for
-    for (int i = 0; i < nr_elements; i++) {
+static void vector_addition_host(unsigned long nr_elements, int t)
+{
+	omp_set_num_threads(t);
+#pragma omp parallel for
+	for (unsigned long i = 0; i < nr_elements; i++) {	// index type matches nr_elements (no signed/unsigned compare)
 #if NUMA_MEMCPY
-        C[i] = A_local[i] + B_local[i];
+		C[i] = A_local[i] + B_local[i];
 #else
-        C[i] = A[i] + B[i];
+		C[i] = A[i] + B[i];
 #endif
-    }
+	}
 }
 
 // Params ---------------------------------------------------------------------
 typedef struct Params {
-    int   input_size;
-    int   n_warmup;
-    int   n_reps;
-    int   exp;
-    int   n_threads;
+	long input_size;
+	int n_warmup;
+	int n_reps;
+	int exp;
+	int n_threads;
 #if NUMA
-    struct bitmask* bitmask_in;
-    struct bitmask* bitmask_out;
-    int numa_node_cpu;
+	struct bitmask *bitmask_in;
+	struct bitmask *bitmask_out;
+	int numa_node_cpu;
 #endif
 #if NUMA_MEMCPY
-    int numa_node_cpu_memcpy;
-    struct bitmask* bitmask_cpu;
+	int numa_node_cpu_memcpy;
+	struct bitmask *bitmask_cpu;
 #endif
-}Params;
-
-void usage() {
-    fprintf(stderr,
-        "\nUsage:  ./program [options]"
-        "\n"
-        "\nGeneral options:"
-        "\n    -h        help"
-        "\n    -t <T>    # of threads (default=8)"
-        "\n    -w <W>    # of untimed warmup iterations (default=1)"
-        "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
-        "\n"
-        "\nBenchmark-specific options:"
-        "\n    -i <I>    input size (default=8M elements)"
-        "\n");
+} Params;
+
+void usage()
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -t <T>    # of threads (default=8)"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -i <I>    input size (default=8M elements)" "\n");
 }
 
-struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.input_size    = 16777216;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-    p.exp           = 1;
-    p.n_threads     = 5;
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.input_size = 16777216;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+	p.exp = 1;
+	p.n_threads = 5;
 #if NUMA
-    p.bitmask_in     = NULL;
-    p.bitmask_out    = NULL;
-    p.numa_node_cpu  = -1;
+	p.bitmask_in = NULL;
+	p.bitmask_out = NULL;
+	p.numa_node_cpu = -1;
 #endif
 #if NUMA_MEMCPY
-    p.numa_node_cpu_memcpy  = -1;
-    p.bitmask_cpu    = NULL;
+	p.numa_node_cpu_memcpy = -1;
+	p.bitmask_cpu = NULL;
 #endif
 
-    int opt;
-    while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) {
-        switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'i': p.input_size    = atoi(optarg); break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break;
-        case 'x': p.exp           = atoi(optarg); break;
-        case 't': p.n_threads     = atoi(optarg); break;
+	int opt;
+	while ((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'i':
+			p.input_size = atol(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 'x':
+			p.exp = atoi(optarg);
+			break;
+		case 't':
+			p.n_threads = atoi(optarg);
+			break;
 #if NUMA
-        case 'a': p.bitmask_in    = numa_parse_nodestring(optarg); break;
-        case 'b': p.bitmask_out   = numa_parse_nodestring(optarg); break;
-        case 'c': p.numa_node_cpu = atoi(optarg); break;
+		case 'a':
+			p.bitmask_in = numa_parse_nodestring(optarg);
+			break;
+		case 'b':
+			p.bitmask_out = numa_parse_nodestring(optarg);
+			break;
+		case 'c':
+			p.numa_node_cpu = atoi(optarg);
+			break;
 #if NUMA_MEMCPY
-        case 'C': p.bitmask_cpu   = numa_parse_nodestring(optarg); break;
-        case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break;
-#endif // NUMA_MEMCPY
-#endif // NUMA
-        default:
-            fprintf(stderr, "\nUnrecognized option!\n");
-            usage();
-            exit(0);
-        }
-    }
-    assert(p.n_threads > 0 && "Invalid # of ranks!");
-
-    return p;
+		case 'C':
+			p.bitmask_cpu = numa_parse_nodestring(optarg);
+			break;
+		case 'M':
+			p.numa_node_cpu_memcpy = atoi(optarg);
+			break;
+#endif				// NUMA_MEMCPY
+#endif				// NUMA
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(EXIT_FAILURE);	// invalid usage must not report success
+		}
+	}
+	assert(p.n_threads > 0 && "Invalid # of ranks!");
+
+	return p;
 }
 
 /**
 * @brief Main of the Host Application.
 */
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
-    struct Params p = input_params(argc, argv);
+	struct Params p = input_params(argc, argv);
 
-    const unsigned int input_size = p.exp == 0 ? p.input_size * p.n_threads : p.input_size;
+	const unsigned long input_size =
+	    p.exp == 0 ? p.input_size * p.n_threads : p.input_size;
 
-    // Create an input file with arbitrary data.
+	// Create an input file with arbitrary data.
     /**
     * @brief creates a "test file" by filling a buffer of 64MB with pseudo-random values
     * @param nr_elements how many 32-bit elements we want the file to be
     * @return the buffer address
     */
-    srand(0);
+	srand(0);
 
 #if NUMA
-    if (p.bitmask_in) {
-        numa_set_membind(p.bitmask_in);
-        numa_free_nodemask(p.bitmask_in);
-    }
-    A = (T*) numa_alloc(input_size * sizeof(T));
-    B = (T*) numa_alloc(input_size * sizeof(T));
+	if (p.bitmask_in) {
+		numa_set_membind(p.bitmask_in);
+		numa_free_nodemask(p.bitmask_in);
+	}
+	A = (T *) numa_alloc(input_size * sizeof(T));
+	B = (T *) numa_alloc(input_size * sizeof(T));
 #else
-    A = (T*) malloc(input_size * sizeof(T));
-    B = (T*) malloc(input_size * sizeof(T));
+	A = (T *) malloc(input_size * sizeof(T));
+	B = (T *) malloc(input_size * sizeof(T));
 #endif
 
 #if NUMA
-    if (p.bitmask_out) {
-        numa_set_membind(p.bitmask_out);
-        numa_free_nodemask(p.bitmask_out);
-    }
-    C = (T*) numa_alloc(input_size * sizeof(T));
+	if (p.bitmask_out) {
+		numa_set_membind(p.bitmask_out);
+		numa_free_nodemask(p.bitmask_out);
+	}
+	C = (T *) numa_alloc(input_size * sizeof(T));
 #else
-    C = (T*) malloc(input_size * sizeof(T));
+	C = (T *) malloc(input_size * sizeof(T));
 #endif
 
-    for (unsigned int i = 0; i < input_size; i++) {
-        A[i] = (T) (rand());
-        B[i] = (T) (rand());
-    }
+	for (unsigned long i = 0; i < input_size; i++) {
+		A[i] = (T) (rand());
+		B[i] = (T) (rand());
+	}
 
 #if NUMA
 #if NUMA_MEMCPY
-    if (p.bitmask_cpu) {
-        numa_set_membind(p.bitmask_cpu);
-        numa_free_nodemask(p.bitmask_cpu);
-    }
+	if (p.bitmask_cpu) {
+		numa_set_membind(p.bitmask_cpu);
+		numa_free_nodemask(p.bitmask_cpu);
+	}
 #else
-    struct bitmask *bitmask_all = numa_allocate_nodemask();
-    numa_bitmask_setall(bitmask_all);
-    numa_set_membind(bitmask_all);
-    numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
-#endif // NUMA
+	struct bitmask *bitmask_all = numa_allocate_nodemask();
+	numa_bitmask_setall(bitmask_all);
+	numa_set_membind(bitmask_all);
+	numa_free_nodemask(bitmask_all);
+#endif				// NUMA_MEMCPY
+#endif				// NUMA
 
 #if NUMA
-    mp_pages[0] = A;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(A)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_in = mp_status[0];
-    }
-
-    mp_pages[0] = C;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(C)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_out = mp_status[0];
-    }
-
-    numa_node_cpu = p.numa_node_cpu;
-    if (p.numa_node_cpu != -1) {
-        if (numa_run_on_node(p.numa_node_cpu) == -1) {
-            perror("numa_run_on_node");
-            numa_node_cpu = -1;
-        }
-    }
+	mp_pages[0] = A;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(A)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_in = mp_status[0];
+	}
+
+	mp_pages[0] = C;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(C)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_out = mp_status[0];
+	}
+
+	numa_node_cpu = p.numa_node_cpu;
+	if (p.numa_node_cpu != -1) {
+		if (numa_run_on_node(p.numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
 #endif
 
 #if NUMA_MEMCPY
-    numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+	numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+				 || (numa_node_cpu + 8 == numa_node_in)) * 1;
+#endif
+
+#if WITH_BENCHMARK
+	Timer timer;
 #endif
 
-    Timer timer;
+#if NOP_SYNC
+	for (int rep = 0; rep < 200000; rep++) {
+		asm volatile ("nop"::);
+	}
+#endif
 
-    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+	for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 
 #if NUMA_MEMCPY
-        numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
-        start(&timer, 1, 0);
-        if (!numa_node_in_is_local) {
-            A_local = (T*) numa_alloc(input_size * sizeof(T));
-            B_local = (T*) numa_alloc(input_size * sizeof(T));
-        }
-        stop(&timer, 1);
-        if (!numa_node_in_is_local) {
-            if (p.numa_node_cpu_memcpy != -1) {
-                if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
-                    perror("numa_run_on_node");
-                    numa_node_cpu_memcpy = -1;
-                }
-            }
-        }
-        start(&timer, 2, 0);
-        if (!numa_node_in_is_local) {
-            memcpy(A_local, A, input_size * sizeof(T));
-            memcpy(B_local, B, input_size * sizeof(T));
-        } else {
-            A_local = A;
-            B_local = B;
-        }
-        stop(&timer, 2);
-        if (p.numa_node_cpu != -1) {
-            if (numa_run_on_node(p.numa_node_cpu) == -1) {
-                perror("numa_run_on_node");
-                numa_node_cpu = -1;
-            }
-        }
-        mp_pages[0] = A_local;
-        if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-            perror("move_pages(A_local)");
-        }
-        else if (mp_status[0] < 0) {
-            printf("move_pages error: %d", mp_status[0]);
-        }
-        else {
-            numa_node_local = mp_status[0];
-        }
+		numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
+		start(&timer, 1, 0);
+		if (!numa_node_in_is_local) {
+			A_local = (T *) numa_alloc(input_size * sizeof(T));
+			B_local = (T *) numa_alloc(input_size * sizeof(T));
+		}
+		stop(&timer, 1);
+		if (!numa_node_in_is_local) {
+			if (p.numa_node_cpu_memcpy != -1) {
+				if (numa_run_on_node(p.numa_node_cpu_memcpy) ==
+				    -1) {
+					perror("numa_run_on_node");
+					numa_node_cpu_memcpy = -1;
+				}
+			}
+		}
+		start(&timer, 2, 0);
+		if (!numa_node_in_is_local) {
+			memcpy(A_local, A, input_size * sizeof(T));
+			memcpy(B_local, B, input_size * sizeof(T));
+		} else {
+			A_local = A;
+			B_local = B;
+		}
+		stop(&timer, 2);
+		if (p.numa_node_cpu != -1) {
+			if (numa_run_on_node(p.numa_node_cpu) == -1) {
+				perror("numa_run_on_node");
+				numa_node_cpu = -1;
+			}
+		}
+		mp_pages[0] = A_local;
+		if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+			perror("move_pages(A_local)");
+		} else if (mp_status[0] < 0) {
+			printf("move_pages error: %d", mp_status[0]);
+		} else {
+			numa_node_local = mp_status[0];
+		}
 #endif
 
-        start(&timer, 0, 0);
-        vector_addition_host(input_size, p.n_threads);
-        stop(&timer, 0);
+		start(&timer, 0, 0);
+		vector_addition_host(input_size, p.n_threads);
+		stop(&timer, 0);
 
 #if NUMA_MEMCPY
-        start(&timer, 3, 0);
-        if (!numa_node_in_is_local) {
-            numa_free(A_local, input_size * sizeof(T));
-            numa_free(B_local, input_size * sizeof(T));
-        }
-        stop(&timer, 3);
+		start(&timer, 3, 0);
+		if (!numa_node_in_is_local) {
+			numa_free(A_local, input_size * sizeof(T));
+			numa_free(B_local, input_size * sizeof(T));
+		}
+		stop(&timer, 3);
 #endif
 
-        unsigned int nr_threads = 0;
+#if WITH_BENCHMARK
+		unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-        nr_threads++;
+		nr_threads++;
 
-        if (rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 #if NUMA_MEMCPY
-            printf("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d"
-                " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
-                " | throughput_MBps=%f",
-                nr_threads, XSTR(T), input_size,
-                numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
-                input_size * 3 * sizeof(T) / timer.time[0]);
-            printf(" throughput_MOpps=%f",
-                input_size / timer.time[0]);
-            printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
-                timer.time[0], timer.time[1], timer.time[2], timer.time[3],
-                timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+			printf
+			    ("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
+			     " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+			     " | throughput_MBps=%f", nr_threads, XSTR(T),
+			     input_size, numa_node_in, numa_node_local,
+			     numa_node_out, numa_node_cpu, numa_node_cpu_memcpy,
+			     numa_distance(numa_node_in, numa_node_cpu),
+			     numa_distance(numa_node_cpu, numa_node_out),
+			     input_size * 3 * sizeof(T) / timer.time[0]);
+			printf(" throughput_MOpps=%f",
+			       input_size / timer.time[0]);
+			printf
+			    (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+			     timer.time[0], timer.time[1], timer.time[2],
+			     timer.time[3],
+			     timer.time[0] + timer.time[1] + timer.time[2] +
+			     timer.time[3]);
 #else
-            printf("[::] VA-CPU | n_threads=%d e_type=%s n_elements=%d"
+			printf
+			    ("[::] VA-CPU | n_threads=%d e_type=%s n_elements=%ld"
 #if NUMA
-                " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+			     " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
 #endif
-                " | throughput_MBps=%f",
-                nr_threads, XSTR(T), input_size,
+			     " | throughput_MBps=%f",
+			     nr_threads, XSTR(T), input_size,
 #if NUMA
-                numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+			     numa_node_in, numa_node_out, numa_node_cpu,
+			     numa_distance(numa_node_in, numa_node_cpu),
+			     numa_distance(numa_node_cpu, numa_node_out),
+#endif
+			     input_size * 3 * sizeof(T) / timer.time[0]);
+			printf(" throughput_MOpps=%f",
+			       input_size / timer.time[0]);
+			printf(" latency_us=%f\n", timer.time[0]);
+#endif				// NUMA_MEMCPY
+		}
+#endif				// WITH_BENCHMARK
+	}
+
+#if NOP_SYNC
+	for (int rep = 0; rep < 200000; rep++) {
+		asm volatile ("nop"::);
+	}
 #endif
-                input_size * 3 * sizeof(T) / timer.time[0]);
-            printf(" throughput_MOpps=%f",
-                input_size / timer.time[0]);
-            printf(" latency_us=%f\n",
-                timer.time[0]);
-#endif // NUMA_MEMCPY
-        }
-    }
 
 #if NUMA
-    numa_free(A, input_size * sizeof(T));
-    numa_free(B, input_size * sizeof(T));
-    numa_free(C, input_size * sizeof(T));
+	numa_free(A, input_size * sizeof(T));
+	numa_free(B, input_size * sizeof(T));
+	numa_free(C, input_size * sizeof(T));
 #else
-    free(A);
-    free(B);
-    free(C);
+	free(A);
+	free(B);
+	free(C);
 #endif
 
-   return 0;
- }
+	return 0;
+}
diff --git a/VA/baselines/cpu/run-perf.sh b/VA/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..8075256
--- /dev/null
+++ b/VA/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+# Force a rebuild with numa=1 and stop if it fails, so perf never profiles a stale or missing ./va.
+make -B numa=1 || exit 1
+# Record counters for -t 1 (t1.perf) and -t 4 (t4.perf); -e is the comma-joined first field of each line of perf-events.txt that does not start with '#'.
+perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./va -a 4 -b 4 -c 4 -t 1 -e 20 -w 0 -i 167772160
+perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./va -a 4 -b 4 -c 4 -t 4 -e 20 -w 0 -i 167772160
diff --git a/VA/baselines/cpu/run-ws.sh b/VA/baselines/cpu/run-ws.sh
new file mode 100755
index 0000000..ccc4993
--- /dev/null
+++ b/VA/baselines/cpu/run-ws.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+# Force a rebuild with the options below and stop if it fails, so valgrind never runs a stale or missing ./va.
+make -B benchmark=0 debug=1 native=0 nop_sync=1 numa=1 || exit 1
+# Run ./va under the valgrind "ws" tool for -t 1 (output t1.ws) and -t 4 (output t4.ws).
+~/var/source/valgrind/vg-in-place --tool=ws --ws-file=t1.ws --ws-peak-detect=yes --ws-every=50000 --ws-track-locality=yes ./va -a 4 -b 4 -c 4 -t 1 -e 20 -w 0 -i 16777216
+~/var/source/valgrind/vg-in-place --tool=ws --ws-file=t4.ws --ws-peak-detect=yes --ws-every=50000 --ws-track-locality=yes ./va -a 4 -b 4 -c 4 -t 4 -e 20 -w 0 -i 16777216
diff --git a/VA/benchmark-scripts/ccmcc25-sim.sh b/VA/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..386cf90
--- /dev/null
+++ b/VA/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# VA benchmark sweep; all output is appended to log/<hostname>/ccmcc25-sim.txt.
+mkdir -p log/$(hostname)
+# Build and run one configuration; arguments are name=value pairs that "local" turns into function-local variables.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -w 0 -e 5 -i ${input_size}
+}
+# Exported so that the shells started by GNU parallel can call the function.
+export -f run_benchmark_nmc
+
+fn=log/$(hostname)/ccmcc25-sim
+# Load the UPMEM SDK environment; the "simulator" argument presumably selects the simulator backend (confirm in upmem_env.sh).
+source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator
+# Header line recording the git revision and the date of this run.
+echo "prim-benchmarks  VA  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+# Run the configurations one at a time (-j1); --joblog with --resume skips jobs already listed in the joblog.
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} \
+	::: nr_dpus 1 2 4 8 16 32 48 64 \
+	::: input_size 327680 655360 1310720 2621440 \
+>> ${fn}.txt
diff --git a/VA/benchmark-scripts/ccmcc25.sh b/VA/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..f6d441d
--- /dev/null
+++ b/VA/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# VA benchmark sweep, repeated for several UPMEM SDK versions; output is appended to log/<hostname>/ccmcc25-sdk<version>.txt.
+mkdir -p log/$(hostname)
+# Build and run one configuration; arguments are name=value pairs that "local" turns into function-local variables.
+run_benchmark_nmc() {
+	local "$@"
+	set -e
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+		aspectc=1 aspectc_timing=1 dfatool_timing=0
+	bin/host_code -w 0 -e 50 -i ${input_size}
+}
+# Exported so that the shells started by GNU parallel can call the function.
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+	fn=log/$(hostname)/ccmcc25-sdk${sdk}
+	# Switch to the environment of the SDK version under test.
+	source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+	# Header line recording the git revision and the date of this run.
+	echo "prim-benchmarks  VA  $(git describe --all --long)  $(git rev-parse HEAD)  $(date -R)" >> ${fn}.txt
+	# Run the configurations one at a time (-j1); --joblog with --resume skips jobs already listed in the joblog.
+	parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+		run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} numa_rank={numa_rank} \
+		::: numa_rank any \
+		::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+		::: input_size 83886080 167772160 335544320 671088640 \
+	>> ${fn}.txt
+
+done
diff --git a/VA/dpu/task.c b/VA/dpu/task.c
index bb41303..91b1176 100644
--- a/VA/dpu/task.c
+++ b/VA/dpu/task.c
@@ -10,15 +10,16 @@
 #include <perfcounter.h>
 #include <barrier.h>
 
-#include "../support/common.h"
+#include "common.h"
 
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 
 // vector_addition: Computes the vector addition of a cached block 
-static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size) {
-    for (unsigned int i = 0; i < l_size; i++){
-        bufferB[i] += bufferA[i];
-    }
+static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size)
+{
+	// Accumulate in place: bufferB[k] becomes bufferB[k] + bufferA[k]
+	for (unsigned int k = 0; k < l_size; ++k)
+		bufferB[k] += bufferA[k];
 }
 
 // Barrier
@@ -26,53 +27,67 @@ BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 extern int main_kernel1(void);
 
-int (*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
 
-int main(void) { 
-    // Kernel
-    return kernels[DPU_INPUT_ARGUMENTS.kernel](); 
+int main(void)
+{
+	// Dispatch to the kernel whose index the host stored in DPU_INPUT_ARGUMENTS.kernel
+	return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
 }
 
 // main_kernel1
-int main_kernel1() {
-    unsigned int tasklet_id = me();
+int main_kernel1(void)
+{
+	unsigned int tasklet_id = me();
 #if PRINT
-    printf("tasklet_id = %u\n", tasklet_id);
+	printf("tasklet_id = %u\n", tasklet_id);
 #endif
-    if (tasklet_id == 0){ // Initialize once the cycle counter
-        mem_reset(); // Reset the heap
-    }
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; // Input size per DPU in bytes
-    uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
-
-    // Address of the current processing block in MRAM
-    uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
-    uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
-    uint32_t mram_base_addr_B = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
-
-    // Initialize a local cache to store the MRAM block
-    T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
-    T *cache_B = (T *) mem_alloc(BLOCK_SIZE);
-
-    for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){
-
-        // Bound checking
-        uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;
-
-        // Load cache with current MRAM block
-        mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);
-        mram_read((__mram_ptr void const*)(mram_base_addr_B + byte_index), cache_B, l_size_bytes);
-
-        // Computer vector addition
-        vector_addition(cache_B, cache_A, l_size_bytes >> DIV);
-
-        // Write cache to current MRAM block
-        mram_write(cache_B, (__mram_ptr void*)(mram_base_addr_B + byte_index), l_size_bytes);
-
-    }
-
-    return 0;
+	if (tasklet_id == 0) {	// Done once, by tasklet 0 only
+		mem_reset();	// Reset the heap
+	}
+	// Barrier: no tasklet goes on to mem_alloc() before the heap reset above is done
+	barrier_wait(&my_barrier);
+
+	uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;	// Input size per DPU in bytes
+	uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size;	// Transfer input size per DPU in bytes
+
+	// Address of the current processing block in MRAM
+	uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
+	uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
+	uint32_t mram_base_addr_B =
+	    (uint32_t) (DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
+
+	// Initialize a local cache to store the MRAM block
+	T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
+	T *cache_B = (T *) mem_alloc(BLOCK_SIZE);
+	// Each tasklet starts at its own offset and advances by BLOCK_SIZE * NR_TASKLETS bytes per iteration
+	for (unsigned int byte_index = base_tasklet;
+	     byte_index < input_size_dpu_bytes;
+	     byte_index += BLOCK_SIZE * NR_TASKLETS) {
+
+		// Bound checking: the last block may be shorter than BLOCK_SIZE
+		uint32_t l_size_bytes =
+		    (byte_index + BLOCK_SIZE >=
+		     input_size_dpu_bytes) ? (input_size_dpu_bytes -
+					      byte_index) : BLOCK_SIZE;
+
+		// Load cache with current MRAM block
+		mram_read((__mram_ptr void const *)(mram_base_addr_A +
+						    byte_index), cache_A,
+			  l_size_bytes);
+		mram_read((__mram_ptr void const *)(mram_base_addr_B +
+						    byte_index), cache_B,
+			  l_size_bytes);
+
+		// Compute vector addition; the sums are accumulated into cache_B
+		vector_addition(cache_B, cache_A, l_size_bytes >> DIV);
+
+		// Write cache to current MRAM block
+		mram_write(cache_B,
+			   (__mram_ptr void *)(mram_base_addr_B + byte_index),
+			   l_size_bytes);
+
+	}
+
+	return 0;
 }
diff --git a/VA/host/app.c b/VA/host/app.c
index 5fe3f61..27a64f2 100644
--- a/VA/host/app.c
+++ b/VA/host/app.c
@@ -7,15 +7,31 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
+
+#if ASPECTC
+extern "C" {
+#endif
+
 #include <dpu.h>
 #include <dpu_log.h>
+#include <dpu_management.h>
+#include <dpu_target_macros.h>
+
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
 #include <unistd.h>
 #include <getopt.h>
 #include <assert.h>
 
-#include "../support/common.h"
-#include "../support/timer.h"
-#include "../support/params.h"
+#include "common.h"
+#include "timer.h"
+#include "params.h"
 
 // Define the DPU Binary path as DPU_BINARY here
 #ifndef DPU_BINARY
@@ -25,304 +41,368 @@
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
-#if ENERGY
-#include <dpu_probe.h>
-#endif
-
-#include <dpu_management.h>
-#include <dpu_target_macros.h>
-
 // Pointer declaration
-static T* A;
-static T* B;
-static T* C;
-static T* C2;
+static T *A;
+static T *B;
+static T *C;
+static T *C2;
 
 // Create input arrays
-static void read_input(T* A, T* B, unsigned int nr_elements) {
-    srand(0);
-    for (unsigned int i = 0; i < nr_elements; i++) {
-        A[i] = (T) (rand());
-        B[i] = (T) (rand());
-    }
+static void read_input(T *A, T *B, unsigned long int nr_elements)
+{
+	srand(0);	// fixed seed: the same input data on every run
+	for (unsigned long int idx = 0; idx < nr_elements; ++idx) {
+		A[idx] = (T) rand();
+		B[idx] = (T) rand();
+	}
 }
 
 // Compute output in the host
-static void vector_addition_host(T* C, T* A, T* B, unsigned int nr_elements) {
-    for (unsigned int i = 0; i < nr_elements; i++) {
-        C[i] = A[i] + B[i];
-    }
+static void vector_addition_host(T *C, T *A, T *B, unsigned long int nr_elements)
+{
+	// Host-side result: C[k] = A[k] + B[k] for every k below nr_elements
+	for (unsigned long int k = 0; k < nr_elements; ++k)
+		C[k] = A[k] + B[k];
 }
 
 // Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
-    struct Params p = input_params(argc, argv);
+	struct Params p = input_params(argc, argv);
 
-    struct dpu_set_t dpu_set, dpu;
-    uint32_t nr_of_dpus;
-    uint32_t nr_of_ranks;
+	struct dpu_set_t dpu_set, dpu;
+	uint32_t nr_of_dpus;
+	uint32_t nr_of_ranks;
 
 #if ENERGY
-    struct dpu_probe_t probe;
-    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+	struct dpu_probe_t probe;
+	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
 #endif
 
-    // Timer declaration
-    Timer timer;
+	// Timer declaration
+	Timer timer;
 
-    int numa_node_rank = -2;
+	int numa_node_rank = -2;
 
-    // Allocate DPUs and load binary
+	// Allocate DPUs and load binary
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    timer.time[0] = 0; // alloc
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+#if DFATOOL_TIMING
+	timer.time[0] = 0;	// alloc
+#endif
 #endif
 #if !WITH_LOAD_OVERHEAD
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-    assert(nr_of_dpus == NR_DPUS);
-    timer.time[1] = 0; // load
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+	assert(nr_of_dpus == NR_DPUS);
+#if DFATOOL_TIMING
+	timer.time[1] = 0;	// load
+#endif
 #endif
 #if !WITH_FREE_OVERHEAD
-    timer.time[6] = 0; // free
+#if DFATOOL_TIMING
+	timer.time[6] = 0;	// free
+#endif
 #endif
 
-    unsigned int i = 0;
-    const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size;
-    const unsigned int input_size_8bytes = 
-        ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
-    const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
-    const unsigned int input_size_dpu_8bytes = 
-        ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
-
-    // Input/output allocation
-    A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
-    B = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
-    C = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
-    C2 = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
-    T *bufferA = A;
-    T *bufferB = B;
-    T *bufferC = C2;
-
-    // Create an input file with arbitrary data
-    read_input(A, B, input_size);
-
-    // Loop over main kernel
-    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+	unsigned int i = 0;
+	const unsigned long int input_size =
+	    p.exp == 0 ? p.input_size * NR_DPUS : p.input_size;
+	const unsigned long int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size;	// Input size per DPU (max.), 8-byte aligned
+	const unsigned long int input_size_dpu = divceil(input_size, NR_DPUS);	// Input size per DPU (max.)
+	const unsigned long int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu;	// Input size per DPU (max.), 8-byte aligned
+
+	// Input/output allocation
+	A = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+	B = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+	C = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+	C2 = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+	T *bufferA = A;
+	T *bufferB = B;
+	T *bufferC = C2;
+
+	// Create an input file with arbitrary data
+	read_input(A, B, input_size);
+
+	// Loop over main kernel
+	for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 
 #if WITH_ALLOC_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 0, 0);
-        }
-        DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 0);
-        }
+		if (rep >= p.n_warmup) {
+			start(&timer, 0, 0);
+		}
+		DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 0);
+		}
 #endif
 #if WITH_DPUINFO
-        printf("DPUs:");
-        DPU_FOREACH (dpu_set, dpu) {
-            int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
-            int slice = dpu_get_slice_id(dpu_from_set(dpu));
-            int member = dpu_get_member_id(dpu_from_set(dpu));
-            printf(" %d(%d.%d)", rank, slice, member);
-        }
-        printf("\n");
+		printf("DPUs:");
+		DPU_FOREACH(dpu_set, dpu) {
+			int rank =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
+			int slice = dpu_get_slice_id(dpu_from_set(dpu));
+			int member = dpu_get_member_id(dpu_from_set(dpu));
+			printf(" %d(%d.%d)", rank, slice, member);
+		}
+		printf("\n");
 #endif
 #if WITH_LOAD_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 1, 0);
-        }
-        DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 1);
-        }
-        DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-        DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-        assert(nr_of_dpus == NR_DPUS);
+		if (rep >= p.n_warmup) {
+			start(&timer, 1, 0);
+		}
+		DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 1);
+		}
+		DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+		DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+		assert(nr_of_dpus == NR_DPUS);
 #endif
 
-        // int prev_rank_id = -1;
-        int rank_id = -1;
-        DPU_FOREACH (dpu_set, dpu) {
-            rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
-            if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
-                numa_node_rank = -1;
-            } else {
-                numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
-            }
-            /*
-            if (rank_id != prev_rank_id) {
-                printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
-                prev_rank_id = rank_id;
-            }
-            */
-        }
-
-
-        // Compute output on CPU (performance comparison and verification purposes)
-        if(rep >= p.n_warmup) {
-            start(&timer, 2, 0);
-        }
-        vector_addition_host(C, A, B, input_size);
-        if(rep >= p.n_warmup) {
-            stop(&timer, 2);
-        }
-
-        if(rep >= p.n_warmup) {
-            start(&timer, 3, 0);
-        }
-        // Input arguments
-        unsigned int kernel = 0;
-        dpu_arguments_t input_arguments[NR_DPUS];
-        for(i=0; i<nr_of_dpus-1; i++) {
-            input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); 
-            input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T); 
-            input_arguments[i].kernel=kernel;
-        }
-        input_arguments[nr_of_dpus-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); 
-        input_arguments[nr_of_dpus-1].transfer_size=input_size_dpu_8bytes * sizeof(T); 
-        input_arguments[nr_of_dpus-1].kernel=kernel;
-
-        // Copy input arrays
-        i = 0;
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
-
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
- 
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu_8bytes * i));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 3);
-        }
-
-        // Run DPU kernel
-        if(rep >= p.n_warmup) {
-            start(&timer, 4, 0);
-            #if ENERGY
-            DPU_ASSERT(dpu_probe_start(&probe));
-            #endif
-        }
-        DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 4);
-            #if ENERGY
-            DPU_ASSERT(dpu_probe_stop(&probe));
-            #endif
-        }
-
+		// int prev_rank_id = -1;
+		int rank_id = -1;
+		DPU_FOREACH(dpu_set, dpu) {
+			rank_id =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
+			if ((numa_node_rank != -2)
+			    && numa_node_rank !=
+			    dpu_get_rank_numa_node(dpu_get_rank
+						   (dpu_from_set(dpu)))) {
+				numa_node_rank = -1;
+			} else {
+				numa_node_rank =
+				    dpu_get_rank_numa_node(dpu_get_rank
+							   (dpu_from_set(dpu)));
+			}
+			/*
+			   if (rank_id != prev_rank_id) {
+			   printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+			   prev_rank_id = rank_id;
+			   }
+			 */
+		}
+
+		// Compute output on CPU (performance comparison and verification purposes)
+		if (rep >= p.n_warmup) {
+			start(&timer, 2, 0);
+		}
+		vector_addition_host(C, A, B, input_size);
+		if (rep >= p.n_warmup) {
+			stop(&timer, 2);
+		}
+
+		if (rep >= p.n_warmup) {
+			start(&timer, 3, 0);
+		}
+		// Input arguments
+		unsigned int kernel = 0;
+		dpu_arguments_t input_arguments[NR_DPUS];
+		for (i = 0; i < nr_of_dpus - 1; i++) {
+			input_arguments[i].size =
+			    input_size_dpu_8bytes * sizeof(T);
+			input_arguments[i].transfer_size =
+			    input_size_dpu_8bytes * sizeof(T);
+			input_arguments[i].kernel = (enum kernels)kernel;
+		}
+		input_arguments[nr_of_dpus - 1].size =
+		    (input_size_8bytes -
+		     input_size_dpu_8bytes * (NR_DPUS - 1)) * sizeof(T);
+		input_arguments[nr_of_dpus - 1].transfer_size =
+		    input_size_dpu_8bytes * sizeof(T);
+		input_arguments[nr_of_dpus - 1].kernel = (enum kernels)kernel;
+
+		// Copy input arrays
+		i = 0;
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+			    sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
+
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferA + input_size_dpu_8bytes * i));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, 0,
+			    input_size_dpu_8bytes * sizeof(T),
+			    DPU_XFER_DEFAULT));
+
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferB + input_size_dpu_8bytes * i));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    input_size_dpu_8bytes * sizeof(T),
+			    input_size_dpu_8bytes * sizeof(T),
+			    DPU_XFER_DEFAULT));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 3);
+		}
+		// Run DPU kernel
+		if (rep >= p.n_warmup) {
+			start(&timer, 4, 0);
+#if ENERGY
+			DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+		}
+		DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 4);
+#if ENERGY
+			DPU_ASSERT(dpu_probe_stop(&probe));
+#endif
+		}
 #if PRINT
-        {
-            unsigned int each_dpu = 0;
-            printf("Display DPU Logs\n");
-            DPU_FOREACH (dpu_set, dpu) {
-                printf("DPU#%d:\n", each_dpu);
-                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
-                each_dpu++;
-            }
-        }
+		{
+			unsigned int each_dpu = 0;
+			printf("Display DPU Logs\n");
+			DPU_FOREACH(dpu_set, dpu) {
+				printf("DPU#%d:\n", each_dpu);
+				DPU_ASSERT(dpulog_read_for_dpu
+					   (dpu.dpu, stdout));
+				each_dpu++;
+			}
+		}
 #endif
 
-        if(rep >= p.n_warmup) {
-            start(&timer, 5, 0);
-        }
-        i = 0;
-        // PARALLEL RETRIEVE TRANSFER
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferC + input_size_dpu_8bytes * i));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 5);
-        }
-
+		if (rep >= p.n_warmup) {
+			start(&timer, 5, 0);
+		}
+		i = 0;
+		// PARALLEL RETRIEVE TRANSFER
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferC + input_size_dpu_8bytes * i));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_FROM_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    input_size_dpu_8bytes * sizeof(T),
+			    input_size_dpu_8bytes * sizeof(T),
+			    DPU_XFER_DEFAULT));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 5);
+		}
 #if WITH_ALLOC_OVERHEAD
 #if WITH_FREE_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 6, 0);
-        }
+		if (rep >= p.n_warmup) {
+			start(&timer, 6, 0);
+		}
 #endif
-        DPU_ASSERT(dpu_free(dpu_set));
+		DPU_ASSERT(dpu_free(dpu_set));
 #if WITH_FREE_OVERHEAD
-        if(rep >= p.n_warmup) {
-            stop(&timer, 6);
-        }
+		if (rep >= p.n_warmup) {
+			stop(&timer, 6);
+		}
 #endif
 #endif
 
-        // Check output
-        bool status = true;
-        for (i = 0; i < input_size; i++) {
-            if(C[i] != bufferC[i]){ 
-                status = false;
+		// Check output
+		bool status = true;
+		for (i = 0; i < input_size; i++) {
+			if (C[i] != bufferC[i]) {
+				status = false;
 #if PRINT
-                printf("%d: %u -- %u\n", i, C[i], bufferC[i]);
+				printf("%d: %u -- %u\n", i, C[i], bufferC[i]);
 #endif
-            }
-        }
-        if (status) {
-            printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
-            if (rep >= p.n_warmup) {
-                printf("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d",
-                    nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size, input_size / NR_DPUS);
-                printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
-                    WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
-                printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
-                    timer.time[0],
-                    timer.time[1],
-                    timer.time[2],
-                    timer.time[3],
-                    timer.time[4],
-                    timer.time[5],
-                    timer.time[6]);
-                printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-                    input_size * 3 * sizeof(T) / timer.time[2],
-                    input_size * 3 * sizeof(T) / (timer.time[4]),
-                    input_size * 3 * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-                printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-                    input_size * 3 * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size * 3 * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size * 3 * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-                printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-                    input_size / timer.time[2],
-                    input_size / (timer.time[4]),
-                    input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-                printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
-                    input_size / (timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-            }
-        } else {
-            printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
-        }
-    }
+			}
+		}
+		if (status) {
+			printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+			       "] Outputs are equal\n");
+			if (rep >= p.n_warmup) {
+				dfatool_printf
+				    ("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu n_elements_per_dpu=%lu",
+				     nr_of_dpus, nr_of_ranks, NR_TASKLETS,
+				     XSTR(T), BLOCK_SIZE, input_size,
+				     input_size / NR_DPUS);
+				dfatool_printf
+				    (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+				     WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+				     WITH_FREE_OVERHEAD, numa_node_rank);
+				dfatool_printf
+				    ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+				     timer.time[0], timer.time[1],
+				     timer.time[2], timer.time[3],
+				     timer.time[4], timer.time[5],
+				     timer.time[6]);
+				dfatool_printf
+				    (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+				     input_size * 3 * sizeof(T) / timer.time[2],
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[4]),
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[0] + timer.time[1] +
+				      timer.time[3] + timer.time[4] +
+				      timer.time[5] + timer.time[6]));
+				dfatool_printf
+				    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[3] + timer.time[4] +
+				      timer.time[5]),
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[1] + timer.time[3] +
+				      timer.time[4] + timer.time[5]),
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[0] + timer.time[1] +
+				      timer.time[3] + timer.time[4] +
+				      timer.time[5]));
+				dfatool_printf
+				    (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+				     input_size / timer.time[2],
+				     input_size / (timer.time[4]),
+				     input_size / (timer.time[0] +
+						   timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5] +
+						   timer.time[6]));
+				dfatool_printf
+				    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+				     input_size / (timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]),
+				     input_size / (timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]),
+				     input_size / (timer.time[0] +
+						   timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]));
+			}
+		} else {
+			printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+			       "] Outputs differ!\n");
+		}
+	}
 
 #if ENERGY
-    double energy;
-    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
-    printf("DPU Energy (J): %f\t", energy);
-#endif	
-
+	double energy;
+	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+	printf("DPU Energy (J): %f\t", energy);
+#endif
 
-    // Deallocation
-    free(A);
-    free(B);
-    free(C);
-    free(C2);
+	// Deallocation
+	free(A);
+	free(B);
+	free(C);
+	free(C2);
 
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_free(dpu_set));
+	DPU_ASSERT(dpu_free(dpu_set));
 #endif
-	
-    return 0;
+
+	return 0;
 }
diff --git a/VA/support/common.h b/VA/include/common.h
index c1043fd..6ce6e23 100755..100644
--- a/VA/support/common.h
+++ b/VA/include/common.h
@@ -1,14 +1,20 @@
 #ifndef _COMMON_H_
 #define _COMMON_H_
 
+enum kernels {
+	kernel1 = 0,
+	nr_kernels = 1,
+};
+
 // Structures used by both the host and the dpu to communicate information
 typedef struct {
-    uint32_t size;
-    uint32_t transfer_size;
-	enum kernels {
-	    kernel1 = 0,
-	    nr_kernels = 1,
-	} kernel;
+	/*
+	 * Size per DPU cannot exceed 32 bits, as each DPU only has 64 MiB of memory
+	 * (i.e., only needs 26 bits for addressing).
+	 */
+	uint32_t size;
+	uint32_t transfer_size;
+	enum kernels kernel;
 } dpu_arguments_t;
 
 // Transfer size between MRAM and WRAM
@@ -24,34 +30,34 @@ typedef struct {
 // Data type
 #ifdef UINT32
 #define T uint32_t
-#define DIV 2 // Shift right to divide by sizeof(T)
+#define DIV 2			// Shift right to divide by sizeof(T)
 #elif UINT64
 #define T uint64_t
-#define DIV 3 // Shift right to divide by sizeof(T)
+#define DIV 3			// Shift right to divide by sizeof(T)
 #elif INT32
 #define T int32_t
-#define DIV 2 // Shift right to divide by sizeof(T)
+#define DIV 2			// Shift right to divide by sizeof(T)
 #elif INT64
 #define T int64_t
-#define DIV 3 // Shift right to divide by sizeof(T)
+#define DIV 3			// Shift right to divide by sizeof(T)
 #elif FLOAT
 #define T float
-#define DIV 2 // Shift right to divide by sizeof(T)
+#define DIV 2			// Shift right to divide by sizeof(T)
 #elif DOUBLE
 #define T double
-#define DIV 3 // Shift right to divide by sizeof(T)
+#define DIV 3			// Shift right to divide by sizeof(T)
 #elif CHAR
 #define T char
-#define DIV 0 // Shift right to divide by sizeof(T)
+#define DIV 0			// Shift right to divide by sizeof(T)
 #elif SHORT
 #define T short
-#define DIV 1 // Shift right to divide by sizeof(T)
+#define DIV 1			// Shift right to divide by sizeof(T)
 #endif
 
 #ifndef ENERGY
 #define ENERGY 0
 #endif
-#define PRINT 0 
+#define PRINT 0
 
 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
diff --git a/VA/include/dfatool_host.ah b/VA/include/dfatool_host.ah
new file mode 100644
index 0000000..e74f466
--- /dev/null
+++ b/VA/include/dfatool_host.ah
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+/* VA-specific aspect: adds problem-size log lines on top of the generic DPU call timing of the base aspect. */
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+	unsigned long n_rows;	/* input_size captured after input_params() returns */
+	unsigned int element_size;	/* sizeof(T); set in the constructor, not read within this file */
+
+	virtual int getKernel() { return 1; }	/* printed as "e_kernel=kernel1" by the base aspect's dpu_launch advice */
+
+	DfatoolHostTiming() {
+		element_size = sizeof(T);
+	}
+	/* After each call to input_params(): remember the parsed input size and print the "[>>]" line. */
+	advice call("% input_params(...)") : after() {
+		Params* p = tjp->result();	/* pointer to the value returned by the call */
+		n_rows = p->input_size;
+		printf("[>>] VA | n_dpus=%u n_rows=%lu\n", NR_DPUS, n_rows);
+	}
+	/* After each call to vector_addition_host(): print the "[--]" line using n_dpus from the base aspect. */
+	advice call("% vector_addition_host(...)") : after() {
+		printf("[--] VA | n_dpus=%u n_rows=%lu\n", n_dpus, n_rows);
+	}
+	/* After main() has executed: print the closing "[<<]" line. */
+	advice execution("% main(...)") : after() {
+		printf("[<<] VA | n_dpus=%u n_rows=%lu\n", NR_DPUS, n_rows);
+	}
+};
diff --git a/VA/include/params.h b/VA/include/params.h
new file mode 100644
index 0000000..31327d8
--- /dev/null
+++ b/VA/include/params.h
@@ -0,0 +1,76 @@
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+
+/* Command-line parameters of the host program. */
+typedef struct Params {
+	unsigned long int input_size;	/* -i: number of input elements */
+	int n_warmup;			/* -w: untimed warmup iterations */
+	int n_reps;			/* -e: timed repetition iterations */
+	int exp;			/* -x: 0 = weak scaling, 1 = strong scaling */
+} Params;
+
+/* Print the command-line help text to stderr. */
+static void usage(void)
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n    -x <X>    Weak (0) or strong (1) scaling (default=1)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -i <I>    input size (default=2621440 elements)" "\n");
+}
+
+/*
+ * Parse the command line into a Params structure.
+ *
+ * Options that are not given keep their defaults (input_size=2621440,
+ * n_warmup=1, n_reps=3, exp=1). "-h" prints the usage text and exits with
+ * status 0; an unrecognized option prints the usage text and exits with a
+ * non-zero status so that calling scripts can detect the error.
+ */
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.input_size = 2621440;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+	p.exp = 1;
+
+	int opt;
+	while ((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'i':
+			p.input_size = atol(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 'x':
+			p.exp = atoi(optarg);
+			break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(1);
+		}
+	}
+	/* NR_DPUS is not a command-line option here; it is expected to be defined at build time. */
+	assert(NR_DPUS > 0 && "Invalid # of dpus!");
+
+	return p;
+}
+#endif
diff --git a/VA/include/timer.h b/VA/include/timer.h
new file mode 100644
index 0000000..7b80823
--- /dev/null
+++ b/VA/include/timer.h
@@ -0,0 +1,5 @@
+#pragma once
+/* VA uses seven timer slots; the host result line prints timer.time[0] .. timer.time[6]. */
+#define N_TIMERS 7
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/VA/support/params.h b/VA/support/params.h
deleted file mode 100644
index 8bd71a6..0000000
--- a/VA/support/params.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-
-typedef struct Params {
-    unsigned int   input_size;
-    int   n_warmup;
-    int   n_reps;
-    int   exp;
-}Params;
-
-static void usage() {
-    fprintf(stderr,
-        "\nUsage:  ./program [options]"
-        "\n"
-        "\nGeneral options:"
-        "\n    -h        help"
-        "\n    -w <W>    # of untimed warmup iterations (default=1)"
-        "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
-        "\n"
-        "\nBenchmark-specific options:"
-        "\n    -i <I>    input size (default=2621440 elements)"
-        "\n");
-}
-
-struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.input_size    = 2621440;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-    p.exp           = 0;
-
-    int opt;
-    while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
-        switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'i': p.input_size    = atoi(optarg); break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break;
-        case 'x': p.exp           = atoi(optarg); break;
-        default:
-            fprintf(stderr, "\nUnrecognized option!\n");
-            usage();
-            exit(0);
-        }
-    }
-    assert(NR_DPUS > 0 && "Invalid # of dpus!");
-
-    return p;
-}
-#endif
diff --git a/VA/support/timer.h b/VA/support/timer.h
deleted file mode 100755
index 4d597b9..0000000
--- a/VA/support/timer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[7];
-    struct timeval stopTime[7];
-    double         time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
diff --git a/include/dfatool_host_dpu.ah b/include/dfatool_host_dpu.ah
new file mode 100644
index 0000000..c676f50
--- /dev/null
+++ b/include/dfatool_host_dpu.ah
@@ -0,0 +1,170 @@
+#pragma once
+
+#include <sys/time.h>
+
+/*
+ * Abstract AspectC++ aspect that times calls to UPMEM host API functions and
+ * prints one "[::] <function> @ <file>:<line> | <parameters> | <results>"
+ * line per call. Derived aspects must implement getKernel().
+ */
+aspect DfatoolHostDPUTiming {
+	/* Timestamps taken right before and right after the wrapped call. */
+	struct timeval starttime;
+	struct timeval stoptime;
+	/* Most recently observed rank and DPU counts (0 until first set). */
+	uint32_t n_ranks = 0;
+	uint32_t n_dpus = 0;
+
+	double const M_to_Mi = 1.048576; /* 2^20 / 1e6 */
+
+	/* Kernel number printed as "e_kernel=kernel<N>" by the dpu_launch advice. */
+	virtual int getKernel() = 0;
+
+	/* Microseconds elapsed between starttime and stoptime. */
+	double elapsed_us() {
+		return (stoptime.tv_sec - starttime.tv_sec) * 1000000.0 + (stoptime.tv_usec - starttime.tv_usec);
+	}
+
+	/* Remember the DPU count written through the call's second argument. */
+	advice call("% dpu_get_nr_dpus(...)") : after() {
+		n_dpus = **(tjp->arg<1>());
+	}
+
+	/* Remember the rank count written through the call's second argument. */
+	advice call("% dpu_get_nr_ranks(...)") : after() {
+		n_ranks = **(tjp->arg<1>());
+	}
+
+	/* Time dpu_alloc; n_dpus is taken from the call's first argument. */
+	advice call("% dpu_alloc(...)") : around() {
+		gettimeofday(&starttime, NULL);
+		tjp->proceed();
+		gettimeofday(&stoptime, NULL);
+		n_dpus = *(tjp->arg<0>());
+		printf("[::] dpu_alloc @ %s:%d | n_dpus=%u | latency_us=%f\n",
+			tjp->filename(),
+			tjp->line(),
+			n_dpus,
+			elapsed_us()
+		);
+	}
+
+	/* Time dpu_alloc_ranks; n_ranks is taken from the call's first argument. */
+	advice call("% dpu_alloc_ranks(...)") : around() {
+		gettimeofday(&starttime, NULL);
+		tjp->proceed();
+		gettimeofday(&stoptime, NULL);
+		n_ranks = *(tjp->arg<0>());
+		printf("[::] dpu_alloc_ranks @ %s:%d | n_ranks=%u | latency_us=%f\n",
+			tjp->filename(),
+			tjp->line(),
+			n_ranks,
+			elapsed_us()
+		);
+	}
+
+	/* Time dpu_load and report it with the current DPU and rank counts. */
+	advice call("% dpu_load(...)") : around() {
+		gettimeofday(&starttime, NULL);
+		tjp->proceed();
+		gettimeofday(&stoptime, NULL);
+		printf("[::] dpu_load @ %s:%d | n_dpus=%u n_ranks=%u | latency_us=%f\n",
+			tjp->filename(),
+			tjp->line(),
+			n_dpus, n_ranks,
+			elapsed_us()
+		);
+	}
+
+	/* Time dpu_free and report it with the current DPU and rank counts. */
+	advice call("% dpu_free(...)") : around() {
+		gettimeofday(&starttime, NULL);
+		tjp->proceed();
+		gettimeofday(&stoptime, NULL);
+		printf("[::] dpu_free @ %s:%d | n_dpus=%u n_ranks=%u | latency_us=%f\n",
+			tjp->filename(),
+			tjp->line(),
+			n_dpus, n_ranks,
+			elapsed_us()
+		);
+	}
+
+	/* Time dpu_launch; the kernel number comes from the derived aspect. */
+	advice call("% dpu_launch(...)") : around() {
+		gettimeofday(&starttime, NULL);
+		tjp->proceed();
+		gettimeofday(&stoptime, NULL);
+		printf("[::] dpu_launch @ %s:%d | n_dpus=%u n_ranks=%u e_kernel=kernel%d | latency_us=%f\n",
+			tjp->filename(),
+			tjp->line(),
+			n_dpus, n_ranks,
+			getKernel(),
+			elapsed_us()
+		);
+	}
+
+	/* Time dpu_copy_to; the payload size is read from the call's fifth argument. */
+	advice call("% dpu_copy_to(...)") : around() {
+		size_t payload_size = *(tjp->arg<4>());
+		gettimeofday(&starttime, NULL);
+		tjp->proceed();
+		gettimeofday(&stoptime, NULL);
+		double time_us = elapsed_us();
+		printf("[::] dpu_copy_to @ %s:%d | n_dpus=%u n_ranks=%u payload_B=%zu | latency_us=%f throughput_MiBps=%f\n",
+			tjp->filename(),
+			tjp->line(),
+			n_dpus, n_ranks,
+			payload_size,
+			time_us,
+			payload_size / (time_us * M_to_Mi)
+		);
+	}
+
+	/* Time dpu_copy_from; the payload size is read from the call's fifth argument. */
+	advice call("% dpu_copy_from(...)") : around() {
+		size_t payload_size = *(tjp->arg<4>());
+		gettimeofday(&starttime, NULL);
+		tjp->proceed();
+		gettimeofday(&stoptime, NULL);
+		double time_us = elapsed_us();
+		printf("[::] dpu_copy_from @ %s:%d | n_dpus=%u n_ranks=%u payload_B=%zu | latency_us=%f throughput_MiBps=%f\n",
+			tjp->filename(),
+			tjp->line(),
+			n_dpus, n_ranks,
+			payload_size,
+			time_us,
+			payload_size / (time_us * M_to_Mi)
+		);
+	}
+
+	/*
+	 * Time dpu_push_xfer. The fifth argument is reported as the per-DPU payload
+	 * and multiplied by n_dpus for the total; nothing is printed for a
+	 * direction other than DPU_XFER_TO_DPU / DPU_XFER_FROM_DPU.
+	 */
+	advice call("% dpu_push_xfer(...)") : around() {
+		size_t payload_size = *(tjp->arg<4>());
+		gettimeofday(&starttime, NULL);
+		tjp->proceed();
+		gettimeofday(&stoptime, NULL);
+		double time_us = elapsed_us();
+		if (*(tjp->arg<1>()) == DPU_XFER_TO_DPU) {
+			printf("[::] dpu_push_to_dpu @ %s:%d | n_dpus=%u n_ranks=%u total_payload_B=%zu dpu_payload_B=%zu | latency_us=%f throughput_MiBps=%f\n",
+				tjp->filename(),
+				tjp->line(),
+				n_dpus, n_ranks,
+				payload_size * n_dpus, payload_size,
+				time_us,
+				payload_size * n_dpus / (time_us * M_to_Mi)
+			);
+		} else if (*(tjp->arg<1>()) == DPU_XFER_FROM_DPU) {
+			printf("[::] dpu_push_from_dpu @ %s:%d | n_dpus=%u n_ranks=%u total_payload_B=%zu dpu_payload_B=%zu | latency_us=%f throughput_MiBps=%f\n",
+				tjp->filename(),
+				tjp->line(),
+				n_dpus, n_ranks,
+				payload_size * n_dpus, payload_size,
+				time_us,
+				payload_size * n_dpus / (time_us * M_to_Mi)
+			);
+		}
+	}
+};
diff --git a/include/timer_base.h b/include/timer_base.h
new file mode 100644
index 0000000..160136c
--- /dev/null
+++ b/include/timer_base.h
@@ -0,0 +1,89 @@
+#pragma once
+
+/*
+ * Wall-clock timer helpers shared by the benchmark host programs.
+ *
+ * The including header must define N_TIMERS (number of timer slots) before
+ * including this file. DFATOOL_TIMING selects between the gettimeofday()
+ * based implementation and no-op stubs.
+ */
+
+#include <sys/time.h>
+
+#if DFATOOL_TIMING
+
+#ifndef N_TIMERS
+#error "N_TIMERS must be defined before including timer_base.h"
+#endif
+
+/* Per-slot start/stop timestamps and the accumulated time in microseconds. */
+typedef struct Timer {
+
+	struct timeval startTime[N_TIMERS];
+	struct timeval stopTime[N_TIMERS];
+	double time[N_TIMERS];
+
+} Timer;
+
+/*
+ * printf wrapper for benchmark result lines. The format string is part of
+ * __VA_ARGS__ so that a call with only a format string expands correctly.
+ */
+#define dfatool_printf(...) do { printf(__VA_ARGS__); } while (0)
+
+/* Start timer slot i; its accumulated time is reset when rep == 0. */
+void start(Timer *timer, int i, int rep)
+{
+	if (rep == 0) {
+		timer->time[i] = 0.0;
+	}
+	gettimeofday(&timer->startTime[i], NULL);
+}
+
+/* Stop timer slot i and add the elapsed microseconds to its total. */
+void stop(Timer *timer, int i)
+{
+	gettimeofday(&timer->stopTime[i], NULL);
+	timer->time[i] +=
+	    (timer->stopTime[i].tv_sec -
+	     timer->startTime[i].tv_sec) * 1000000.0 +
+	    (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+}
+
+/* Reset the accumulated time of timer slot i. */
+void zero(Timer *timer, int i)
+{
+	timer->time[i] = 0;
+}
+
+#else
+
+/* Timing disabled: the call expands to nothing, so its arguments are never evaluated. */
+#define dfatool_printf(...) do {} while (0)
+
+/* Placeholder type so that Timer declarations in callers still compile. */
+typedef int Timer;
+
+/* No-op replacement for start(). */
+void start(Timer *timer, int i, int rep)
+{
+	(void)timer;
+	(void)i;
+	(void)rep;
+}
+
+/* No-op replacement for stop(). */
+void stop(Timer *timer, int i)
+{
+	(void)timer;
+	(void)i;
+}
+
+/* No-op replacement for zero(). */
+void zero(Timer *timer, int i)
+{
+	(void)timer;
+	(void)i;
+}
+
+#endif
diff --git a/perf-events.txt b/perf-events.txt
new file mode 100644
index 0000000..ab57ab2
--- /dev/null
+++ b/perf-events.txt
@@ -0,0 +1,44 @@
+cache-misses # NMPO
+cache-references
+
+cpu-cycles # NMPO
+instructions # NMPO
+
+page-faults
+
+mem-loads
+mem-loads-aux
+mem-stores
+
+branch-misses # NMPO
+branch-instructions # NMPO
+branch-load-misses # NMPO
+branch-loads # NMPO
+
+l1d_pend_miss.pending # mccalpin2023hpc <https://link.springer.com/chapter/10.1007/978-3-031-40843-4_30>
+l1d_pend_miss.pending_cycles
+
+offcore_requests.all_requests
+offcore_requests.data_rd
+offcore_requests.demand_data_rd
+
+offcore_requests_outstanding.data_rd # mccalpin2023hpc
+offcore_requests_outstanding.cycles_with_data_rd
+offcore_requests_outstanding.cycles_with_demand_data_rd
+offcore_requests_outstanding.demand_data_rd # mccalpin2023hpc
+
+L1-dcache-loads # NMPO
+L1-dcache-load-misses # NMPO
+L1-dcache-stores # NMPO
+L1-icache-load-misses # NMPO
+
+LLC-loads # NMPO
+LLC-load-misses
+LLC-stores # NMPO
+LLC-store-misses # NMPO
+
+l2_lines_out.useless_hwpf
+l2_lines_out.non_silent
+l2_lines_out.silent
+l2_request.all
+l2_request.miss
diff --git a/run_strong_full.py b/run_strong_full.py
deleted file mode 100644
index c65fecd..0000000
--- a/run_strong_full.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import os 
-import sys
-import getpass
-
-rootdir = "/" # Include path to repo
-
-applications = {"VA"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 167772160 -x 1"], 
-                "GEMV"     : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/gemv_host -m 163840 -n 4096"],
-                "SpMV"     : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f data/bcsstk30.mtx.64.mtx"],
-                "SEL"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 251658240 -x 1"],
-                "UNI"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 251658240 -x 1"],
-                "BS"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/bs_host -i 16777216"],
-                "TS"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/ts_host -n 33554432"],
-                "BFS"      : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f data/loc-gowalla_edges.txt"],
-                "MLP"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/mlp_host -m 163840 -n 4096"],
-                "NW"       : ["NR_DPUS=X NR_TASKLETS=Y BL=32 BL_IN=2 make all", "./bin/nw_host -w 0 -e 1 -n 65536"],
-                "HST-S"    : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 2"],
-                "HST-L"    : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 2"],
-                "RED"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z VERSION=SINGLE make all", "./bin/host_code -w 0 -e 1 -i 419430400 -x 1"],
-                "SCAN-SSA" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 251658240 -x 1"],
-                "SCAN-RSS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 251658240 -x 1"],
-                "TRNS"     : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -w 0 -e 1 -p 2048 -o 12288 -x 1"],}
-
-def run(app_name):
-    
-    NR_TASKLETS = [1, 2, 4, 8, 16]
-    NR_DPUS = [256, 512, 1024, 2048]
-    BL = [10] 
-
-    if app_name in applications:
-        print ("------------------------ Running: "+app_name+"----------------------")
-        print ("--------------------------------------------------------------------")
-        if(len(applications[app_name]) > 1):
-            make = applications[app_name][0]
-            run_cmd = applications[app_name][1]
-        
-            os.chdir(rootdir + "/"+app_name)
-            os.getcwd()
-        
-            os.system("make clean")
-
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/bin")
-            except OSError:
-                print ("Creation of the direction /bin failed")
-                
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/log")
-            except OSError:
-                print ("Creation of the direction /log failed")
-            
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/log/host")
-            except OSError: 
-                print ("Creation of the direction /log/host failed")
-
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/profile")
-            except OSError:
-                print ("Creation of the direction /profile failed")
-        
-
-            for r in NR_DPUS:
-                for t in NR_TASKLETS:
-                    for b in BL:
-                        m = make.replace("X", str(r))
-                        m = m.replace("Y", str(t))
-                        m = m.replace("Z", str(b))
-                        print ("Running = " + m) 
-                        try:
-                            os.system(m)
-                        except: 
-                            pass 
-
-                        r_cmd = run_cmd.replace("#ranks", str(r))
-                        r_cmd = r_cmd +  " >> profile/outss_tl"+str(t)+"_bl"+str(b)+"_dpus"+str(r) 
-                        
-                        print ("Running = " + app_name + " -> "+ r_cmd)
-                        try:
-                            os.system(r_cmd) 
-                        except:  
-                            pass 
-        else:
-            make = applications[app_name] 
-
-            os.chdir(rootdir + "/"+app_name)
-            os.getcwd()
-        
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/bin")
-                os.mkdir(rootdir + "/"+ app_name +"/log")
-                os.mkdir(rootdir + "/"+ app_name +"/log/host")
-                os.mkdir(rootdir + "/"+ app_name +"/profile")
-            except OSError:
-                print ("Creation of the direction failed")
-
-            print (make)    
-            os.system(make + ">& profile/out")
-
-    else:
-        print ( "Application "+app_name+" not available" )
-
-def main():
-    if(len(sys.argv) < 2):
-        print ("Usage: python run.py application")
-        print ("Applications available: ")
-        for key, value in applications.items():
-            print (key )
-        print ("All")
-
-    else:
-        cmd = sys.argv[1]
-        print ("Application to run is: " + cmd )
-        if cmd == "All":
-            for key, value in applications.items():
-                run(key)
-                os.chdir(rootdir)
-        else:
-            run(cmd)
-
-if __name__ == "__main__":
-    main()
diff --git a/run_strong_rank.py b/run_strong_rank.py
deleted file mode 100644
index 68f401e..0000000
--- a/run_strong_rank.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import os 
-import sys
-import getpass
-
-rootdir = "/" # Include path to repo
-print("Root dir: " + rootdir)
-
-applications = {"VA"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 2621440 -x 1"], 
-                "GEMV"     : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/gemv_host -m 8192 -n 1024"],
-                "SpMV"     : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0"],
-                "SEL"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 3932160 -x 1"],
-                "UNI"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 3932160 -x 1"],
-                "BS"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/bs_host -i 262144"],
-                "TS"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/ts_host -n 524288"],
-                "BFS"      : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f data/loc-gowalla_edges.txt"],
-                "MLP"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/mlp_host -m 8192 -n 1024"],
-                "NW"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z BL_IN=2 make all", "./bin/nw_host -w 0 -e 1 -n 2560"],
-                "HST-S"    : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 1"],
-                "HST-L"    : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 1"],
-                "RED"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z VERSION=SINGLE make all", "./bin/host_code -w 0 -e 1 -i 6553600 -x 1"],
-                "SCAN-SSA" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 3932160 -x 1"],
-                "SCAN-RSS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 3932160 -x 1"],
-                "TRNS"     : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -w 0 -e 1 -p 64 -o 12288 -x 1"],}
-
-def run(app_name):
-    
-    NR_TASKLETS = [1, 2, 4, 8, 16]
-    NR_DPUS = [1, 4, 16, 64]
-    BL = [10] 
-
-    if app_name in applications:
-        print ("------------------------ Running: "+app_name+"----------------------")
-        print ("--------------------------------------------------------------------")
-        if(len(applications[app_name]) > 1):
-            make = applications[app_name][0]
-            run_cmd = applications[app_name][1]
-        
-            os.chdir(rootdir + "/"+app_name)
-            os.getcwd()
-        
-            os.system("make clean")
-
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/bin")
-            except OSError:
-                print ("Creation of the direction /bin failed")
-                
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/log")
-            except OSError:
-                print ("Creation of the direction /log failed")
-            
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/log/host")
-            except OSError: 
-                print ("Creation of the direction /log/host failed")
-
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/profile")
-            except OSError:
-                print ("Creation of the direction /profile failed")
-        
-
-            for r in NR_DPUS:
-                for t in NR_TASKLETS:
-                    for b in BL:
-                        m = make.replace("X", str(r))
-                        m = m.replace("Y", str(t))
-                        if (app_name == "NW"):
-                            if (r == 1):
-                                m = m.replace("Z", str(2560))
-                            elif (r == 4):
-                                m = m.replace("Z", str(640))
-                            elif (r == 16):
-                                m = m.replace("Z", str(160))
-                            elif (r == 64):
-                                m = m.replace("Z", str(40))
-                        else: 
-                            m = m.replace("Z", str(b))
-                        print ("Running = " + m) 
-                        try:
-                            os.system(m)
-                        except: 
-                            pass 
-
-                        r_cmd = run_cmd.replace("#ranks", str(r))
-                        r_cmd = r_cmd +  " >> profile/outs_tl"+str(t)+"_bl"+str(b)+"_dpus"+str(r) 
-                        
-                        print ("Running = " + app_name + " -> "+ r_cmd)
-                        try:
-                            os.system(r_cmd) 
-                        except:  
-                            pass 
-        else:
-            make = applications[app_name] 
-
-            os.chdir(rootdir + "/"+app_name)
-            os.getcwd()
-        
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/bin")
-                os.mkdir(rootdir + "/"+ app_name +"/log")
-                os.mkdir(rootdir + "/"+ app_name +"/log/host")
-                os.mkdir(rootdir + "/"+ app_name +"/profile")
-            except OSError:
-                print ("Creation of the direction failed")
-
-            print (make)    
-            os.system(make + ">& profile/out")
-
-    else:
-        print ( "Application "+app_name+" not available" )
-
-def main():
-    if(len(sys.argv) < 2):
-        print ("Usage: python run.py application")
-        print ("Applications available: ")
-        for key, value in applications.items():
-            print (key )
-        print ("All")
-
-    else:
-        cmd = sys.argv[1]
-        print ("Application to run is: " + cmd )
-        if cmd == "All":
-            for key, value in applications.items():
-                run(key)
-                os.chdir(rootdir)
-        else:
-            run(cmd)
-
-if __name__ == "__main__":
-    main()
diff --git a/run_weak.py b/run_weak.py
deleted file mode 100644
index a613675..0000000
--- a/run_weak.py
+++ /dev/null
@@ -1,173 +0,0 @@
-import os 
-import sys
-import getpass
-
-rootdir = "/" # Include path to repo
-
-applications = {"VA"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"],
-                "GEMV"     : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/gemv_host -m #elements -n 2048"],
-                "SpMV"     : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f file_name"],
-                "SEL"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"],
-                "UNI"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"],
-                "BS"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/bs_host -i #elements"],
-                "TS"       : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/ts_host -n #elements"],
-                "BFS"      : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f file_name"],
-                "MLP"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/mlp_host -m #elements -n 1024"],
-                "NW"       : ["NR_DPUS=X NR_TASKLETS=Y BL=512 BL_IN=8 make all", "./bin/nw_host -w 0 -e 1 -n #elements"],
-                "HST-S"    : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 0"],
-                "HST-L"    : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 0"],
-                "RED"      : ["NR_DPUS=X NR_TASKLETS=Y BL=Z VERSION=SINGLE make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"],
-                "SCAN-SSA" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"],
-                "SCAN-RSS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"],
-                "TRNS"     : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -w 0 -e 1 -p #elements -o 12288 -x 0"],}
-
-def run(app_name):
-    
-    NR_DPUS = [1, 4, 16, 64]
-    NR_TASKLETS = [1, 2, 4, 8, 16]
-    size = 1
-    BL = [10] 
-    if(app_name == "VA"):
-        size = 2621440
-    if(app_name == "GEMV"):
-        size = 1024
-    if(app_name == "SEL" or app_name == "UNI" or app_name == "SCAN-SSA" or app_name == "SCAN-RSS"):
-        size = 3932160
-    if(app_name == "TS"):
-        size = 524288 
-    if(app_name == "BS"):
-        size = 262144
-    if(app_name == "MLP"):
-        size = 1024
-    if(app_name == "RED"):
-        size = 6553600
-    if(app_name == "TRNS"):
-        size = 1
-
-
-    if app_name in applications:
-        print ("------------------------ Running: "+app_name+"----------------------")
-        print ("--------------------------------------------------------------------")
-        if(len(applications[app_name]) > 1):
-            make = applications[app_name][0]
-            run_cmd = applications[app_name][1]
-        
-            os.chdir(rootdir + "/"+app_name)
-            os.getcwd()
-        
-            os.system("make clean")
-
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/bin")
-            except OSError:
-                print ("Creation of the direction /bin failed")
-                
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/log")
-            except OSError:
-                print ("Creation of the direction /log failed")
-            
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/log/host")
-            except OSError: 
-                print ("Creation of the direction /log/host failed")
-
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/profile")
-            except OSError:
-                print ("Creation of the direction /profile failed")
-        
-
-            for r in NR_DPUS:
-                for t in NR_TASKLETS:
-                    for b in BL:
-                        m = make.replace("X", str(r))
-                        m = m.replace("Y", str(t))
-                        m = m.replace("Z", str(b))
-                        print ("Running = " + m) 
-                        try:
-                            os.system(m)
-                        except: 
-                            pass 
-
-                        if(app_name == "NW"):
-                            if(r == 1):
-                                r_cmd = run_cmd.replace("#elements", str(512))
-                            if(r == 4):
-                                r_cmd = run_cmd.replace("#elements", str(2048))
-                            if(r == 16):
-                                r_cmd = run_cmd.replace("#elements", str(8192))
-                            if(r == 64):
-                                r_cmd = run_cmd.replace("#elements", str(32768))
-                        elif(app_name == "GEMV" or app_name == "MLP" or app_name == "TS" or app_name == "BS"):
-                            r_cmd = run_cmd.replace("#elements", str(r * size))
-                        else:
-                            r_cmd = run_cmd.replace("#elements", str(size))
-                        if(app_name == "BFS"):
-                            if(r == 1):
-                                # Generate rMat graphs using:
-                                # https://github.com/cmuparlay/pbbsbench/blob/master/testData/graphData/rMatGraph.html
-                                # https://github.com/cmuparlay/pbbsbench/blob/master/testData/graphData/rMatGraph.C
-                                r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - rMat graph
-                            if(r == 4):
-                                r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - rMat graph
-                            if(r == 16):
-                                r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - rMat graph
-                            if(r == 64):
-                                r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - rMat graph
-                        if(app_name == "SpMV"):
-                            if(r == 1):
-                                r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - Check SpMV/data/generate
-                            if(r == 4):
-                                r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - Check SpMV/data/generate
-                            if(r == 16):
-                                r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - Check SpMV/data/generate
-                            if(r == 64):
-                                r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - Check SpMV/data/generate
-                        r_cmd = r_cmd +  " >> profile/out_tl"+str(t)+"_bl"+str(b)+"_dpus"+str(r) 
-                        
-                        print ("Running = " + app_name + " -> "+ r_cmd)
-                        try:
-                            os.system(r_cmd) 
-                        except:  
-                            pass 
-        else:
-            make = applications[app_name] 
-
-            os.chdir(rootdir + "/"+app_name)
-            os.getcwd()
-        
-            try:
-                os.mkdir(rootdir + "/"+ app_name +"/bin")
-                os.mkdir(rootdir + "/"+ app_name +"/log")
-                os.mkdir(rootdir + "/"+ app_name +"/log/host")
-                os.mkdir(rootdir + "/"+ app_name +"/profile")
-            except OSError:
-                print ("Creation of the direction failed")
-
-            print (make)    
-            os.system(make + ">& profile/out")
-
-    else:
-        print ( "Application "+app_name+" not available" )
-
-def main():
-    if(len(sys.argv) < 2):
-        print ("Usage: python run.py application")
-        print ("Applications available: ")
-        for key, value in applications.items():
-            print (key )
-        print ("All")
-
-    else:
-        cmd = sys.argv[1]
-        print ("Application to run is: " + cmd )
-        if cmd == "All":
-            for key, value in applications.items():
-                run(key)
-                os.chdir(rootdir)
-        else:
-            run(cmd)
-
-if __name__ == "__main__":
-    main()
diff --git a/set-root-dir.sh b/set-root-dir.sh
deleted file mode 100755
index 35be69f..0000000
--- a/set-root-dir.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-
-sed -i 's!rootdir = "/"!rootdir = "'"$(pwd)"'"!' *.py