BFS: indent -linux

author: Birte Kristina Friesel <birte.friesel@uos.de> 2025-01-16 08:04:02 +0100
committer: Birte Kristina Friesel <birte.friesel@uos.de> 2025-01-16 08:04:02 +0100
commit: 4afee0981b81b09104bba9745d09795b85957f27 (patch)
tree: 90d41a977613b983c9bec29c10956d5af20dad24 /BFS
parent: f0ec6cec9304ca0dd558cf8ec5777d0bb2da4307 (diff)
9 files changed, 792 insertions, 629 deletions
diff --git a/BFS/dpu/dpu-utils.h b/BFS/dpu/dpu-utils.h
index b02c073..dc986d2 100644
--- a/BFS/dpu/dpu-utils.h
+++ b/BFS/dpu/dpu-utils.h
@@ -6,39 +6,46 @@
 
 #define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m   "fmt"\n", ##__VA_ARGS__)
 
-static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
-    mram_read((__mram_ptr void const*)(ptr_m + idx*sizeof(uint64_t)), cache_w, 8);
-    return cache_w[0];
+static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w)
+{
+	mram_read((__mram_ptr void const *)(ptr_m + idx * sizeof(uint64_t)),
+		  cache_w, 8);
+	return cache_w[0];
 }
 
-static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
-    cache_w[0] = val;
-    mram_write(cache_w, (__mram_ptr void*)(ptr_m + idx*sizeof(uint64_t)), 8);
+static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx,
+		    uint64_t *cache_w)
+{
+	cache_w[0] = val;
+	mram_write(cache_w, (__mram_ptr void *)(ptr_m + idx * sizeof(uint64_t)),
+		   8);
 }
 
-static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
-    // Load 8B
-    uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t);
-    uint32_t offset = ((uint32_t)ptr_idx_m)%8;
-    uint32_t ptr_block_m = ptr_idx_m - offset;
-    mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8);
-    // Extract 4B
-    uint32_t* cache_32_w = (uint32_t*) cache_w;
-    return cache_32_w[offset/4];
+static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w)
+{
+	// Load 8B
+	uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t);
+	uint32_t offset = ((uint32_t) ptr_idx_m) % 8;
+	uint32_t ptr_block_m = ptr_idx_m - offset;
+	mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8);
+	// Extract 4B
+	uint32_t *cache_32_w = (uint32_t *) cache_w;
+	return cache_32_w[offset / 4];
 }
 
-static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
-    // Load 8B
-    uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t);
-    uint32_t offset = ((uint32_t)ptr_idx_m)%8;
-    uint32_t ptr_block_m = ptr_idx_m - offset;
-    mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8);
-    // Modify 4B
-    uint32_t* cache_32_w = (uint32_t*) cache_w;
-    cache_32_w[offset/4] = val;
-    // Write back 8B
-    mram_write(cache_w, (__mram_ptr void*)ptr_block_m, 8);
+static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx,
+		    uint64_t *cache_w)
+{
+	// Load 8B
+	uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t);
+	uint32_t offset = ((uint32_t) ptr_idx_m) % 8;
+	uint32_t ptr_block_m = ptr_idx_m - offset;
+	mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8);
+	// Modify 4B
+	uint32_t *cache_32_w = (uint32_t *) cache_w;
+	cache_32_w[offset / 4] = val;
+	// Write back 8B
+	mram_write(cache_w, (__mram_ptr void *)ptr_block_m, 8);
 }
 
 #endif
-
diff --git a/BFS/dpu/task.c b/BFS/dpu/task.c
index 43a2d0f..44ec214 100644
--- a/BFS/dpu/task.c
+++ b/BFS/dpu/task.c
@@ -20,127 +20,155 @@ BARRIER_INIT(bfsBarrier, NR_TASKLETS);
 MUTEX_INIT(nextFrontierMutex);
 
 // main
-int main() {
-
-    if(me() == 0) {
-        mem_reset(); // Reset the heap
-    }
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    // Load parameters
-    uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
-    struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-    mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-
-    // Extract parameters
-    uint32_t numGlobalNodes = params_w->numNodes;
-    uint32_t startNodeIdx = params_w->dpuStartNodeIdx;
-    uint32_t numNodes = params_w->dpuNumNodes;
-    uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset;
-    uint32_t level = params_w->level;
-    uint32_t nodePtrs_m = params_w->dpuNodePtrs_m;
-    uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m;
-    uint32_t nodeLevel_m = params_w->dpuNodeLevel_m;
-    uint32_t visited_m = params_w->dpuVisited_m;
-    uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m;
-    uint32_t nextFrontier_m = params_w->dpuNextFrontier_m;
-
-    if(numNodes > 0) {
-
-        // Sanity check
-        if(me() == 0) {
-            if(numGlobalNodes%64 != 0) {
-                //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!");
-            }
-            if(startNodeIdx%64 != 0 || numNodes%64 != 0) {
-                //PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!");
-            }
-        }
-
-        // Allocate WRAM cache for each tasklet to use throughout
-        uint64_t* cache_w = mem_alloc(sizeof(uint64_t));
-
-        // Update current frontier and visited list based on the next frontier from the previous iteration
-        for(uint32_t nodeTileIdx = me(); nodeTileIdx < numGlobalNodes/64; nodeTileIdx += NR_TASKLETS) {
-
-            // Get the next frontier tile from MRAM
-            uint64_t nextFrontierTile = load8B(nextFrontier_m, nodeTileIdx, cache_w);
-
-            // Process next frontier tile if it is not empty 
-            if(nextFrontierTile) {
-
-                // Mark everything that was previously added to the next frontier as visited
-                uint64_t visitedTile = load8B(visited_m, nodeTileIdx, cache_w);
-                visitedTile |= nextFrontierTile;
-                store8B(visitedTile, visited_m, nodeTileIdx, cache_w);
-
-                // Clear the next frontier
-                store8B(0, nextFrontier_m, nodeTileIdx, cache_w);
-
-            }
-
-            // Extract the current frontier from the previous next frontier and update node levels
-            uint32_t startTileIdx = startNodeIdx/64;
-            uint32_t numTiles = numNodes/64;
-            if(startTileIdx <= nodeTileIdx && nodeTileIdx < startTileIdx + numTiles) {
-
-                // Update current frontier
-                store8B(nextFrontierTile, currentFrontier_m, nodeTileIdx - startTileIdx, cache_w);
-
-                // Update node levels
-                if(nextFrontierTile) {
-                    for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
-                        if(isSet(nextFrontierTile, node%64)) {
-                            store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks
-                        }
-                    }
-                }
-            }
-
-        }
-
-        // Wait until all tasklets have updated the current frontier
-        barrier_wait(&bfsBarrier);
-
-        // Identify tasklet's nodes
-        uint32_t numNodesPerTasklet = (numNodes + NR_TASKLETS - 1)/NR_TASKLETS;
-        uint32_t taskletNodesStart = me()*numNodesPerTasklet;
-        uint32_t taskletNumNodes;
-        if(taskletNodesStart > numNodes) {
-            taskletNumNodes = 0;
-        } else if(taskletNodesStart + numNodesPerTasklet > numNodes) {
-            taskletNumNodes = numNodes - taskletNodesStart;
-        } else {
-            taskletNumNodes = numNodesPerTasklet;
-        }
-
-        // Visit neighbors of the current frontier
-        mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex);
-        for(uint32_t node = taskletNodesStart; node < taskletNodesStart + taskletNumNodes; ++node) {
-            uint32_t nodeTileIdx = node/64;
-            uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile
-            if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier
-                // Visit its neighbors
-                uint32_t nodePtr = load4B(nodePtrs_m, node, cache_w) - nodePtrsOffset;
-                uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr
-                for(uint32_t i = nodePtr; i < nextNodePtr; ++i) {
-                    uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader
-                    uint32_t neighborTileIdx = neighbor/64;
-                    uint64_t visitedTile = load8B(visited_m, neighborTileIdx, cache_w);
-                    if(!isSet(visitedTile, neighbor%64)) { // Neighbor not previously visited
-                        // Add neighbor to next frontier
-                        mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention
-                        uint64_t nextFrontierTile = load8B(nextFrontier_m, neighborTileIdx, cache_w);
-                        setBit(nextFrontierTile, neighbor%64);
-                        store8B(nextFrontierTile, nextFrontier_m, neighborTileIdx, cache_w);
-                        mutex_unlock(mutexID);
-                    }
-                }
-            }
-        }
-
-    }
-
-    return 0;
+int main()
+{
+
+	if (me() == 0) {
+		mem_reset();	// Reset the heap
+	}
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	// Load parameters
+	uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+	struct DPUParams *params_w =
+	    (struct DPUParams *)
+	    mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+	mram_read((__mram_ptr void const *)params_m, params_w,
+		  ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+
+	// Extract parameters
+	uint32_t numGlobalNodes = params_w->numNodes;
+	uint32_t startNodeIdx = params_w->dpuStartNodeIdx;
+	uint32_t numNodes = params_w->dpuNumNodes;
+	uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset;
+	uint32_t level = params_w->level;
+	uint32_t nodePtrs_m = params_w->dpuNodePtrs_m;
+	uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m;
+	uint32_t nodeLevel_m = params_w->dpuNodeLevel_m;
+	uint32_t visited_m = params_w->dpuVisited_m;
+	uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m;
+	uint32_t nextFrontier_m = params_w->dpuNextFrontier_m;
+
+	if (numNodes > 0) {
+
+		// Sanity check
+		if (me() == 0) {
+			if (numGlobalNodes % 64 != 0) {
+				//PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!");
+			}
+			if (startNodeIdx % 64 != 0 || numNodes % 64 != 0) {
+				//PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!");
+			}
+		}
+		// Allocate WRAM cache for each tasklet to use throughout
+		uint64_t *cache_w = mem_alloc(sizeof(uint64_t));
+
+		// Update current frontier and visited list based on the next frontier from the previous iteration
+		for (uint32_t nodeTileIdx = me();
+		     nodeTileIdx < numGlobalNodes / 64;
+		     nodeTileIdx += NR_TASKLETS) {
+
+			// Get the next frontier tile from MRAM
+			uint64_t nextFrontierTile =
+			    load8B(nextFrontier_m, nodeTileIdx, cache_w);
+
+			// Process next frontier tile if it is not empty 
+			if (nextFrontierTile) {
+
+				// Mark everything that was previously added to the next frontier as visited
+				uint64_t visitedTile =
+				    load8B(visited_m, nodeTileIdx, cache_w);
+				visitedTile |= nextFrontierTile;
+				store8B(visitedTile, visited_m, nodeTileIdx,
+					cache_w);
+
+				// Clear the next frontier
+				store8B(0, nextFrontier_m, nodeTileIdx,
+					cache_w);
+
+			}
+			// Extract the current frontier from the previous next frontier and update node levels
+			uint32_t startTileIdx = startNodeIdx / 64;
+			uint32_t numTiles = numNodes / 64;
+			if (startTileIdx <= nodeTileIdx
+			    && nodeTileIdx < startTileIdx + numTiles) {
+
+				// Update current frontier
+				store8B(nextFrontierTile, currentFrontier_m,
+					nodeTileIdx - startTileIdx, cache_w);
+
+				// Update node levels
+				if (nextFrontierTile) {
+					for (uint32_t node = nodeTileIdx * 64;
+					     node < (nodeTileIdx + 1) * 64;
+					     ++node) {
+						if (isSet
+						    (nextFrontierTile,
+						     node % 64)) {
+							store4B(level, nodeLevel_m, node - startNodeIdx, cache_w);	// No false sharing so no need for locks
+						}
+					}
+				}
+			}
+
+		}
+
+		// Wait until all tasklets have updated the current frontier
+		barrier_wait(&bfsBarrier);
+
+		// Identify tasklet's nodes
+		uint32_t numNodesPerTasklet =
+		    (numNodes + NR_TASKLETS - 1) / NR_TASKLETS;
+		uint32_t taskletNodesStart = me() * numNodesPerTasklet;
+		uint32_t taskletNumNodes;
+		if (taskletNodesStart > numNodes) {
+			taskletNumNodes = 0;
+		} else if (taskletNodesStart + numNodesPerTasklet > numNodes) {
+			taskletNumNodes = numNodes - taskletNodesStart;
+		} else {
+			taskletNumNodes = numNodesPerTasklet;
+		}
+
+		// Visit neighbors of the current frontier
+		mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex);
+		for (uint32_t node = taskletNodesStart;
+		     node < taskletNodesStart + taskletNumNodes; ++node) {
+			uint32_t nodeTileIdx = node / 64;
+			uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w);	// TODO: Optimize: load tile then loop over nodes in the tile
+			if (isSet(currentFrontierTile, node % 64)) {	// If the node is in the current frontier
+				// Visit its neighbors
+				uint32_t nodePtr =
+				    load4B(nodePtrs_m, node,
+					   cache_w) - nodePtrsOffset;
+				uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset;	// TODO: Optimize: might be in the same 8B as nodePtr
+				for (uint32_t i = nodePtr; i < nextNodePtr; ++i) {
+					uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w);	// TODO: Optimize: sequential access to neighbors can use sequential reader
+					uint32_t neighborTileIdx =
+					    neighbor / 64;
+					uint64_t visitedTile =
+					    load8B(visited_m, neighborTileIdx,
+						   cache_w);
+					if (!isSet(visitedTile, neighbor % 64)) {	// Neighbor not previously visited
+						// Add neighbor to next frontier
+						mutex_lock(mutexID);	// TODO: Optimize: use more locks to reduce contention
+						uint64_t nextFrontierTile =
+						    load8B(nextFrontier_m,
+							   neighborTileIdx,
+							   cache_w);
+						setBit(nextFrontierTile,
+						       neighbor % 64);
+						store8B(nextFrontierTile,
+							nextFrontier_m,
+							neighborTileIdx,
+							cache_w);
+						mutex_unlock(mutexID);
+					}
+				}
+			}
+		}
+
+	}
+
+	return 0;
 }
diff --git a/BFS/host/app.c b/BFS/host/app.c
index 126fb7c..9ba7ffb 100644
--- a/BFS/host/app.c
+++ b/BFS/host/app.c
@@ -30,341 +30,429 @@
 #define DPU_BINARY "./bin/dpu_code"
 
 // Main of the Host Application
-int main(int argc, char** argv) {
+int main(int argc, char **argv)
+{
 
-    // Process parameters
-    struct Params p = input_params(argc, argv);
+	// Process parameters
+	struct Params p = input_params(argc, argv);
 
-    // Timer and profiling
-    Timer timer;
-    #if ENERGY
-    struct dpu_probe_t probe;
-    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
-    double tenergy=0;
-    #endif
+	// Timer and profiling
+	Timer timer;
+#if ENERGY
+	struct dpu_probe_t probe;
+	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+	double tenergy = 0;
+#endif
 
-    printf("WITH_ALLOC_OVERHEAD=%d WITH_LOAD_OVERHEAD=%d WITH_FREE_OVERHEAD=%d\n", WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD);
+	printf
+	    ("WITH_ALLOC_OVERHEAD=%d WITH_LOAD_OVERHEAD=%d WITH_FREE_OVERHEAD=%d\n",
+	     WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD);
 
-    // Allocate DPUs and load binary
-    struct dpu_set_t dpu_set, dpu;
-    uint32_t numDPUs, numRanks;
+	// Allocate DPUs and load binary
+	struct dpu_set_t dpu_set, dpu;
+	uint32_t numDPUs, numRanks;
 
 #if WITH_ALLOC_OVERHEAD
-    startTimer(&timer, 0, 0);
+	startTimer(&timer, 0, 0);
 #endif
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
 #if WITH_ALLOC_OVERHEAD
-    stopTimer(&timer, 0);
+	stopTimer(&timer, 0);
 #else
-    timer.time[0] = 0;
+	timer.time[0] = 0;
 #endif
 
 #if WITH_LOAD_OVERHEAD
-    startTimer(&timer, 1, 0);
+	startTimer(&timer, 1, 0);
 #endif
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
 #if WITH_LOAD_OVERHEAD
-    stopTimer(&timer, 0);
+	stopTimer(&timer, 0);
 #else
-    timer.time[1] = 0;
+	timer.time[1] = 0;
 #endif
 
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
-    assert(NR_DPUS == numDPUs);
-    PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
-    // Initialize BFS data structures
-    PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
-    struct COOGraph cooGraph = readCOOGraph(p.fileName);
-    PRINT_INFO(p.verbosity >= 1, "    Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
-    struct CSRGraph csrGraph = coo2csr(cooGraph);
-    uint32_t numNodes = csrGraph.numNodes;
-    uint32_t* nodePtrs = csrGraph.nodePtrs;
-    uint32_t* neighborIdxs = csrGraph.neighborIdxs;
-    uint32_t* nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
-    uint64_t* visited = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
-    uint64_t* currentFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
-    uint64_t* nextFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
-    setBit(nextFrontier[0], 0); // Initialize frontier to first node
-    uint32_t level = 1;
-
-    // Partition data structure across DPUs
-    uint32_t numNodesPerDPU = ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1)/numDPUs + 1);
-    PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU", numNodesPerDPU);
-    struct DPUParams dpuParams[numDPUs];
-    uint32_t dpuParams_m[numDPUs];
-    unsigned int dpuIdx = 0;
-    unsigned int t0ini = 0;
-    unsigned int t1ini = 0;
-    unsigned int t2ini = 0;
-    unsigned int t3ini = 0;
-    DPU_FOREACH (dpu_set, dpu) {
-
-        // Allocate parameters
-        struct mram_heap_allocator_t allocator;
-        init_allocator(&allocator);
-        dpuParams_m[dpuIdx] = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
-        // Find DPU's nodes
-        uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
-        uint32_t dpuNumNodes;
-        if(dpuStartNodeIdx > numNodes) {
-            dpuNumNodes = 0;
-        } else if(dpuStartNodeIdx + numNodesPerDPU > numNodes) {
-            dpuNumNodes = numNodes - dpuStartNodeIdx;
-        } else {
-            dpuNumNodes = numNodesPerDPU;
-        }
-        dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
-        PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
-        PRINT_INFO(p.verbosity >= 2, "        Receives %u nodes", dpuNumNodes);
-
-        // Partition edges and copy data
-        if(dpuNumNodes > 0) {
-
-            // Find DPU's CSR graph partition
-            uint32_t* dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
-            uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
-            uint32_t* dpuNeighborIdxs_h = neighborIdxs + dpuNodePtrsOffset;
-            uint32_t dpuNumNeighbors = dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
-            uint32_t* dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
-
-            // Allocate MRAM
-            uint32_t dpuNodePtrs_m = mram_heap_alloc(&allocator, (dpuNumNodes + 1)*sizeof(uint32_t));
-            uint32_t dpuNeighborIdxs_m = mram_heap_alloc(&allocator, dpuNumNeighbors*sizeof(uint32_t));
-            uint32_t dpuNodeLevel_m = mram_heap_alloc(&allocator, dpuNumNodes*sizeof(uint32_t));
-            uint32_t dpuVisited_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
-            uint32_t dpuCurrentFrontier_m = mram_heap_alloc(&allocator, dpuNumNodes/64*sizeof(uint64_t));
-            uint32_t dpuNextFrontier_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
-            PRINT_INFO(p.verbosity >= 2, "        Total memory allocated is %d bytes", allocator.totalAllocated);
-
-            // Set up DPU parameters
-            dpuParams[dpuIdx].numNodes = numNodes;
-            dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
-            dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
-            dpuParams[dpuIdx].level = level;
-            dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
-            dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
-            dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
-            dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
-            dpuParams[dpuIdx].dpuCurrentFrontier_m = dpuCurrentFrontier_m;
-            dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
-
-            // Send data to DPU
-            PRINT_INFO(p.verbosity >= 2, "        Copying data to DPU");
-            startTimer(&timer, 2, t0ini++);
-            copyToDPU(dpu, (uint8_t*)dpuNodePtrs_h, dpuNodePtrs_m, (dpuNumNodes + 1)*sizeof(uint32_t));
-            copyToDPU(dpu, (uint8_t*)dpuNeighborIdxs_h, dpuNeighborIdxs_m, dpuNumNeighbors*sizeof(uint32_t));
-            copyToDPU(dpu, (uint8_t*)dpuNodeLevel_h, dpuNodeLevel_m, dpuNumNodes*sizeof(uint32_t));
-            copyToDPU(dpu, (uint8_t*)visited, dpuVisited_m, numNodes/64*sizeof(uint64_t));
-            copyToDPU(dpu, (uint8_t*)nextFrontier, dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
-            // NOTE: No need to copy current frontier because it is written before being read
-            stopTimer(&timer, 2);
-            //loadTime += getElapsedTime(timer);
-
-        }
-
-        // Send parameters to DPU
-        PRINT_INFO(p.verbosity >= 2, "        Copying parameters to DPU");
-        startTimer(&timer, 2, t1ini++);
-        copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
-        stopTimer(&timer, 2);
-        //loadTime += getElapsedTime(timer);
-
-        ++dpuIdx;
-
-    }
-
-    // Iterate until next frontier is empty
-    uint32_t nextFrontierEmpty = 0;
-    while(!nextFrontierEmpty) {
-
-        PRINT_INFO(p.verbosity >= 1, "Processing current frontier for level %u", level);
-
-	#if ENERGY
-	DPU_ASSERT(dpu_probe_start(&probe));
-	#endif
-        // Run all DPUs
-        PRINT_INFO(p.verbosity >= 1, "    Booting DPUs");
-        startTimer(&timer, 3, t2ini++);
-        DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
-        stopTimer(&timer, 3);
-        //dpuTime += getElapsedTime(timer);
-	#if ENERGY
-    	DPU_ASSERT(dpu_probe_stop(&probe));
-    	double energy;
-    	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
-	tenergy += energy;
-	#endif
-
-
-
-        // Copy back next frontier from all DPUs and compute their union as the current frontier
-        startTimer(&timer, 4, t3ini++);
-        dpuIdx = 0;
-        DPU_FOREACH (dpu_set, dpu) {
-            uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
-            if(dpuNumNodes > 0) {
-                if(dpuIdx == 0) {
-                    copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)currentFrontier, numNodes/64*sizeof(uint64_t));
-                } else {
-                    copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)nextFrontier, numNodes/64*sizeof(uint64_t));
-                    for(uint32_t i = 0; i < numNodes/64; ++i) {
-                        currentFrontier[i] |= nextFrontier[i];
-                    }
-                }
-                ++dpuIdx;
-            }
-        }
-
-        // Check if the next frontier is empty, and copy data to DPU if not empty
-        nextFrontierEmpty = 1;
-        for(uint32_t i = 0; i < numNodes/64; ++i) {
-            if(currentFrontier[i]) {
-                nextFrontierEmpty = 0;
-                break;
-            }
-        }
-        if(!nextFrontierEmpty) {
-            ++level;
-            dpuIdx = 0;
-            DPU_FOREACH (dpu_set, dpu) {
-                uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
-                if(dpuNumNodes > 0) {
-                    // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
-                    copyToDPU(dpu, (uint8_t*)currentFrontier, dpuParams[dpuIdx].dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
-                    // Copy new level to DPU
-                    dpuParams[dpuIdx].level = level;
-                    copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
-                    ++dpuIdx;
-                }
-            }
-        }
-        stopTimer(&timer, 4);
-        //hostTime += getElapsedTime(timer);
-
-    }
-
-    // Copy back node levels
-    PRINT_INFO(p.verbosity >= 1, "Copying back the result");
-    startTimer(&timer, 5, 0);
-    dpuIdx = 0;
-    DPU_FOREACH (dpu_set, dpu) {
-        uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
-        if(dpuNumNodes > 0) {
-            uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
-            copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m, (uint8_t*)(nodeLevel + dpuStartNodeIdx), dpuNumNodes*sizeof(float));
-        }
-        ++dpuIdx;
-    }
-    stopTimer(&timer, 5);
-    //retrieveTime += getElapsedTime(timer);
-    //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f    DPU Kernel Time (ms): %f    Inter-DPU Time (ms): %f    DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
-
-    // Calculating result on CPU
-    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
-    uint32_t* nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
-    memset(nextFrontier, 0, numNodes/64*sizeof(uint64_t));
-    setBit(nextFrontier[0], 0); // Initialize frontier to first node
-    nextFrontierEmpty = 0;
-    level = 1;
-    startTimer(&timer, 6, 0);
-    while(!nextFrontierEmpty) {
-        // Update current frontier and visited list based on the next frontier from the previous iteration
-        for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
-            uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
-            currentFrontier[nodeTileIdx] = nextFrontierTile;
-            if(nextFrontierTile) {
-                visited[nodeTileIdx] |= nextFrontierTile;
-                nextFrontier[nodeTileIdx] = 0;
-                for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
-                    if(isSet(nextFrontierTile, node%64)) {
-                        nodeLevelReference[node] = level;
-                    }
-                }
-            }
-        }
-        // Visit neighbors of the current frontier
-        nextFrontierEmpty = 1;
-        for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
-            uint64_t currentFrontierTile = currentFrontier[nodeTileIdx];
-            if(currentFrontierTile) {
-                for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
-                    if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier
-                        // Visit its neighbors
-                        uint32_t nodePtr = nodePtrs[node];
-                        uint32_t nextNodePtr = nodePtrs[node + 1];
-                        for(uint32_t i = nodePtr; i < nextNodePtr; ++i) {
-                            uint32_t neighbor = neighborIdxs[i];
-                            if(!isSet(visited[neighbor/64], neighbor%64)) { // Neighbor not previously visited
-                                // Add neighbor to next frontier
-                                setBit(nextFrontier[neighbor/64], neighbor%64);
-                                nextFrontierEmpty = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        ++level;
-    }
-    stopTimer(&timer, 6);
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+	assert(NR_DPUS == numDPUs);
+	PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+	// Initialize BFS data structures
+	PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
+	struct COOGraph cooGraph = readCOOGraph(p.fileName);
+	PRINT_INFO(p.verbosity >= 1, "    Graph has %d nodes and %d edges",
+		   cooGraph.numNodes, cooGraph.numEdges);
+	struct CSRGraph csrGraph = coo2csr(cooGraph);
+	uint32_t numNodes = csrGraph.numNodes;
+	uint32_t *nodePtrs = csrGraph.nodePtrs;
+	uint32_t *neighborIdxs = csrGraph.neighborIdxs;
+	uint32_t *nodeLevel = calloc(numNodes, sizeof(uint32_t));	// Node's BFS level (initially all 0 meaning not reachable)
+	uint64_t *visited = calloc(numNodes / 64, sizeof(uint64_t));	// Bit vector with one bit per node
+	uint64_t *currentFrontier = calloc(numNodes / 64, sizeof(uint64_t));	// Bit vector with one bit per node
+	uint64_t *nextFrontier = calloc(numNodes / 64, sizeof(uint64_t));	// Bit vector with one bit per node
+	setBit(nextFrontier[0], 0);	// Initialize frontier to first node
+	uint32_t level = 1;
+
+	// Partition data structure across DPUs
+	uint32_t numNodesPerDPU =
+	    ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1) / numDPUs + 1);
+	PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU",
+		   numNodesPerDPU);
+	struct DPUParams dpuParams[numDPUs];
+	uint32_t dpuParams_m[numDPUs];
+	unsigned int dpuIdx = 0;
+	unsigned int t0ini = 0;
+	unsigned int t1ini = 0;
+	unsigned int t2ini = 0;
+	unsigned int t3ini = 0;
+	DPU_FOREACH(dpu_set, dpu) {
+
+		// Allocate parameters
+		struct mram_heap_allocator_t allocator;
+		init_allocator(&allocator);
+		dpuParams_m[dpuIdx] =
+		    mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+		// Find DPU's nodes
+		uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+		uint32_t dpuNumNodes;
+		if (dpuStartNodeIdx > numNodes) {
+			dpuNumNodes = 0;
+		} else if (dpuStartNodeIdx + numNodesPerDPU > numNodes) {
+			dpuNumNodes = numNodes - dpuStartNodeIdx;
+		} else {
+			dpuNumNodes = numNodesPerDPU;
+		}
+		dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
+		PRINT_INFO(p.verbosity >= 2, "    DPU %u:", dpuIdx);
+		PRINT_INFO(p.verbosity >= 2, "        Receives %u nodes",
+			   dpuNumNodes);
+
+		// Partition edges and copy data
+		if (dpuNumNodes > 0) {
+
+			// Find DPU's CSR graph partition
+			uint32_t *dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
+			uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
+			uint32_t *dpuNeighborIdxs_h =
+			    neighborIdxs + dpuNodePtrsOffset;
+			uint32_t dpuNumNeighbors =
+			    dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
+			uint32_t *dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
+
+			// Allocate MRAM
+			uint32_t dpuNodePtrs_m =
+			    mram_heap_alloc(&allocator,
+					    (dpuNumNodes +
+					     1) * sizeof(uint32_t));
+			uint32_t dpuNeighborIdxs_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumNeighbors * sizeof(uint32_t));
+			uint32_t dpuNodeLevel_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumNodes * sizeof(uint32_t));
+			uint32_t dpuVisited_m =
+			    mram_heap_alloc(&allocator,
+					    numNodes / 64 * sizeof(uint64_t));
+			uint32_t dpuCurrentFrontier_m =
+			    mram_heap_alloc(&allocator,
+					    dpuNumNodes / 64 *
+					    sizeof(uint64_t));
+			uint32_t dpuNextFrontier_m =
+			    mram_heap_alloc(&allocator,
+					    numNodes / 64 * sizeof(uint64_t));
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Total memory allocated is %d bytes",
+				   allocator.totalAllocated);
+
+			// Set up DPU parameters
+			dpuParams[dpuIdx].numNodes = numNodes;
+			dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
+			dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
+			dpuParams[dpuIdx].level = level;
+			dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
+			dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
+			dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
+			dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
+			dpuParams[dpuIdx].dpuCurrentFrontier_m =
+			    dpuCurrentFrontier_m;
+			dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
+
+			// Send data to DPU
+			PRINT_INFO(p.verbosity >= 2,
+				   "        Copying data to DPU");
+			startTimer(&timer, 2, t0ini++);
+			copyToDPU(dpu, (uint8_t *) dpuNodePtrs_h, dpuNodePtrs_m,
+				  (dpuNumNodes + 1) * sizeof(uint32_t));
+			copyToDPU(dpu, (uint8_t *) dpuNeighborIdxs_h,
+				  dpuNeighborIdxs_m,
+				  dpuNumNeighbors * sizeof(uint32_t));
+			copyToDPU(dpu, (uint8_t *) dpuNodeLevel_h,
+				  dpuNodeLevel_m,
+				  dpuNumNodes * sizeof(uint32_t));
+			copyToDPU(dpu, (uint8_t *) visited, dpuVisited_m,
+				  numNodes / 64 * sizeof(uint64_t));
+			copyToDPU(dpu, (uint8_t *) nextFrontier,
+				  dpuNextFrontier_m,
+				  numNodes / 64 * sizeof(uint64_t));
+			// NOTE: No need to copy current frontier because it is written before being read
+			stopTimer(&timer, 2);
+			//loadTime += getElapsedTime(timer);
+
+		}
+		// Send parameters to DPU
+		PRINT_INFO(p.verbosity >= 2,
+			   "        Copying parameters to DPU");
+		startTimer(&timer, 2, t1ini++);
+		copyToDPU(dpu, (uint8_t *) & dpuParams[dpuIdx],
+			  dpuParams_m[dpuIdx], sizeof(struct DPUParams));
+		stopTimer(&timer, 2);
+		//loadTime += getElapsedTime(timer);
+
+		++dpuIdx;
+
+	}
+
+	// Iterate until next frontier is empty
+	uint32_t nextFrontierEmpty = 0;
+	while (!nextFrontierEmpty) {
+
+		PRINT_INFO(p.verbosity >= 1,
+			   "Processing current frontier for level %u", level);
+
+#if ENERGY
+		DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+		// Run all DPUs
+		PRINT_INFO(p.verbosity >= 1, "    Booting DPUs");
+		startTimer(&timer, 3, t2ini++);
+		DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+		stopTimer(&timer, 3);
+		//dpuTime += getElapsedTime(timer);
+#if ENERGY
+		DPU_ASSERT(dpu_probe_stop(&probe));
+		double energy;
+		DPU_ASSERT(dpu_probe_get
+			   (&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+		tenergy += energy;
+#endif
+
+		// Copy back next frontier from all DPUs and compute their union as the current frontier
+		startTimer(&timer, 4, t3ini++);
+		dpuIdx = 0;
+		DPU_FOREACH(dpu_set, dpu) {
+			uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+			if (dpuNumNodes > 0) {
+				if (dpuIdx == 0) {
+					copyFromDPU(dpu,
+						    dpuParams[dpuIdx].
+						    dpuNextFrontier_m,
+						    (uint8_t *) currentFrontier,
+						    numNodes / 64 *
+						    sizeof(uint64_t));
+				} else {
+					copyFromDPU(dpu,
+						    dpuParams[dpuIdx].
+						    dpuNextFrontier_m,
+						    (uint8_t *) nextFrontier,
+						    numNodes / 64 *
+						    sizeof(uint64_t));
+					for (uint32_t i = 0; i < numNodes / 64;
+					     ++i) {
+						currentFrontier[i] |=
+						    nextFrontier[i];
+					}
+				}
+				++dpuIdx;
+			}
+		}
+
+		// Check if the next frontier is empty, and copy data to DPU if not empty
+		nextFrontierEmpty = 1;
+		for (uint32_t i = 0; i < numNodes / 64; ++i) {
+			if (currentFrontier[i]) {
+				nextFrontierEmpty = 0;
+				break;
+			}
+		}
+		if (!nextFrontierEmpty) {
+			++level;
+			dpuIdx = 0;
+			DPU_FOREACH(dpu_set, dpu) {
+				uint32_t dpuNumNodes =
+				    dpuParams[dpuIdx].dpuNumNodes;
+				if (dpuNumNodes > 0) {
+					// Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
+					copyToDPU(dpu,
+						  (uint8_t *) currentFrontier,
+						  dpuParams[dpuIdx].
+						  dpuNextFrontier_m,
+						  numNodes / 64 *
+						  sizeof(uint64_t));
+					// Copy new level to DPU
+					dpuParams[dpuIdx].level = level;
+					copyToDPU(dpu,
+						  (uint8_t *) &
+						  dpuParams[dpuIdx],
+						  dpuParams_m[dpuIdx],
+						  sizeof(struct DPUParams));
+					++dpuIdx;
+				}
+			}
+		}
+		stopTimer(&timer, 4);
+		//hostTime += getElapsedTime(timer);
+
+	}
+
+	// Copy back node levels
+	PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+	startTimer(&timer, 5, 0);
+	dpuIdx = 0;
+	DPU_FOREACH(dpu_set, dpu) {
+		uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+		if (dpuNumNodes > 0) {
+			uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+			copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m,
+				    (uint8_t *) (nodeLevel + dpuStartNodeIdx),
+				    dpuNumNodes * sizeof(float));
+		}
+		++dpuIdx;
+	}
+	stopTimer(&timer, 5);
+	//retrieveTime += getElapsedTime(timer);
+	//if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f    DPU Kernel Time (ms): %f    Inter-DPU Time (ms): %f    DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
+
+	// Calculating result on CPU
+	PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+	uint32_t *nodeLevelReference = calloc(numNodes, sizeof(uint32_t));	// Node's BFS level (initially all 0 meaning not reachable)
+	memset(nextFrontier, 0, numNodes / 64 * sizeof(uint64_t));
+	setBit(nextFrontier[0], 0);	// Initialize frontier to first node
+	nextFrontierEmpty = 0;
+	level = 1;
+	startTimer(&timer, 6, 0);
+	while (!nextFrontierEmpty) {
+		// Update current frontier and visited list based on the next frontier from the previous iteration
+		for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+		     ++nodeTileIdx) {
+			uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
+			currentFrontier[nodeTileIdx] = nextFrontierTile;
+			if (nextFrontierTile) {
+				visited[nodeTileIdx] |= nextFrontierTile;
+				nextFrontier[nodeTileIdx] = 0;
+				for (uint32_t node = nodeTileIdx * 64;
+				     node < (nodeTileIdx + 1) * 64; ++node) {
+					if (isSet(nextFrontierTile, node % 64)) {
+						nodeLevelReference[node] =
+						    level;
+					}
+				}
+			}
+		}
+		// Visit neighbors of the current frontier
+		nextFrontierEmpty = 1;
+		for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+		     ++nodeTileIdx) {
+			uint64_t currentFrontierTile =
+			    currentFrontier[nodeTileIdx];
+			if (currentFrontierTile) {
+				for (uint32_t node = nodeTileIdx * 64;
+				     node < (nodeTileIdx + 1) * 64; ++node) {
+					if (isSet(currentFrontierTile, node % 64)) {	// If the node is in the current frontier
+						// Visit its neighbors
+						uint32_t nodePtr =
+						    nodePtrs[node];
+						uint32_t nextNodePtr =
+						    nodePtrs[node + 1];
+						for (uint32_t i = nodePtr;
+						     i < nextNodePtr; ++i) {
+							uint32_t neighbor =
+							    neighborIdxs[i];
+							if (!isSet(visited[neighbor / 64], neighbor % 64)) {	// Neighbor not previously visited
+								// Add neighbor to next frontier
+								setBit
+								    (nextFrontier
+								     [neighbor /
+								      64],
+								     neighbor %
+								     64);
+								nextFrontierEmpty
+								    = 0;
+							}
+						}
+					}
+				}
+			}
+		}
+		++level;
+	}
+	stopTimer(&timer, 6);
 
 #if WITH_FREE_OVERHEAD
-    startTimer(&timer, 7);
+	startTimer(&timer, 7);
 #endif
-    DPU_ASSERT(dpu_free(dpu_set));
+	DPU_ASSERT(dpu_free(dpu_set));
 #if WITH_FREE_OVERHEAD
-    stopTimer(&timer, 7);
+	stopTimer(&timer, 7);
 #else
-    timer.time[7] = 0;
+	timer.time[7] = 0;
 #endif
 
-    // Verify the result
-    PRINT_INFO(p.verbosity >= 1, "Verifying the result");
-    int status = 1;
-    for(uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
-        if(nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
-            PRINT_ERROR("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", nodeIdx, nodeLevelReference[nodeIdx], nodeLevel[nodeIdx]);
-            status = 0;
-        }
-    }
-
-    if (status) {
-        printf("[::] BFS-UMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d "
-            "| throughput_pim_MBps=%f throughput_MBps=%f",
-            numDPUs, NR_TASKLETS, "uint32_t", numNodes,
-            numNodes * sizeof(uint32_t) / (timer.time[2] + timer.time[3]),
-            numNodes * sizeof(uint32_t) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
-        printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
-            numNodes / (timer.time[2] + timer.time[3]),
-            numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
-        printf(" latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_cpu_us=%f latency_free_us=%f\n",
-            timer.time[0], timer.time[1], timer.time[2], timer.time[3], timer.time[4], timer.time[5], timer.time[6], timer.time[7]);
-    }
-
-    // Display DPU Logs
-    if(p.verbosity >= 2) {
-        PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
-        dpuIdx = 0;
-        DPU_FOREACH (dpu_set, dpu) {
-            PRINT("DPU %u:", dpuIdx);
-            DPU_ASSERT(dpu_log_read(dpu, stdout));
-            ++dpuIdx;
-        }
-    }
-
-    // Deallocate data structures
-    freeCOOGraph(cooGraph);
-    freeCSRGraph(csrGraph);
-    free(nodeLevel);
-    free(visited);
-    free(currentFrontier);
-    free(nextFrontier);
-    free(nodeLevelReference);
-
-    return 0;
+	// Verify the result
+	PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+	int status = 1;
+	for (uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
+		if (nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
+			PRINT_ERROR
+			    ("Mismatch at node %u (CPU result = level %u, DPU result = level %u)",
+			     nodeIdx, nodeLevelReference[nodeIdx],
+			     nodeLevel[nodeIdx]);
+			status = 0;
+		}
+	}
+
+	if (status) {
+		printf
+		    ("[::] BFS-UMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d "
+		     "| throughput_pim_MBps=%f throughput_MBps=%f", numDPUs,
+		     NR_TASKLETS, "uint32_t", numNodes,
+		     numNodes * sizeof(uint32_t) / (timer.time[2] +
+						    timer.time[3]),
+		     numNodes * sizeof(uint32_t) / (timer.time[0] +
+						    timer.time[1] +
+						    timer.time[2] +
+						    timer.time[3] +
+						    timer.time[4]));
+		printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
+		       numNodes / (timer.time[2] + timer.time[3]),
+		       numNodes / (timer.time[0] + timer.time[1] +
+				   timer.time[2] + timer.time[3] +
+				   timer.time[4]));
+		printf
+		    (" latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_cpu_us=%f latency_free_us=%f\n",
+		     timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+		     timer.time[4], timer.time[5], timer.time[6],
+		     timer.time[7]);
+	}
+	// Display DPU Logs
+	if (p.verbosity >= 2) {
+		PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+		dpuIdx = 0;
+		DPU_FOREACH(dpu_set, dpu) {
+			PRINT("DPU %u:", dpuIdx);
+			DPU_ASSERT(dpu_log_read(dpu, stdout));
+			++dpuIdx;
+		}
+	}
+	// Deallocate data structures
+	freeCOOGraph(cooGraph);
+	freeCSRGraph(csrGraph);
+	free(nodeLevel);
+	free(visited);
+	free(currentFrontier);
+	free(nextFrontier);
+	free(nodeLevelReference);
+
+	return 0;
 
 }
-
diff --git a/BFS/host/mram-management.h b/BFS/host/mram-management.h
index 627dfde..f2ee031 100644
--- a/BFS/host/mram-management.h
+++ b/BFS/host/mram-management.h
@@ -5,33 +5,45 @@
 #include "../support/common.h"
 #include "../support/utils.h"
 
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20)	// A DPU's capacity is 64 MiB
 
 struct mram_heap_allocator_t {
-    uint32_t totalAllocated;
+	uint32_t totalAllocated;
 };
 
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
-    allocator->totalAllocated = 0;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+	allocator->totalAllocated = 0;
 }
 
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
-    uint32_t ret = allocator->totalAllocated;
-    allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
-    if(allocator->totalAllocated > DPU_CAPACITY) {
-        PRINT_ERROR("        Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
-        exit(0);
-    }
-    return ret;
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
+				uint32_t size)
+{
+	uint32_t ret = allocator->totalAllocated;
+	allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
+	if (allocator->totalAllocated > DPU_CAPACITY) {
+		PRINT_ERROR
+		    ("        Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!",
+		     allocator->totalAllocated, DPU_CAPACITY);
+		exit(0);
+	}
+	return ret;
 }
 
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
-    DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyToDPU(struct dpu_set_t dpu, uint8_t *hostPtr, uint32_t mramIdx,
+		      uint32_t size)
+{
+	DPU_ASSERT(dpu_copy_to
+		   (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+		    ROUND_UP_TO_MULTIPLE_OF_8(size)));
 }
 
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
-    DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx,
+			uint8_t *hostPtr, uint32_t size)
+{
+	DPU_ASSERT(dpu_copy_from
+		   (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+		    ROUND_UP_TO_MULTIPLE_OF_8(size)));
 }
 
 #endif
-
diff --git a/BFS/support/common.h b/BFS/support/common.h
index ced324c..5f2aa0d 100644
--- a/BFS/support/common.h
+++ b/BFS/support/common.h
@@ -9,18 +9,17 @@
 #define isSet(val, idx)  ((val) & (1 << (idx)))
 
 struct DPUParams {
-    uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */
-    uint32_t numNodes; /* Total number of nodes in the graph  */
-    uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU  */
-    uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */
-    uint32_t level; /* The current BFS level */
-    uint32_t dpuNodePtrs_m;
-    uint32_t dpuNeighborIdxs_m;
-    uint32_t dpuNodeLevel_m;
-    uint32_t dpuVisited_m;
-    uint32_t dpuCurrentFrontier_m;
-    uint32_t dpuNextFrontier_m;
+	uint32_t dpuNumNodes;	/* The number of nodes assigned to this DPU */
+	uint32_t numNodes;	/* Total number of nodes in the graph  */
+	uint32_t dpuStartNodeIdx;	/* The index of the first node assigned to this DPU  */
+	uint32_t dpuNodePtrsOffset;	/* Offset of the node pointers */
+	uint32_t level;		/* The current BFS level */
+	uint32_t dpuNodePtrs_m;
+	uint32_t dpuNeighborIdxs_m;
+	uint32_t dpuNodeLevel_m;
+	uint32_t dpuVisited_m;
+	uint32_t dpuCurrentFrontier_m;
+	uint32_t dpuNextFrontier_m;
 };
 
 #endif
-
diff --git a/BFS/support/graph.h b/BFS/support/graph.h
index f89ff5c..2a19f67 100644
--- a/BFS/support/graph.h
+++ b/BFS/support/graph.h
@@ -9,108 +9,125 @@
 #include "utils.h"
 
 struct COOGraph {
-    uint32_t numNodes;
-    uint32_t numEdges;
-    uint32_t* nodeIdxs;
-    uint32_t* neighborIdxs;
+	uint32_t numNodes;
+	uint32_t numEdges;
+	uint32_t *nodeIdxs;
+	uint32_t *neighborIdxs;
 };
 
 struct CSRGraph {
-    uint32_t numNodes;
-    uint32_t numEdges;
-    uint32_t* nodePtrs;
-    uint32_t* neighborIdxs;
+	uint32_t numNodes;
+	uint32_t numEdges;
+	uint32_t *nodePtrs;
+	uint32_t *neighborIdxs;
 };
 
-static struct COOGraph readCOOGraph(const char* fileName) {
-
-    struct COOGraph cooGraph;
-
-    // Initialize fields
-    FILE* fp = fopen(fileName, "r");
-    uint32_t numNodes, numCols;
-    assert(fscanf(fp, "%u", &numNodes));
-    assert(fscanf(fp, "%u", &numCols));
-    if(numNodes == numCols) {
-        cooGraph.numNodes = numNodes;
-    } else {
-        PRINT_WARNING("    Adjacency matrix is not square. Padding matrix to be square.");
-        cooGraph.numNodes = (numNodes > numCols)? numNodes : numCols;
-    }
-    if(cooGraph.numNodes%64 != 0) {
-        PRINT_WARNING("    Adjacency matrix dimension is %u which is not a multiple of 64 nodes.", cooGraph.numNodes);
-        cooGraph.numNodes += (64 - cooGraph.numNodes%64);
-        PRINT_WARNING("        Padding to %u which is a multiple of 64 nodes.", cooGraph.numNodes);
-    }
-    assert(fscanf(fp, "%u", &cooGraph.numEdges));
-    cooGraph.nodeIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t));
-    cooGraph.neighborIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t));
-
-    // Read the neighborIdxs
-    for(uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) {
-        uint32_t nodeIdx;
-        assert(fscanf(fp, "%u", &nodeIdx));
-        cooGraph.nodeIdxs[edgeIdx] = nodeIdx;
-        uint32_t neighborIdx;
-        assert(fscanf(fp, "%u", &neighborIdx));
-        cooGraph.neighborIdxs[edgeIdx] = neighborIdx;
-    }
-
-    return cooGraph;
+static struct COOGraph readCOOGraph(const char *fileName)
+{
+
+	struct COOGraph cooGraph;
+
+	// Initialize fields
+	FILE *fp = fopen(fileName, "r");
+	uint32_t numNodes, numCols;
+	assert(fscanf(fp, "%u", &numNodes));
+	assert(fscanf(fp, "%u", &numCols));
+	if (numNodes == numCols) {
+		cooGraph.numNodes = numNodes;
+	} else {
+		PRINT_WARNING
+		    ("    Adjacency matrix is not square. Padding matrix to be square.");
+		cooGraph.numNodes = (numNodes > numCols) ? numNodes : numCols;
+	}
+	if (cooGraph.numNodes % 64 != 0) {
+		PRINT_WARNING
+		    ("    Adjacency matrix dimension is %u which is not a multiple of 64 nodes.",
+		     cooGraph.numNodes);
+		cooGraph.numNodes += (64 - cooGraph.numNodes % 64);
+		PRINT_WARNING
+		    ("        Padding to %u which is a multiple of 64 nodes.",
+		     cooGraph.numNodes);
+	}
+	assert(fscanf(fp, "%u", &cooGraph.numEdges));
+	cooGraph.nodeIdxs =
+	    (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t));
+	cooGraph.neighborIdxs =
+	    (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t));
+
+	// Read the neighborIdxs
+	for (uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) {
+		uint32_t nodeIdx;
+		assert(fscanf(fp, "%u", &nodeIdx));
+		cooGraph.nodeIdxs[edgeIdx] = nodeIdx;
+		uint32_t neighborIdx;
+		assert(fscanf(fp, "%u", &neighborIdx));
+		cooGraph.neighborIdxs[edgeIdx] = neighborIdx;
+	}
+
+	return cooGraph;
 
 }
 
-static void freeCOOGraph(struct COOGraph cooGraph) {
-    free(cooGraph.nodeIdxs);
-    free(cooGraph.neighborIdxs);
+static void freeCOOGraph(struct COOGraph cooGraph)
+{
+	free(cooGraph.nodeIdxs);
+	free(cooGraph.neighborIdxs);
 }
 
-static struct CSRGraph coo2csr(struct COOGraph cooGraph) {
-
-    struct CSRGraph csrGraph;
-
-    // Initialize fields
-    csrGraph.numNodes = cooGraph.numNodes;
-    csrGraph.numEdges = cooGraph.numEdges;
-    csrGraph.nodePtrs = (uint32_t*) calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1), sizeof(uint32_t));
-    csrGraph.neighborIdxs = (uint32_t*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrGraph.numEdges*sizeof(uint32_t)));
-
-    // Histogram nodeIdxs
-    for(uint32_t i = 0; i < cooGraph.numEdges; ++i) {
-        uint32_t nodeIdx = cooGraph.nodeIdxs[i];
-        csrGraph.nodePtrs[nodeIdx]++;
-    }
-
-    // Prefix sum nodePtrs
-    uint32_t sumBeforeNextNode = 0;
-    for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) {
-        uint32_t sumBeforeNode = sumBeforeNextNode;
-        sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx];
-        csrGraph.nodePtrs[nodeIdx] = sumBeforeNode;
-    }
-    csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode;
-
-    // Bin the neighborIdxs
-    for(uint32_t i = 0; i < cooGraph.numEdges; ++i) {
-        uint32_t nodeIdx = cooGraph.nodeIdxs[i];
-        uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++;
-        csrGraph.neighborIdxs[neighborListIdx] = cooGraph.neighborIdxs[i];
-    }
-
-    // Restore nodePtrs
-    for(uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) {
-        csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1];
-    }
-    csrGraph.nodePtrs[0] = 0;
-
-    return csrGraph;
+static struct CSRGraph coo2csr(struct COOGraph cooGraph)
+{
+
+	struct CSRGraph csrGraph;
+
+	// Initialize fields
+	csrGraph.numNodes = cooGraph.numNodes;
+	csrGraph.numEdges = cooGraph.numEdges;
+	csrGraph.nodePtrs =
+	    (uint32_t *)
+	    calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1),
+		   sizeof(uint32_t));
+	csrGraph.neighborIdxs =
+	    (uint32_t *)
+	    malloc(ROUND_UP_TO_MULTIPLE_OF_8
+		   (csrGraph.numEdges * sizeof(uint32_t)));
+
+	// Histogram nodeIdxs
+	for (uint32_t i = 0; i < cooGraph.numEdges; ++i) {
+		uint32_t nodeIdx = cooGraph.nodeIdxs[i];
+		csrGraph.nodePtrs[nodeIdx]++;
+	}
+
+	// Prefix sum nodePtrs
+	uint32_t sumBeforeNextNode = 0;
+	for (uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) {
+		uint32_t sumBeforeNode = sumBeforeNextNode;
+		sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx];
+		csrGraph.nodePtrs[nodeIdx] = sumBeforeNode;
+	}
+	csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode;
+
+	// Bin the neighborIdxs
+	for (uint32_t i = 0; i < cooGraph.numEdges; ++i) {
+		uint32_t nodeIdx = cooGraph.nodeIdxs[i];
+		uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++;
+		csrGraph.neighborIdxs[neighborListIdx] =
+		    cooGraph.neighborIdxs[i];
+	}
+
+	// Restore nodePtrs
+	for (uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) {
+		csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1];
+	}
+	csrGraph.nodePtrs[0] = 0;
+
+	return csrGraph;
 
 }
 
-static void freeCSRGraph(struct CSRGraph csrGraph) {
-    free(csrGraph.nodePtrs);
-    free(csrGraph.neighborIdxs);
+static void freeCSRGraph(struct CSRGraph csrGraph)
+{
+	free(csrGraph.nodePtrs);
+	free(csrGraph.neighborIdxs);
 }
 
 #endif
-
diff --git a/BFS/support/params.h b/BFS/support/params.h
index 960a0d0..f9169bc 100644
--- a/BFS/support/params.h
+++ b/BFS/support/params.h
@@ -5,54 +5,63 @@
 #include "common.h"
 #include "utils.h"
 
-static void usage() {
-    PRINT(  "\nUsage:  ./program [options]"
-            "\n"
-            "\nBenchmark-specific options:"
-            "\n    -f <F>    input matrix file name (default=data/roadNet-CA.txt)"
-            "\n"
-            "\nGeneral options:"
-            "\n    -v <V>    verbosity"
-            "\n    -h        help"
-            "\n\n");
+static void usage()
+{
+	PRINT("\nUsage:  ./program [options]"
+	      "\n"
+	      "\nBenchmark-specific options:"
+	      "\n    -f <F>    input matrix file name (default=data/roadNet-CA.txt)"
+	      "\n"
+	      "\nGeneral options:"
+	      "\n    -v <V>    verbosity" "\n    -h        help" "\n\n");
 }
 
 typedef struct Params {
-  const char* fileName;
-  unsigned int verbosity;
+	const char *fileName;
+	unsigned int verbosity;
 #if NUMA
-  struct bitmask* bitmask_in;
-  int numa_node_cpu;
+	struct bitmask *bitmask_in;
+	int numa_node_cpu;
 #endif
 } Params;
 
-static struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.fileName      = "data/roadNet-CA.txt";
-    p.verbosity     = 0;
+static struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.fileName = "data/roadNet-CA.txt";
+	p.verbosity = 0;
 #if NUMA
-    p.bitmask_in = NULL;
-    p.numa_node_cpu = -1;
+	p.bitmask_in = NULL;
+	p.numa_node_cpu = -1;
 #endif
-    int opt;
-    while((opt = getopt(argc, argv, "f:v:hA:C:")) >= 0) {
-        switch(opt) {
-            case 'f': p.fileName    = optarg;       break;
-            case 'v': p.verbosity   = atoi(optarg); break;
+	int opt;
+	while ((opt = getopt(argc, argv, "f:v:hA:C:")) >= 0) {
+		switch (opt) {
+		case 'f':
+			p.fileName = optarg;
+			break;
+		case 'v':
+			p.verbosity = atoi(optarg);
+			break;
 #if NUMA
-            case 'A': p.bitmask_in    = numa_parse_nodestring(optarg); break;
-            case 'C': p.numa_node_cpu = atoi(optarg); break;
+		case 'A':
+			p.bitmask_in = numa_parse_nodestring(optarg);
+			break;
+		case 'C':
+			p.numa_node_cpu = atoi(optarg);
+			break;
 #endif
-            case 'h': usage(); exit(0);
-            default:
-                      PRINT_ERROR("Unrecognized option!");
-                      usage();
-                      exit(0);
-        }
-    }
+		case 'h':
+			usage();
+			exit(0);
+		default:
+			PRINT_ERROR("Unrecognized option!");
+			usage();
+			exit(0);
+		}
+	}
 
-    return p;
+	return p;
 }
 
 #endif
-
diff --git a/BFS/support/timer.h b/BFS/support/timer.h
index 466ad95..63b5567 100644
--- a/BFS/support/timer.h
+++ b/BFS/support/timer.h
@@ -6,22 +6,26 @@
 #include <sys/time.h>
 
 typedef struct Timer {
-    struct timeval startTime[8];
-    struct timeval stopTime[8];
-    double         time[8];
+	struct timeval startTime[8];
+	struct timeval stopTime[8];
+	double time[8];
 } Timer;
 
-static void startTimer(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
+static void startTimer(Timer *timer, int i, int rep)
+{
+	if (rep == 0) {
+		timer->time[i] = 0.0;
+	}
+	gettimeofday(&timer->startTime[i], NULL);
 }
 
-static void stopTimer(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+static void stopTimer(Timer *timer, int i)
+{
+	gettimeofday(&timer->stopTime[i], NULL);
+	timer->time[i] +=
+	    (timer->stopTime[i].tv_sec -
+	     timer->startTime[i].tv_sec) * 1000000.0 +
+	    (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
 }
 
 #endif
diff --git a/BFS/support/utils.h b/BFS/support/utils.h
index ddb1e2c..ccd8fbd 100644
--- a/BFS/support/utils.h
+++ b/BFS/support/utils.h
@@ -8,4 +8,3 @@
 #define PRINT(fmt, ...)             printf(fmt "\n", ##__VA_ARGS__)
 
 #endif
-
author	Birte Kristina Friesel <birte.friesel@uos.de>	2025-01-16 08:04:02 +0100
committer	Birte Kristina Friesel <birte.friesel@uos.de>	2025-01-16 08:04:02 +0100
commit	4afee0981b81b09104bba9745d09795b85957f27 (patch)
tree	90d41a977613b983c9bec29c10956d5af20dad24 /BFS
parent	f0ec6cec9304ca0dd558cf8ec5777d0bb2da4307 (diff)