Diffstat (limited to 'BFS')
-rw-r--r--  BFS/Makefile                                              36
-rw-r--r--  BFS/baselines/cpu/Makefile                                26
-rw-r--r--  BFS/baselines/cpu/app.c                                   41
-rwxr-xr-x  BFS/benchmark-scripts/ccmcc25-sim.sh                      27
-rwxr-xr-x  BFS/benchmark-scripts/ccmcc25.sh                          32
-rw-r--r--  BFS/dpu/dpu-utils.h                                       61
-rw-r--r--  BFS/dpu/task.c                                           276
-rw-r--r--  BFS/host/app.c                                           750
-rw-r--r--  BFS/host/mram-management.h                                48
-rw-r--r--  BFS/include/common.h                                      25
-rw-r--r--  BFS/include/dfatool_host.ah                               30
-rw-r--r--  BFS/include/graph.h                                      133
-rw-r--r--  BFS/include/params.h                                      67
-rw-r--r--  BFS/include/timer.h                                        8
-rw-r--r--  BFS/include/utils.h (renamed from BFS/support/utils.h)     1
-rwxr-xr-x  BFS/run-paper-strong-full.sh                              23
-rwxr-xr-x  BFS/run-paper-strong-rank.sh                              23
-rwxr-xr-x  BFS/run-paper-weak.sh                                     24
-rwxr-xr-x  BFS/run.sh                                                25
-rw-r--r--  BFS/support/common.h                                      26
-rw-r--r--  BFS/support/graph.h                                      116
-rw-r--r--  BFS/support/params.h                                      46
-rw-r--r--  BFS/support/timer.h                                       34
23 files changed, 1058 insertions(+), 820 deletions(-)
diff --git a/BFS/Makefile b/BFS/Makefile
index a4ea69d..a773b38 100644
--- a/BFS/Makefile
+++ b/BFS/Makefile
@@ -1,17 +1,37 @@
NR_DPUS ?= 1
NR_TASKLETS ?= 16
+WITH_ALLOC_OVERHEAD ?= 0
+WITH_LOAD_OVERHEAD ?= 0
+WITH_FREE_OVERHEAD ?= 0
-COMMON_INCLUDES := support
HOST_SOURCES := $(wildcard host/*.c)
DPU_SOURCES := $(wildcard dpu/*.c)
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS}
+aspectc ?= 0
+aspectc_timing ?= 0
+dfatool_timing ?= 1
+
+HOST_CC := ${CC}
+
+COMMON_FLAGS := -Wall -Wextra -g -Iinclude
+HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
+ifeq (${aspectc_timing}, 1)
+ ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah
+endif
+
+ASPECTC_HOST_FLAGS ?= -a0
+
+ifeq (${aspectc}, 1)
+ HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler
+else
+ HOST_FLAGS += -std=c11
+endif
+
QUIET = @
-ifdef verbose
+ifeq (${verbose}, 1)
QUIET =
endif
@@ -20,11 +40,13 @@ all: bin/host_code bin/dpu_code
bin:
${QUIET}mkdir -p bin
-bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+bin/dpu_code: ${DPU_SOURCES} include bin
${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
-bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
- ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+bin/host_code: ${HOST_SOURCES} include bin
+ ${QUIET}cp ../include/dfatool_host_dpu.ah include
+ ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+ ${QUIET}rm -f include/dfatool_host_dpu.ah
clean:
${QUIET}rm -rf bin
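
Note: the new build knobs are plain make variables. An illustrative invocation (flag values are examples; the second form is the one used by the ccmcc25 benchmark scripts below):

    # include alloc/load/free overhead in the reported latencies
    make -B NR_DPUS=64 NR_TASKLETS=16 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1
    bin/host_code -f data/roadNet-CA.txt

    # AspectC++ timing instead of dfatool printf timing
    make -B NR_DPUS=64 NR_TASKLETS=16 aspectc=1 aspectc_timing=1 dfatool_timing=0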
diff --git a/BFS/baselines/cpu/Makefile b/BFS/baselines/cpu/Makefile
index 1f6ed3c..1efe457 100644
--- a/BFS/baselines/cpu/Makefile
+++ b/BFS/baselines/cpu/Makefile
@@ -1,8 +1,26 @@
-.PHONY: all
-all: bfs
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+
+LDFLAGS =
+CFLAGS =
+
+ifeq (${debug}, 1)
+ CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+ LDFLAGS += -lnuma
+endif
bfs: app.c
- gcc -Wall -Wextra -pedantic -march=native -O2 -o bfs -fopenmp app.c
+ gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -o bfs -fopenmp app.c ${LDFLAGS}
bfs_O0: app.c
gcc -o bfs_O0 -fopenmp app.c
@@ -27,3 +45,5 @@ run_O2: bfs_O2
.PHONY: clean
clean:
rm -f bfs bfs_O0 bfs_O2
+
+.PHONY: all
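
Note: the CPU baseline follows the same pattern. numa_memcpy has no default in this Makefile, so passing it explicitly avoids an empty -DNUMA_MEMCPY= define. Values below are illustrative, and the -f option is assumed to behave as in the PIM host app:

    make bfs numa=1 numa_memcpy=0 nop_sync=0 benchmark=1
    ./bfs -f data/roadNet-CA.txt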
diff --git a/BFS/baselines/cpu/app.c b/BFS/baselines/cpu/app.c
index caf4cbc..390b1f9 100644
--- a/BFS/baselines/cpu/app.c
+++ b/BFS/baselines/cpu/app.c
@@ -8,12 +8,30 @@
#include <omp.h>
+#if NUMA
+#include <numaif.h>
+#include <numa.h>
+
+void* mp_pages[1];
+int mp_status[1];
+int mp_nodes[1];
+struct bitmask* bitmask_in;
+int numa_node_in = -1;
+int numa_node_cpu = -1;
+#endif
+
#include "../../support/common.h"
#include "../../support/graph.h"
#include "../../support/params.h"
-#include "../../support/timer.h"
#include "../../support/utils.h"
+#if WITH_BENCHMARK
+#include "../../support/timer.h"
+#else
+#define startTimer(...)
+#define stopTimer(...)
+#endif
+
int main(int argc, char** argv) {
// Process parameters
@@ -24,8 +42,9 @@ int main(int argc, char** argv) {
struct COOGraph cooGraph = readCOOGraph(p.fileName);
PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
-
+#if WITH_BENCHMARK
Timer timer;
+#endif
for(int rep = 0; rep < 100; rep++) {
struct CSRGraph csrGraph = coo2csr(cooGraph);
@@ -43,6 +62,12 @@ int main(int argc, char** argv) {
uint32_t* prevFrontier = buffer1;
uint32_t* currFrontier = buffer2;
+#if NOP_SYNC
+ for(int rep = 0; rep < 200000; rep++) {
+ asm volatile("nop" ::);
+ }
+#endif
+
// Calculating result on CPU
startTimer(&timer, 0, 0);
nodeLevel[srcNode] = 0;
@@ -86,6 +111,12 @@ int main(int argc, char** argv) {
}
stopTimer(&timer, 0);
+#if NOP_SYNC
+ for(int rep = 0; rep < 200000; rep++) {
+ asm volatile("nop" ::);
+ }
+#endif
+
freeCSRGraph(csrGraph);
free(buffer1);
free(buffer2);
@@ -135,6 +166,7 @@ int main(int argc, char** argv) {
}
stopTimer(&timer, 1);
+#if WITH_BENCHMARK
unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
@@ -158,8 +190,11 @@ int main(int argc, char** argv) {
printf(" throughput_seq_MOpps=%f throughput_MOpps=%f",
csrGraph.numNodes / timer.time[1],
csrGraph.numNodes / timer.time[0]);
- printAll(&timer, 1);
+ printf(" latency_us=%f latency_seq_us=%f\n",
+ timer.time[0],
+ timer.time[1]);
}
+#endif // WITH_BENCHMARK
freeCSRGraph(csrGraph);
free(nodeLevel);
diff --git a/BFS/benchmark-scripts/ccmcc25-sim.sh b/BFS/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..bcbe284
--- /dev/null
+++ b/BFS/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+ local "$@"
+ set -e
+ make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+ aspectc=1 aspectc_timing=1 dfatool_timing=0
+ bin/host_code -f ${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+
+source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator
+
+echo "prim-benchmarks BFS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt
+
+# BFS does not support repeated kernel invocations → repeat it here
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} \
+ ::: i $(seq 0 4) \
+ ::: data data/roadNet-CA.txt data/loc-gowalla_edges.txt \
+ ::: nr_dpus 1 2 4 8 16 32 48 64 \
+>> ${fn}.txt
diff --git a/BFS/benchmark-scripts/ccmcc25.sh b/BFS/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..0dcf4bb
--- /dev/null
+++ b/BFS/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+run_benchmark_nmc() {
+ local "$@"
+ set -e
+ sudo limit_ranks_to_numa_node ${numa_rank}
+ make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \
+ aspectc=1 aspectc_timing=1 dfatool_timing=0
+ bin/host_code -f ${data} 2>&1
+}
+
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+
+ fn=log/$(hostname)/ccmcc25-sdk${sdk}
+
+ source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+
+ echo "prim-benchmarks BFS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt
+
+ # BFS does not support repeated kernel invocations → repeat it here
+ parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any data={data} \
+ ::: i $(seq 0 10) \
+ ::: data data/roadNet-CA.txt data/loc-gowalla_edges.txt \
+ ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+ >> ${fn}.txt
+
+done
diff --git a/BFS/dpu/dpu-utils.h b/BFS/dpu/dpu-utils.h
index b02c073..dc986d2 100644
--- a/BFS/dpu/dpu-utils.h
+++ b/BFS/dpu/dpu-utils.h
@@ -6,39 +6,46 @@
#define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m "fmt"\n", ##__VA_ARGS__)
-static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
- mram_read((__mram_ptr void const*)(ptr_m + idx*sizeof(uint64_t)), cache_w, 8);
- return cache_w[0];
+static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w)
+{
+ mram_read((__mram_ptr void const *)(ptr_m + idx * sizeof(uint64_t)),
+ cache_w, 8);
+ return cache_w[0];
}
-static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
- cache_w[0] = val;
- mram_write(cache_w, (__mram_ptr void*)(ptr_m + idx*sizeof(uint64_t)), 8);
+static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx,
+ uint64_t *cache_w)
+{
+ cache_w[0] = val;
+ mram_write(cache_w, (__mram_ptr void *)(ptr_m + idx * sizeof(uint64_t)),
+ 8);
}
-static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
- // Load 8B
- uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t);
- uint32_t offset = ((uint32_t)ptr_idx_m)%8;
- uint32_t ptr_block_m = ptr_idx_m - offset;
- mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8);
- // Extract 4B
- uint32_t* cache_32_w = (uint32_t*) cache_w;
- return cache_32_w[offset/4];
+static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w)
+{
+ // Load 8B
+ uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t);
+ uint32_t offset = ((uint32_t) ptr_idx_m) % 8;
+ uint32_t ptr_block_m = ptr_idx_m - offset;
+ mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8);
+ // Extract 4B
+ uint32_t *cache_32_w = (uint32_t *) cache_w;
+ return cache_32_w[offset / 4];
}
-static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
- // Load 8B
- uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t);
- uint32_t offset = ((uint32_t)ptr_idx_m)%8;
- uint32_t ptr_block_m = ptr_idx_m - offset;
- mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8);
- // Modify 4B
- uint32_t* cache_32_w = (uint32_t*) cache_w;
- cache_32_w[offset/4] = val;
- // Write back 8B
- mram_write(cache_w, (__mram_ptr void*)ptr_block_m, 8);
+static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx,
+ uint64_t *cache_w)
+{
+ // Load 8B
+ uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t);
+ uint32_t offset = ((uint32_t) ptr_idx_m) % 8;
+ uint32_t ptr_block_m = ptr_idx_m - offset;
+ mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8);
+ // Modify 4B
+ uint32_t *cache_32_w = (uint32_t *) cache_w;
+ cache_32_w[offset / 4] = val;
+ // Write back 8B
+ mram_write(cache_w, (__mram_ptr void *)ptr_block_m, 8);
}
#endif
-
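
Note: MRAM is accessed in 8-byte-aligned transfers, so load4B/store4B bounce every 4-byte access through an 8-byte WRAM word and pick out the requested half. A minimal usage sketch, following the call pattern in dpu/task.c (variable names illustrative):

    uint64_t *cache_w = mem_alloc(sizeof(uint64_t));     // per-tasklet 8B bounce buffer in WRAM
    uint32_t ptr = load4B(nodePtrs_m, node, cache_w);    // read nodePtrs[node] from MRAM
    store4B(level, nodeLevel_m, node, cache_w);          // write nodeLevel[node] back to MRAM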
diff --git a/BFS/dpu/task.c b/BFS/dpu/task.c
index 43a2d0f..5275047 100644
--- a/BFS/dpu/task.c
+++ b/BFS/dpu/task.c
@@ -12,7 +12,7 @@
#include <perfcounter.h>
#include "dpu-utils.h"
-#include "../support/common.h"
+#include "common.h"
BARRIER_INIT(my_barrier, NR_TASKLETS);
@@ -20,127 +20,155 @@ BARRIER_INIT(bfsBarrier, NR_TASKLETS);
MUTEX_INIT(nextFrontierMutex);
// main
-int main() {
-
- if(me() == 0) {
- mem_reset(); // Reset the heap
- }
- // Barrier
- barrier_wait(&my_barrier);
-
- // Load parameters
- uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
- struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
- mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-
- // Extract parameters
- uint32_t numGlobalNodes = params_w->numNodes;
- uint32_t startNodeIdx = params_w->dpuStartNodeIdx;
- uint32_t numNodes = params_w->dpuNumNodes;
- uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset;
- uint32_t level = params_w->level;
- uint32_t nodePtrs_m = params_w->dpuNodePtrs_m;
- uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m;
- uint32_t nodeLevel_m = params_w->dpuNodeLevel_m;
- uint32_t visited_m = params_w->dpuVisited_m;
- uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m;
- uint32_t nextFrontier_m = params_w->dpuNextFrontier_m;
-
- if(numNodes > 0) {
-
- // Sanity check
- if(me() == 0) {
- if(numGlobalNodes%64 != 0) {
- //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!");
- }
- if(startNodeIdx%64 != 0 || numNodes%64 != 0) {
- //PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!");
- }
- }
-
- // Allocate WRAM cache for each tasklet to use throughout
- uint64_t* cache_w = mem_alloc(sizeof(uint64_t));
-
- // Update current frontier and visited list based on the next frontier from the previous iteration
- for(uint32_t nodeTileIdx = me(); nodeTileIdx < numGlobalNodes/64; nodeTileIdx += NR_TASKLETS) {
-
- // Get the next frontier tile from MRAM
- uint64_t nextFrontierTile = load8B(nextFrontier_m, nodeTileIdx, cache_w);
-
- // Process next frontier tile if it is not empty
- if(nextFrontierTile) {
-
- // Mark everything that was previously added to the next frontier as visited
- uint64_t visitedTile = load8B(visited_m, nodeTileIdx, cache_w);
- visitedTile |= nextFrontierTile;
- store8B(visitedTile, visited_m, nodeTileIdx, cache_w);
-
- // Clear the next frontier
- store8B(0, nextFrontier_m, nodeTileIdx, cache_w);
-
- }
-
- // Extract the current frontier from the previous next frontier and update node levels
- uint32_t startTileIdx = startNodeIdx/64;
- uint32_t numTiles = numNodes/64;
- if(startTileIdx <= nodeTileIdx && nodeTileIdx < startTileIdx + numTiles) {
-
- // Update current frontier
- store8B(nextFrontierTile, currentFrontier_m, nodeTileIdx - startTileIdx, cache_w);
-
- // Update node levels
- if(nextFrontierTile) {
- for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
- if(isSet(nextFrontierTile, node%64)) {
- store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks
- }
- }
- }
- }
-
- }
-
- // Wait until all tasklets have updated the current frontier
- barrier_wait(&bfsBarrier);
-
- // Identify tasklet's nodes
- uint32_t numNodesPerTasklet = (numNodes + NR_TASKLETS - 1)/NR_TASKLETS;
- uint32_t taskletNodesStart = me()*numNodesPerTasklet;
- uint32_t taskletNumNodes;
- if(taskletNodesStart > numNodes) {
- taskletNumNodes = 0;
- } else if(taskletNodesStart + numNodesPerTasklet > numNodes) {
- taskletNumNodes = numNodes - taskletNodesStart;
- } else {
- taskletNumNodes = numNodesPerTasklet;
- }
-
- // Visit neighbors of the current frontier
- mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex);
- for(uint32_t node = taskletNodesStart; node < taskletNodesStart + taskletNumNodes; ++node) {
- uint32_t nodeTileIdx = node/64;
- uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile
- if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier
- // Visit its neighbors
- uint32_t nodePtr = load4B(nodePtrs_m, node, cache_w) - nodePtrsOffset;
- uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr
- for(uint32_t i = nodePtr; i < nextNodePtr; ++i) {
- uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader
- uint32_t neighborTileIdx = neighbor/64;
- uint64_t visitedTile = load8B(visited_m, neighborTileIdx, cache_w);
- if(!isSet(visitedTile, neighbor%64)) { // Neighbor not previously visited
- // Add neighbor to next frontier
- mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention
- uint64_t nextFrontierTile = load8B(nextFrontier_m, neighborTileIdx, cache_w);
- setBit(nextFrontierTile, neighbor%64);
- store8B(nextFrontierTile, nextFrontier_m, neighborTileIdx, cache_w);
- mutex_unlock(mutexID);
- }
- }
- }
- }
-
- }
-
- return 0;
+int main()
+{
+
+ if (me() == 0) {
+ mem_reset(); // Reset the heap
+ }
+ // Barrier
+ barrier_wait(&my_barrier);
+
+ // Load parameters
+ uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+ struct DPUParams *params_w =
+ (struct DPUParams *)
+ mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+ mram_read((__mram_ptr void const *)params_m, params_w,
+ ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+
+ // Extract parameters
+ uint32_t numGlobalNodes = params_w->numNodes;
+ uint32_t startNodeIdx = params_w->dpuStartNodeIdx;
+ uint32_t numNodes = params_w->dpuNumNodes;
+ uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset;
+ uint32_t level = params_w->level;
+ uint32_t nodePtrs_m = params_w->dpuNodePtrs_m;
+ uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m;
+ uint32_t nodeLevel_m = params_w->dpuNodeLevel_m;
+ uint32_t visited_m = params_w->dpuVisited_m;
+ uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m;
+ uint32_t nextFrontier_m = params_w->dpuNextFrontier_m;
+
+ if (numNodes > 0) {
+
+ // Sanity check
+ if (me() == 0) {
+ if (numGlobalNodes % 64 != 0) {
+ //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!");
+ }
+ if (startNodeIdx % 64 != 0 || numNodes % 64 != 0) {
+ //PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!");
+ }
+ }
+ // Allocate WRAM cache for each tasklet to use throughout
+ uint64_t *cache_w = mem_alloc(sizeof(uint64_t));
+
+ // Update current frontier and visited list based on the next frontier from the previous iteration
+ for (uint32_t nodeTileIdx = me();
+ nodeTileIdx < numGlobalNodes / 64;
+ nodeTileIdx += NR_TASKLETS) {
+
+ // Get the next frontier tile from MRAM
+ uint64_t nextFrontierTile =
+ load8B(nextFrontier_m, nodeTileIdx, cache_w);
+
+ // Process next frontier tile if it is not empty
+ if (nextFrontierTile) {
+
+ // Mark everything that was previously added to the next frontier as visited
+ uint64_t visitedTile =
+ load8B(visited_m, nodeTileIdx, cache_w);
+ visitedTile |= nextFrontierTile;
+ store8B(visitedTile, visited_m, nodeTileIdx,
+ cache_w);
+
+ // Clear the next frontier
+ store8B(0, nextFrontier_m, nodeTileIdx,
+ cache_w);
+
+ }
+ // Extract the current frontier from the previous next frontier and update node levels
+ uint32_t startTileIdx = startNodeIdx / 64;
+ uint32_t numTiles = numNodes / 64;
+ if (startTileIdx <= nodeTileIdx
+ && nodeTileIdx < startTileIdx + numTiles) {
+
+ // Update current frontier
+ store8B(nextFrontierTile, currentFrontier_m,
+ nodeTileIdx - startTileIdx, cache_w);
+
+ // Update node levels
+ if (nextFrontierTile) {
+ for (uint32_t node = nodeTileIdx * 64;
+ node < (nodeTileIdx + 1) * 64;
+ ++node) {
+ if (isSet
+ (nextFrontierTile,
+ node % 64)) {
+ store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks
+ }
+ }
+ }
+ }
+
+ }
+
+ // Wait until all tasklets have updated the current frontier
+ barrier_wait(&bfsBarrier);
+
+ // Identify tasklet's nodes
+ uint32_t numNodesPerTasklet =
+ (numNodes + NR_TASKLETS - 1) / NR_TASKLETS;
+ uint32_t taskletNodesStart = me() * numNodesPerTasklet;
+ uint32_t taskletNumNodes;
+ if (taskletNodesStart > numNodes) {
+ taskletNumNodes = 0;
+ } else if (taskletNodesStart + numNodesPerTasklet > numNodes) {
+ taskletNumNodes = numNodes - taskletNodesStart;
+ } else {
+ taskletNumNodes = numNodesPerTasklet;
+ }
+
+ // Visit neighbors of the current frontier
+ mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex);
+ for (uint32_t node = taskletNodesStart;
+ node < taskletNodesStart + taskletNumNodes; ++node) {
+ uint32_t nodeTileIdx = node / 64;
+ uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile
+ if (isSet(currentFrontierTile, node % 64)) { // If the node is in the current frontier
+ // Visit its neighbors
+ uint32_t nodePtr =
+ load4B(nodePtrs_m, node,
+ cache_w) - nodePtrsOffset;
+ uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr
+ for (uint32_t i = nodePtr; i < nextNodePtr; ++i) {
+ uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader
+ uint32_t neighborTileIdx =
+ neighbor / 64;
+ uint64_t visitedTile =
+ load8B(visited_m, neighborTileIdx,
+ cache_w);
+ if (!isSet(visitedTile, neighbor % 64)) { // Neighbor not previously visited
+ // Add neighbor to next frontier
+ mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention
+ uint64_t nextFrontierTile =
+ load8B(nextFrontier_m,
+ neighborTileIdx,
+ cache_w);
+ setBit(nextFrontierTile,
+ neighbor % 64);
+ store8B(nextFrontierTile,
+ nextFrontier_m,
+ neighborTileIdx,
+ cache_w);
+ mutex_unlock(mutexID);
+ }
+ }
+ }
+ }
+
+ }
+
+ return 0;
}
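
Note: the frontiers and the visited set are bit vectors stored as 64-bit tiles, one bit per node; node n is bit n % 64 of tile n / 64. A compact sketch of the convention shared by task.c and the host code:

    uint64_t nextFrontier[numNodes / 64];                      // one bit per node
    setBit(nextFrontier[n / 64], n % 64);                      // add node n to the next frontier
    if (isSet(visited[n / 64], n % 64)) { /* n already visited */ }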
diff --git a/BFS/host/app.c b/BFS/host/app.c
index 54b9cdc..4431193 100644
--- a/BFS/host/app.c
+++ b/BFS/host/app.c
@@ -3,9 +3,24 @@
* BFS Host Application Source File
*
*/
+#if ASPECTC
+extern "C" {
+#endif
+
#include <dpu.h>
#include <dpu_log.h>
+#ifndef ENERGY
+#define ENERGY 0
+#endif
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+#if ASPECTC
+}
+#endif
+
#include <assert.h>
#include <getopt.h>
#include <stdio.h>
@@ -14,321 +29,436 @@
#include <unistd.h>
#include "mram-management.h"
-#include "../support/common.h"
-#include "../support/graph.h"
-#include "../support/params.h"
-#include "../support/timer.h"
-#include "../support/utils.h"
+#include "common.h"
+#include "graph.h"
+#include "params.h"
+#include "timer.h"
+#include "utils.h"
-#ifndef ENERGY
-#define ENERGY 0
+#define DPU_BINARY "./bin/dpu_code"
+
+// Main of the Host Application
+int main(int argc, char **argv)
+{
+
+ // Process parameters
+ struct Params p = input_params(argc, argv);
+
+ // Timer and profiling
+ Timer timer;
+#if ENERGY
+ struct dpu_probe_t probe;
+ DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+ double tenergy = 0;
+#endif
+
+ // Allocate DPUs and load binary
+ struct dpu_set_t dpu_set, dpu;
+ uint32_t numDPUs, numRanks;
+
+#if WITH_ALLOC_OVERHEAD
+ startTimer(&timer, 0, 0);
+#endif
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+#if WITH_ALLOC_OVERHEAD
+ stopTimer(&timer, 0);
+#else
+ zeroTimer(&timer, 0);
+#endif
+
+#if WITH_LOAD_OVERHEAD
+ startTimer(&timer, 1, 0);
+#endif
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+#if WITH_LOAD_OVERHEAD
+	stopTimer(&timer, 1);
+#else
+ zeroTimer(&timer, 1);
+#endif
+
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+ assert(NR_DPUS == numDPUs);
+ PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+ // Initialize BFS data structures
+ PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
+ struct COOGraph cooGraph = readCOOGraph(p.fileName);
+ PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges",
+ cooGraph.numNodes, cooGraph.numEdges);
+ struct CSRGraph csrGraph = coo2csr(cooGraph);
+ uint32_t numNodes = csrGraph.numNodes;
+ uint32_t *nodePtrs = csrGraph.nodePtrs;
+ uint32_t *neighborIdxs = csrGraph.neighborIdxs;
+ uint32_t *nodeLevel = (uint32_t*)calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
+ uint64_t *visited = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node
+ uint64_t *currentFrontier = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node
+ uint64_t *nextFrontier = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node
+ setBit(nextFrontier[0], 0); // Initialize frontier to first node
+ uint32_t level = 1;
+
+ // Partition data structure across DPUs
+ uint32_t numNodesPerDPU =
+ ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1) / numDPUs + 1);
+ PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU",
+ numNodesPerDPU);
+ struct DPUParams dpuParams[numDPUs];
+ uint32_t dpuParams_m[numDPUs];
+ unsigned int dpuIdx = 0;
+ unsigned int t0ini = 0;
+ unsigned int t1ini = 0;
+ unsigned int t2ini = 0;
+ unsigned int t3ini = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+
+ // Allocate parameters
+ struct mram_heap_allocator_t allocator;
+ init_allocator(&allocator);
+ dpuParams_m[dpuIdx] =
+ mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+ // Find DPU's nodes
+ uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+ uint32_t dpuNumNodes;
+ if (dpuStartNodeIdx > numNodes) {
+ dpuNumNodes = 0;
+ } else if (dpuStartNodeIdx + numNodesPerDPU > numNodes) {
+ dpuNumNodes = numNodes - dpuStartNodeIdx;
+ } else {
+ dpuNumNodes = numNodesPerDPU;
+ }
+ dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
+ PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
+ PRINT_INFO(p.verbosity >= 2, " Receives %u nodes",
+ dpuNumNodes);
+
+ // Partition edges and copy data
+ if (dpuNumNodes > 0) {
+
+ // Find DPU's CSR graph partition
+ uint32_t *dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
+ uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
+ uint32_t *dpuNeighborIdxs_h =
+ neighborIdxs + dpuNodePtrsOffset;
+ uint32_t dpuNumNeighbors =
+ dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
+ uint32_t *dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
+
+ // Allocate MRAM
+ uint32_t dpuNodePtrs_m =
+ mram_heap_alloc(&allocator,
+ (dpuNumNodes +
+ 1) * sizeof(uint32_t));
+ uint32_t dpuNeighborIdxs_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNeighbors * sizeof(uint32_t));
+ uint32_t dpuNodeLevel_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNodes * sizeof(uint32_t));
+ uint32_t dpuVisited_m =
+ mram_heap_alloc(&allocator,
+ numNodes / 64 * sizeof(uint64_t));
+ uint32_t dpuCurrentFrontier_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNodes / 64 *
+ sizeof(uint64_t));
+ uint32_t dpuNextFrontier_m =
+ mram_heap_alloc(&allocator,
+ numNodes / 64 * sizeof(uint64_t));
+ PRINT_INFO(p.verbosity >= 2,
+ " Total memory allocated is %d bytes",
+ allocator.totalAllocated);
+
+ // Set up DPU parameters
+ dpuParams[dpuIdx].numNodes = numNodes;
+ dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
+ dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
+ dpuParams[dpuIdx].level = level;
+ dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
+ dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
+ dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
+ dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
+ dpuParams[dpuIdx].dpuCurrentFrontier_m =
+ dpuCurrentFrontier_m;
+ dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
+
+ // Send data to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying data to DPU");
+ startTimer(&timer, 2, t0ini++);
+
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuNodePtrs_m, (uint8_t *) dpuNodePtrs_h,
+ ROUND_UP_TO_MULTIPLE_OF_8((dpuNumNodes + 1) * sizeof(uint32_t))));
+
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuNeighborIdxs_m, (uint8_t *) dpuNeighborIdxs_h,
+ ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNeighbors * sizeof(uint32_t))));
+
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuNodeLevel_m, (uint8_t *) dpuNodeLevel_h,
+ ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNodes * sizeof(uint32_t))));
+
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuVisited_m, (uint8_t *) visited,
+ ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuNextFrontier_m, (uint8_t *) nextFrontier,
+ ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+
+ // NOTE: No need to copy current frontier because it is written before being read
+ stopTimer(&timer, 2);
+ //loadTime += getElapsedTime(timer);
+
+ }
+ // Send parameters to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying parameters to DPU");
+ startTimer(&timer, 2, t1ini++);
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams_m[dpuIdx], (uint8_t *) & dpuParams[dpuIdx],
+ ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))));
+ stopTimer(&timer, 2);
+ //loadTime += getElapsedTime(timer);
+
+ ++dpuIdx;
+
+ }
+
+ // Iterate until next frontier is empty
+ uint32_t nextFrontierEmpty = 0;
+ while (!nextFrontierEmpty) {
+
+ PRINT_INFO(p.verbosity >= 1,
+ "Processing current frontier for level %u", level);
+
+#if ENERGY
+ DPU_ASSERT(dpu_probe_start(&probe));
#endif
+ // Run all DPUs
+ PRINT_INFO(p.verbosity >= 1, " Booting DPUs");
+ startTimer(&timer, 3, t2ini++);
+ DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+ stopTimer(&timer, 3);
+ //dpuTime += getElapsedTime(timer);
#if ENERGY
-#include <dpu_probe.h>
+ DPU_ASSERT(dpu_probe_stop(&probe));
+ double energy;
+ DPU_ASSERT(dpu_probe_get
+ (&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+ tenergy += energy;
#endif
-#define DPU_BINARY "./bin/dpu_code"
+ // Copy back next frontier from all DPUs and compute their union as the current frontier
+ startTimer(&timer, 4, t3ini++);
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+ if (dpuNumNodes > 0) {
+ if (dpuIdx == 0) {
+ DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams[dpuIdx].dpuNextFrontier_m,
+ (uint8_t *) currentFrontier,
+ ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+ } else {
+ DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams[dpuIdx].dpuNextFrontier_m,
+ (uint8_t *) nextFrontier,
+ ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+ for (uint32_t i = 0; i < numNodes / 64;
+ ++i) {
+ currentFrontier[i] |=
+ nextFrontier[i];
+ }
+ }
+ ++dpuIdx;
+ }
+ }
+
+ // Check if the next frontier is empty, and copy data to DPU if not empty
+ nextFrontierEmpty = 1;
+ for (uint32_t i = 0; i < numNodes / 64; ++i) {
+ if (currentFrontier[i]) {
+ nextFrontierEmpty = 0;
+ break;
+ }
+ }
+ if (!nextFrontierEmpty) {
+ ++level;
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ uint32_t dpuNumNodes =
+ dpuParams[dpuIdx].dpuNumNodes;
+ if (dpuNumNodes > 0) {
+ // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams[dpuIdx].dpuNextFrontier_m,
+ (uint8_t *) currentFrontier,
+ ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t))));
+ // Copy new level to DPU
+ dpuParams[dpuIdx].level = level;
+ DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams_m[dpuIdx], (uint8_t *) &dpuParams[dpuIdx],
+ ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))));
+ ++dpuIdx;
+ }
+ }
+ }
+ stopTimer(&timer, 4);
+ //hostTime += getElapsedTime(timer);
+
+ }
+
+ // Copy back node levels
+ PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+ startTimer(&timer, 5, 0);
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+ if (dpuNumNodes > 0) {
+ uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+ DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME,
+ dpuParams[dpuIdx].dpuNodeLevel_m,
+ (uint8_t *) (nodeLevel + dpuStartNodeIdx),
+ ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNodes * sizeof(float))));
+ }
+ ++dpuIdx;
+ }
+ stopTimer(&timer, 5);
+ //retrieveTime += getElapsedTime(timer);
+ //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
+
+ // Calculating result on CPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+ uint32_t *nodeLevelReference = (uint32_t*) calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
+ memset(nextFrontier, 0, numNodes / 64 * sizeof(uint64_t));
+ setBit(nextFrontier[0], 0); // Initialize frontier to first node
+ nextFrontierEmpty = 0;
+ level = 1;
+ startTimer(&timer, 6, 0);
+ while (!nextFrontierEmpty) {
+ // Update current frontier and visited list based on the next frontier from the previous iteration
+ for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+ ++nodeTileIdx) {
+ uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
+ currentFrontier[nodeTileIdx] = nextFrontierTile;
+ if (nextFrontierTile) {
+ visited[nodeTileIdx] |= nextFrontierTile;
+ nextFrontier[nodeTileIdx] = 0;
+ for (uint32_t node = nodeTileIdx * 64;
+ node < (nodeTileIdx + 1) * 64; ++node) {
+ if (isSet(nextFrontierTile, node % 64)) {
+ nodeLevelReference[node] =
+ level;
+ }
+ }
+ }
+ }
+ // Visit neighbors of the current frontier
+ nextFrontierEmpty = 1;
+ for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+ ++nodeTileIdx) {
+ uint64_t currentFrontierTile =
+ currentFrontier[nodeTileIdx];
+ if (currentFrontierTile) {
+ for (uint32_t node = nodeTileIdx * 64;
+ node < (nodeTileIdx + 1) * 64; ++node) {
+ if (isSet(currentFrontierTile, node % 64)) { // If the node is in the current frontier
+ // Visit its neighbors
+ uint32_t nodePtr =
+ nodePtrs[node];
+ uint32_t nextNodePtr =
+ nodePtrs[node + 1];
+ for (uint32_t i = nodePtr;
+ i < nextNodePtr; ++i) {
+ uint32_t neighbor =
+ neighborIdxs[i];
+ if (!isSet(visited[neighbor / 64], neighbor % 64)) { // Neighbor not previously visited
+ // Add neighbor to next frontier
+ setBit
+ (nextFrontier
+ [neighbor /
+ 64],
+ neighbor %
+ 64);
+ nextFrontierEmpty
+ = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ ++level;
+ }
+ stopTimer(&timer, 6);
+
+#if WITH_FREE_OVERHEAD
+	startTimer(&timer, 7, 0);
+#endif
+ DPU_ASSERT(dpu_free(dpu_set));
+#if WITH_FREE_OVERHEAD
+ stopTimer(&timer, 7);
+#else
+ zeroTimer(&timer, 7);
+#endif
-// Main of the Host Application
-int main(int argc, char** argv) {
-
- // Process parameters
- struct Params p = input_params(argc, argv);
-
- // Timer and profiling
- Timer timer;
- #if ENERGY
- struct dpu_probe_t probe;
- DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
- double tenergy=0;
- #endif
-
- // Allocate DPUs and load binary
- struct dpu_set_t dpu_set, dpu;
- uint32_t numDPUs;
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
- PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
- // Initialize BFS data structures
- PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
- struct COOGraph cooGraph = readCOOGraph(p.fileName);
- PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
- struct CSRGraph csrGraph = coo2csr(cooGraph);
- uint32_t numNodes = csrGraph.numNodes;
- uint32_t* nodePtrs = csrGraph.nodePtrs;
- uint32_t* neighborIdxs = csrGraph.neighborIdxs;
- uint32_t* nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
- uint64_t* visited = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
- uint64_t* currentFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
- uint64_t* nextFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
- setBit(nextFrontier[0], 0); // Initialize frontier to first node
- uint32_t level = 1;
-
- // Partition data structure across DPUs
- uint32_t numNodesPerDPU = ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1)/numDPUs + 1);
- PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU", numNodesPerDPU);
- struct DPUParams dpuParams[numDPUs];
- uint32_t dpuParams_m[numDPUs];
- unsigned int dpuIdx = 0;
- unsigned int t0ini = 0;
- unsigned int t1ini = 0;
- unsigned int t2ini = 0;
- unsigned int t3ini = 0;
- DPU_FOREACH (dpu_set, dpu) {
-
- // Allocate parameters
- struct mram_heap_allocator_t allocator;
- init_allocator(&allocator);
- dpuParams_m[dpuIdx] = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
- // Find DPU's nodes
- uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
- uint32_t dpuNumNodes;
- if(dpuStartNodeIdx > numNodes) {
- dpuNumNodes = 0;
- } else if(dpuStartNodeIdx + numNodesPerDPU > numNodes) {
- dpuNumNodes = numNodes - dpuStartNodeIdx;
- } else {
- dpuNumNodes = numNodesPerDPU;
- }
- dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
- PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
- PRINT_INFO(p.verbosity >= 2, " Receives %u nodes", dpuNumNodes);
-
- // Partition edges and copy data
- if(dpuNumNodes > 0) {
-
- // Find DPU's CSR graph partition
- uint32_t* dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
- uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
- uint32_t* dpuNeighborIdxs_h = neighborIdxs + dpuNodePtrsOffset;
- uint32_t dpuNumNeighbors = dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
- uint32_t* dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
-
- // Allocate MRAM
- uint32_t dpuNodePtrs_m = mram_heap_alloc(&allocator, (dpuNumNodes + 1)*sizeof(uint32_t));
- uint32_t dpuNeighborIdxs_m = mram_heap_alloc(&allocator, dpuNumNeighbors*sizeof(uint32_t));
- uint32_t dpuNodeLevel_m = mram_heap_alloc(&allocator, dpuNumNodes*sizeof(uint32_t));
- uint32_t dpuVisited_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
- uint32_t dpuCurrentFrontier_m = mram_heap_alloc(&allocator, dpuNumNodes/64*sizeof(uint64_t));
- uint32_t dpuNextFrontier_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
- PRINT_INFO(p.verbosity >= 2, " Total memory allocated is %d bytes", allocator.totalAllocated);
-
- // Set up DPU parameters
- dpuParams[dpuIdx].numNodes = numNodes;
- dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
- dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
- dpuParams[dpuIdx].level = level;
- dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
- dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
- dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
- dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
- dpuParams[dpuIdx].dpuCurrentFrontier_m = dpuCurrentFrontier_m;
- dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
-
- // Send data to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying data to DPU");
- startTimer(&timer, 0, t0ini++);
- copyToDPU(dpu, (uint8_t*)dpuNodePtrs_h, dpuNodePtrs_m, (dpuNumNodes + 1)*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)dpuNeighborIdxs_h, dpuNeighborIdxs_m, dpuNumNeighbors*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)dpuNodeLevel_h, dpuNodeLevel_m, dpuNumNodes*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)visited, dpuVisited_m, numNodes/64*sizeof(uint64_t));
- copyToDPU(dpu, (uint8_t*)nextFrontier, dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
- // NOTE: No need to copy current frontier because it is written before being read
- stopTimer(&timer, 0);
- //loadTime += getElapsedTime(timer);
-
- }
-
- // Send parameters to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU");
- startTimer(&timer, 1, t1ini++);
- copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
- stopTimer(&timer, 1);
- //loadTime += getElapsedTime(timer);
-
- ++dpuIdx;
-
- }
-
- // Iterate until next frontier is empty
- uint32_t nextFrontierEmpty = 0;
- while(!nextFrontierEmpty) {
-
- PRINT_INFO(p.verbosity >= 1, "Processing current frontier for level %u", level);
-
- #if ENERGY
- DPU_ASSERT(dpu_probe_start(&probe));
- #endif
- // Run all DPUs
- PRINT_INFO(p.verbosity >= 1, " Booting DPUs");
- startTimer(&timer, 2, t2ini++);
- DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- stopTimer(&timer, 2);
- //dpuTime += getElapsedTime(timer);
- #if ENERGY
- DPU_ASSERT(dpu_probe_stop(&probe));
- double energy;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
- tenergy += energy;
- #endif
-
-
-
- // Copy back next frontier from all DPUs and compute their union as the current frontier
- startTimer(&timer, 3, t3ini++);
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
- if(dpuNumNodes > 0) {
- if(dpuIdx == 0) {
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)currentFrontier, numNodes/64*sizeof(uint64_t));
- } else {
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)nextFrontier, numNodes/64*sizeof(uint64_t));
- for(uint32_t i = 0; i < numNodes/64; ++i) {
- currentFrontier[i] |= nextFrontier[i];
- }
- }
- ++dpuIdx;
- }
- }
-
- // Check if the next frontier is empty, and copy data to DPU if not empty
- nextFrontierEmpty = 1;
- for(uint32_t i = 0; i < numNodes/64; ++i) {
- if(currentFrontier[i]) {
- nextFrontierEmpty = 0;
- break;
- }
- }
- if(!nextFrontierEmpty) {
- ++level;
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
- if(dpuNumNodes > 0) {
- // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
- copyToDPU(dpu, (uint8_t*)currentFrontier, dpuParams[dpuIdx].dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
- // Copy new level to DPU
- dpuParams[dpuIdx].level = level;
- copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
- ++dpuIdx;
- }
- }
- }
- stopTimer(&timer, 3);
- //hostTime += getElapsedTime(timer);
-
- }
-
- // Copy back node levels
- PRINT_INFO(p.verbosity >= 1, "Copying back the result");
- startTimer(&timer, 4, 0);
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
- if(dpuNumNodes > 0) {
- uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m, (uint8_t*)(nodeLevel + dpuStartNodeIdx), dpuNumNodes*sizeof(float));
- }
- ++dpuIdx;
- }
- stopTimer(&timer, 4);
- //retrieveTime += getElapsedTime(timer);
- //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
-
- // Calculating result on CPU
- PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
- uint32_t* nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
- memset(nextFrontier, 0, numNodes/64*sizeof(uint64_t));
- setBit(nextFrontier[0], 0); // Initialize frontier to first node
- nextFrontierEmpty = 0;
- level = 1;
- while(!nextFrontierEmpty) {
- // Update current frontier and visited list based on the next frontier from the previous iteration
- for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
- uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
- currentFrontier[nodeTileIdx] = nextFrontierTile;
- if(nextFrontierTile) {
- visited[nodeTileIdx] |= nextFrontierTile;
- nextFrontier[nodeTileIdx] = 0;
- for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
- if(isSet(nextFrontierTile, node%64)) {
- nodeLevelReference[node] = level;
- }
- }
- }
- }
- // Visit neighbors of the current frontier
- nextFrontierEmpty = 1;
- for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
- uint64_t currentFrontierTile = currentFrontier[nodeTileIdx];
- if(currentFrontierTile) {
- for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
- if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier
- // Visit its neighbors
- uint32_t nodePtr = nodePtrs[node];
- uint32_t nextNodePtr = nodePtrs[node + 1];
- for(uint32_t i = nodePtr; i < nextNodePtr; ++i) {
- uint32_t neighbor = neighborIdxs[i];
- if(!isSet(visited[neighbor/64], neighbor%64)) { // Neighbor not previously visited
- // Add neighbor to next frontier
- setBit(nextFrontier[neighbor/64], neighbor%64);
- nextFrontierEmpty = 0;
- }
- }
- }
- }
- }
- }
- ++level;
- }
-
- // Verify the result
- PRINT_INFO(p.verbosity >= 1, "Verifying the result");
- int status = 1;
- for(uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
- if(nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
- PRINT_ERROR("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", nodeIdx, nodeLevelReference[nodeIdx], nodeLevel[nodeIdx]);
- status = 0;
- }
- }
-
- if (status) {
- printf("[::] BFS NMC | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d "
- "| throughput_pim_MBps=%f throughput_MBps=%f",
- numDPUs, NR_TASKLETS, "uint32_t", numNodes,
- numNodes * sizeof(uint32_t) / (timer.time[2] + timer.time[3]),
- numNodes * sizeof(uint32_t) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
- printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
- numNodes / (timer.time[2] + timer.time[3]),
- numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
- printAll(&timer, 4);
- }
-
- // Display DPU Logs
- if(p.verbosity >= 2) {
- PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- PRINT("DPU %u:", dpuIdx);
- DPU_ASSERT(dpu_log_read(dpu, stdout));
- ++dpuIdx;
- }
- }
-
- // Deallocate data structures
- freeCOOGraph(cooGraph);
- freeCSRGraph(csrGraph);
- free(nodeLevel);
- free(visited);
- free(currentFrontier);
- free(nextFrontier);
- free(nodeLevelReference);
-
- return 0;
+ // Verify the result
+ PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+ int status = 1;
+ for (uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
+ if (nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
+ PRINT_ERROR
+ ("Mismatch at node %u (CPU result = level %u, DPU result = level %u)",
+ nodeIdx, nodeLevelReference[nodeIdx],
+ nodeLevel[nodeIdx]);
+ status = 0;
+ }
+ }
+
+ if (status) {
+ dfatool_printf
+ ("[::] BFS-UMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d "
+ "| throughput_pim_MBps=%f throughput_MBps=%f", numDPUs, numRanks,
+ NR_TASKLETS, "uint32_t", numNodes,
+ numNodes * sizeof(uint32_t) / (timer.time[2] +
+ timer.time[3]),
+ numNodes * sizeof(uint32_t) / (timer.time[0] +
+ timer.time[1] +
+ timer.time[2] +
+ timer.time[3] +
+ timer.time[4]));
+ dfatool_printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
+ numNodes / (timer.time[2] + timer.time[3]),
+ numNodes / (timer.time[0] + timer.time[1] +
+ timer.time[2] + timer.time[3] +
+ timer.time[4]));
+ dfatool_printf
+ (" latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_cpu_us=%f latency_free_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[4], timer.time[5], timer.time[6],
+ timer.time[7]);
+ }
+ // Display DPU Logs
+ if (p.verbosity >= 2) {
+ PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ PRINT("DPU %u:", dpuIdx);
+ DPU_ASSERT(dpu_log_read(dpu, stdout));
+ ++dpuIdx;
+ }
+ }
+ // Deallocate data structures
+ freeCOOGraph(cooGraph);
+ freeCSRGraph(csrGraph);
+ free(nodeLevel);
+ free(visited);
+ free(currentFrontier);
+ free(nextFrontier);
+ free(nodeLevelReference);
+
+ return 0;
}
-
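
Note: the timer slots printed in the latency line map to phases as follows (inferred from the startTimer/stopTimer calls above): 0 = dpu_alloc, 1 = dpu_load, 2 = CPU-to-DPU writes, 3 = DPU kernel launches, 4 = inter-iteration frontier union and re-copy, 5 = result read-back, 6 = CPU reference BFS, 7 = dpu_free.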
diff --git a/BFS/host/mram-management.h b/BFS/host/mram-management.h
index 627dfde..a953d6a 100644
--- a/BFS/host/mram-management.h
+++ b/BFS/host/mram-management.h
@@ -1,37 +1,29 @@
+#pragma once
-#ifndef _MRAM_MANAGEMENT_H_
-#define _MRAM_MANAGEMENT_H_
+#include "common.h"
+#include "utils.h"
-#include "../support/common.h"
-#include "../support/utils.h"
-
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
struct mram_heap_allocator_t {
- uint32_t totalAllocated;
+ uint32_t totalAllocated;
};
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
- allocator->totalAllocated = 0;
-}
-
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
- uint32_t ret = allocator->totalAllocated;
- allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
- if(allocator->totalAllocated > DPU_CAPACITY) {
- PRINT_ERROR(" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
- exit(0);
- }
- return ret;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+ allocator->totalAllocated = 0;
}
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
- DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
+ uint32_t size)
+{
+ uint32_t ret = allocator->totalAllocated;
+ allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
+ if (allocator->totalAllocated > DPU_CAPACITY) {
+ PRINT_ERROR
+ (" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!",
+ allocator->totalAllocated, DPU_CAPACITY);
+ exit(0);
+ }
+ return ret;
}
-
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
- DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
-}
-
-#endif
-
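
Note: with the copyToDPU/copyFromDPU wrappers gone, host/app.c issues dpu_copy_to/dpu_copy_from directly against the MRAM heap symbol, using mram_heap_alloc only to hand out offsets. A minimal sketch of the pattern (buffer names and sizes illustrative):

    struct mram_heap_allocator_t allocator;
    init_allocator(&allocator);
    uint32_t buf_m = mram_heap_alloc(&allocator, n * sizeof(uint32_t));   // offset into the DPU MRAM heap
    DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, buf_m, (uint8_t *)buf_h,
                           ROUND_UP_TO_MULTIPLE_OF_8(n * sizeof(uint32_t))));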
diff --git a/BFS/include/common.h b/BFS/include/common.h
new file mode 100644
index 0000000..5f2aa0d
--- /dev/null
+++ b/BFS/include/common.h
@@ -0,0 +1,25 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#define ROUND_UP_TO_MULTIPLE_OF_2(x) ((((x) + 1)/2)*2)
+#define ROUND_UP_TO_MULTIPLE_OF_8(x) ((((x) + 7)/8)*8)
+#define ROUND_UP_TO_MULTIPLE_OF_64(x) ((((x) + 63)/64)*64)
+
+#define setBit(val, idx) (val) |= (1 << (idx))
+#define isSet(val, idx) ((val) & (1 << (idx)))
+
+struct DPUParams {
+ uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */
+ uint32_t numNodes; /* Total number of nodes in the graph */
+ uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU */
+ uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */
+ uint32_t level; /* The current BFS level */
+ uint32_t dpuNodePtrs_m;
+ uint32_t dpuNeighborIdxs_m;
+ uint32_t dpuNodeLevel_m;
+ uint32_t dpuVisited_m;
+ uint32_t dpuCurrentFrontier_m;
+ uint32_t dpuNextFrontier_m;
+};
+
+#endif
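
Note: setBit/isSet shift an int literal, while task.c and host/app.c apply them to 64-bit frontier tiles with bit indices up to 63. If that ever needs tightening, a 64-bit-safe variant would be:

    #define setBit(val, idx) ((val) |= (1ULL << (idx)))
    #define isSet(val, idx)  ((val) & (1ULL << (idx)))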
diff --git a/BFS/include/dfatool_host.ah b/BFS/include/dfatool_host.ah
new file mode 100644
index 0000000..b2677e1
--- /dev/null
+++ b/BFS/include/dfatool_host.ah
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <sys/time.h>
+#include "dfatool_host_dpu.ah"
+
+aspect DfatoolHostTiming : public DfatoolHostDPUTiming {
+
+ unsigned long input_size;
+ unsigned int element_size;
+
+ virtual int getKernel() { return 1; }
+
+ DfatoolHostTiming() {
+ element_size = sizeof(uint32_t);
+ }
+
+ advice call("% input_params(...)"): after() {
+ printf("[>>] BFS | n_dpus=%u\n", NR_DPUS);
+ }
+
+ advice call("% coo2csr(...)") : after() {
+ struct CSRGraph *g = tjp->result();
+ input_size = g->numNodes;
+ printf("[--] BFS | n_dpus=%u n_nodes=%lu\n", NR_DPUS, input_size);
+ }
+
+ advice execution("% main(...)") : after() {
+ printf("[<<] BFS | n_dpus=%u n_nodes=%lu\n", NR_DPUS, input_size);
+ }
+};
diff --git a/BFS/include/graph.h b/BFS/include/graph.h
new file mode 100644
index 0000000..2a19f67
--- /dev/null
+++ b/BFS/include/graph.h
@@ -0,0 +1,133 @@
+
+#ifndef _GRAPH_H_
+#define _GRAPH_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "common.h"
+#include "utils.h"
+
+struct COOGraph {
+ uint32_t numNodes;
+ uint32_t numEdges;
+ uint32_t *nodeIdxs;
+ uint32_t *neighborIdxs;
+};
+
+struct CSRGraph {
+ uint32_t numNodes;
+ uint32_t numEdges;
+ uint32_t *nodePtrs;
+ uint32_t *neighborIdxs;
+};
+
+static struct COOGraph readCOOGraph(const char *fileName)
+{
+
+ struct COOGraph cooGraph;
+
+ // Initialize fields
+ FILE *fp = fopen(fileName, "r");
+ uint32_t numNodes, numCols;
+ assert(fscanf(fp, "%u", &numNodes));
+ assert(fscanf(fp, "%u", &numCols));
+ if (numNodes == numCols) {
+ cooGraph.numNodes = numNodes;
+ } else {
+ PRINT_WARNING
+ (" Adjacency matrix is not square. Padding matrix to be square.");
+ cooGraph.numNodes = (numNodes > numCols) ? numNodes : numCols;
+ }
+ if (cooGraph.numNodes % 64 != 0) {
+ PRINT_WARNING
+ (" Adjacency matrix dimension is %u which is not a multiple of 64 nodes.",
+ cooGraph.numNodes);
+ cooGraph.numNodes += (64 - cooGraph.numNodes % 64);
+ PRINT_WARNING
+ (" Padding to %u which is a multiple of 64 nodes.",
+ cooGraph.numNodes);
+ }
+ assert(fscanf(fp, "%u", &cooGraph.numEdges));
+ cooGraph.nodeIdxs =
+ (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t));
+ cooGraph.neighborIdxs =
+ (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t));
+
+ // Read the neighborIdxs
+ for (uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) {
+ uint32_t nodeIdx;
+ assert(fscanf(fp, "%u", &nodeIdx));
+ cooGraph.nodeIdxs[edgeIdx] = nodeIdx;
+ uint32_t neighborIdx;
+ assert(fscanf(fp, "%u", &neighborIdx));
+ cooGraph.neighborIdxs[edgeIdx] = neighborIdx;
+ }
+
+ return cooGraph;
+
+}
+
+static void freeCOOGraph(struct COOGraph cooGraph)
+{
+ free(cooGraph.nodeIdxs);
+ free(cooGraph.neighborIdxs);
+}
+
+static struct CSRGraph coo2csr(struct COOGraph cooGraph)
+{
+
+ struct CSRGraph csrGraph;
+
+ // Initialize fields
+ csrGraph.numNodes = cooGraph.numNodes;
+ csrGraph.numEdges = cooGraph.numEdges;
+ csrGraph.nodePtrs =
+ (uint32_t *)
+ calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1),
+ sizeof(uint32_t));
+ csrGraph.neighborIdxs =
+ (uint32_t *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ (csrGraph.numEdges * sizeof(uint32_t)));
+
+ // Histogram nodeIdxs
+ for (uint32_t i = 0; i < cooGraph.numEdges; ++i) {
+ uint32_t nodeIdx = cooGraph.nodeIdxs[i];
+ csrGraph.nodePtrs[nodeIdx]++;
+ }
+
+ // Prefix sum nodePtrs
+ uint32_t sumBeforeNextNode = 0;
+ for (uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) {
+ uint32_t sumBeforeNode = sumBeforeNextNode;
+ sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx];
+ csrGraph.nodePtrs[nodeIdx] = sumBeforeNode;
+ }
+ csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode;
+
+ // Bin the neighborIdxs
+ for (uint32_t i = 0; i < cooGraph.numEdges; ++i) {
+ uint32_t nodeIdx = cooGraph.nodeIdxs[i];
+ uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++;
+ csrGraph.neighborIdxs[neighborListIdx] =
+ cooGraph.neighborIdxs[i];
+ }
+
+ // Restore nodePtrs
+ for (uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) {
+ csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1];
+ }
+ csrGraph.nodePtrs[0] = 0;
+
+ return csrGraph;
+
+}
+
+static void freeCSRGraph(struct CSRGraph csrGraph)
+{
+ free(csrGraph.nodePtrs);
+ free(csrGraph.neighborIdxs);
+}
+
+#endif
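
Note: readCOOGraph expects a whitespace-separated text file: the adjacency-matrix dimensions (rows, then columns), the edge count, then one (node, neighbor) pair per edge. A tiny hand-written example (contents illustrative; inputs such as data/roadNet-CA.txt are read with the same layout):

    4 4
    3
    0 1
    0 2
    1 3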
diff --git a/BFS/include/params.h b/BFS/include/params.h
new file mode 100644
index 0000000..f9169bc
--- /dev/null
+++ b/BFS/include/params.h
@@ -0,0 +1,67 @@
+
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+#include "utils.h"
+
+static void usage()
+{
+ PRINT("\nUsage: ./program [options]"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -f <F> input matrix file name (default=data/roadNet-CA.txt)"
+ "\n"
+ "\nGeneral options:"
+ "\n -v <V> verbosity" "\n -h help" "\n\n");
+}
+
+typedef struct Params {
+ const char *fileName;
+ unsigned int verbosity;
+#if NUMA
+ struct bitmask *bitmask_in;
+ int numa_node_cpu;
+#endif
+} Params;
+
+static struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.fileName = "data/roadNet-CA.txt";
+ p.verbosity = 0;
+#if NUMA
+ p.bitmask_in = NULL;
+ p.numa_node_cpu = -1;
+#endif
+ int opt;
+ while ((opt = getopt(argc, argv, "f:v:hA:C:")) >= 0) {
+ switch (opt) {
+ case 'f':
+ p.fileName = optarg;
+ break;
+ case 'v':
+ p.verbosity = atoi(optarg);
+ break;
+#if NUMA
+ case 'A':
+ p.bitmask_in = numa_parse_nodestring(optarg);
+ break;
+ case 'C':
+ p.numa_node_cpu = atoi(optarg);
+ break;
+#endif
+ case 'h':
+ usage();
+ exit(0);
+ default:
+ PRINT_ERROR("Unrecognized option!");
+ usage();
+ exit(0);
+ }
+ }
+
+ return p;
+}
+
+#endif
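
Note: a typical run with verbose output (verbosity level 2 additionally dumps the DPU logs at the end of host/app.c):

    ./bin/host_code -f data/loc-gowalla_edges.txt -v 2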
diff --git a/BFS/include/timer.h b/BFS/include/timer.h
new file mode 100644
index 0000000..e85490f
--- /dev/null
+++ b/BFS/include/timer.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#define N_TIMERS 8
+#define startTimer start
+#define stopTimer stop
+#define zeroTimer zero
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/BFS/support/utils.h b/BFS/include/utils.h
index ddb1e2c..ccd8fbd 100644
--- a/BFS/support/utils.h
+++ b/BFS/include/utils.h
@@ -8,4 +8,3 @@
#define PRINT(fmt, ...) printf(fmt "\n", ##__VA_ARGS__)
#endif
-
diff --git a/BFS/run-paper-strong-full.sh b/BFS/run-paper-strong-full.sh
deleted file mode 100755
index 42806a2..0000000
--- a/BFS/run-paper-strong-full.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks BFS strong-full (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >2048 are not part of upstream
-for nr_dpus in 2543 2304 256 512 1024 2048; do
- for nr_tasklets in 1 2 4 8 16; do
- echo
- if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
- # repetition is not part of upstream setup
- for i in `seq 1 50`; do
- timeout --foreground -k 1m 5m bin/host_code -f data/loc-gowalla_edges.txt || true
- done
- fi
- done
-done
-) | tee log-paper-strong-full.txt
diff --git a/BFS/run-paper-strong-rank.sh b/BFS/run-paper-strong-rank.sh
deleted file mode 100755
index e01d18a..0000000
--- a/BFS/run-paper-strong-rank.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks BFS strong-rank (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# >64 are not part of upstream
-for nr_dpus in 128 1 4 16 64; do
- for nr_tasklets in 1 2 4 8 16; do
- echo
- if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
- # repetition is not part of upstream setup
- for i in `seq 1 50`; do
- timeout --foreground -k 1m 5m bin/host_code -f data/loc-gowalla_edges.txt || true
- done
- fi
- done
-done
-) | tee log-paper-strong-rank.txt
diff --git a/BFS/run-paper-weak.sh b/BFS/run-paper-weak.sh
deleted file mode 100755
index 121758a..0000000
--- a/BFS/run-paper-weak.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-set -e
-
-(
-
-echo "prim-benchmarks BFS weak (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# 256 and 512 are not part of upstream
-for nr_dpus in 256 512 1 4 16 64; do
- for nr_tasklets in 1 2 4 8 16; do
- echo
- if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then
- # repetition is not part of upstream setup
- for i in `seq 1 50`; do
- # upstream code uses some kind of generated rMat graphs, but does not provide instructions for reproduction
- timeout --foreground -k 1m 3m bin/host_code -f data/loc-gowalla_edges.txt || true
- done
- fi
- done
-done
-) | tee log-paper-weak.txt
diff --git a/BFS/run.sh b/BFS/run.sh
deleted file mode 100755
index 8f5bfb8..0000000
--- a/BFS/run.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# -f: input file (i.e., input size)
-# bin/host_code -f data/loc-gowalla_edges.txt
-
-# input size depends on file -> strong scaling only
-
-echo "prim-benchmarks BFS (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-for nr_dpus in 1 2 4 8 16 32 64 128 256 512; do
- for nr_tasklets in 1 2 3 4 6 8 10 12 16 20 24; do
- for f in loc-gowalla_edges roadNet-CA; do
- echo
- if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then
- for i in `seq 1 20`; do
- timeout --foreground -k 1m 30m bin/host_code -f data/${f}.txt || true
- done
- fi
- done
- done
-done
diff --git a/BFS/support/common.h b/BFS/support/common.h
deleted file mode 100644
index ced324c..0000000
--- a/BFS/support/common.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _COMMON_H_
-#define _COMMON_H_
-
-#define ROUND_UP_TO_MULTIPLE_OF_2(x) ((((x) + 1)/2)*2)
-#define ROUND_UP_TO_MULTIPLE_OF_8(x) ((((x) + 7)/8)*8)
-#define ROUND_UP_TO_MULTIPLE_OF_64(x) ((((x) + 63)/64)*64)
-
-#define setBit(val, idx) (val) |= (1 << (idx))
-#define isSet(val, idx) ((val) & (1 << (idx)))
-
-struct DPUParams {
- uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */
- uint32_t numNodes; /* Total number of nodes in the graph */
- uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU */
- uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */
- uint32_t level; /* The current BFS level */
- uint32_t dpuNodePtrs_m;
- uint32_t dpuNeighborIdxs_m;
- uint32_t dpuNodeLevel_m;
- uint32_t dpuVisited_m;
- uint32_t dpuCurrentFrontier_m;
- uint32_t dpuNextFrontier_m;
-};
-
-#endif
-
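
Usage note: a minimal sketch of the setBit/isSet macros from the removed support/common.h, assuming 32-bit bitmap words (the macros shift a plain 1, so bit indices must stay below 32). The bitmap layout here is illustrative.

#include <stdint.h>
#include <stdio.h>
#include "common.h"

int main(void) {
	uint32_t visited[4] = { 0 };          /* room for 128 nodes */
	uint32_t nodeIdx = 70;

	/* Mark node 70 as visited, then test it. */
	setBit(visited[nodeIdx / 32], nodeIdx % 32);
	printf("node %u set: %s\n", nodeIdx,
	       isSet(visited[nodeIdx / 32], nodeIdx % 32) ? "yes" : "no");
	return 0;
}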
diff --git a/BFS/support/graph.h b/BFS/support/graph.h
deleted file mode 100644
index f89ff5c..0000000
--- a/BFS/support/graph.h
+++ /dev/null
@@ -1,116 +0,0 @@
-
-#ifndef _GRAPH_H_
-#define _GRAPH_H_
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "common.h"
-#include "utils.h"
-
-struct COOGraph {
- uint32_t numNodes;
- uint32_t numEdges;
- uint32_t* nodeIdxs;
- uint32_t* neighborIdxs;
-};
-
-struct CSRGraph {
- uint32_t numNodes;
- uint32_t numEdges;
- uint32_t* nodePtrs;
- uint32_t* neighborIdxs;
-};
-
-static struct COOGraph readCOOGraph(const char* fileName) {
-
- struct COOGraph cooGraph;
-
- // Initialize fields
- FILE* fp = fopen(fileName, "r");
- uint32_t numNodes, numCols;
- assert(fscanf(fp, "%u", &numNodes));
- assert(fscanf(fp, "%u", &numCols));
- if(numNodes == numCols) {
- cooGraph.numNodes = numNodes;
- } else {
- PRINT_WARNING(" Adjacency matrix is not square. Padding matrix to be square.");
- cooGraph.numNodes = (numNodes > numCols)? numNodes : numCols;
- }
- if(cooGraph.numNodes%64 != 0) {
- PRINT_WARNING(" Adjacency matrix dimension is %u which is not a multiple of 64 nodes.", cooGraph.numNodes);
- cooGraph.numNodes += (64 - cooGraph.numNodes%64);
- PRINT_WARNING(" Padding to %u which is a multiple of 64 nodes.", cooGraph.numNodes);
- }
- assert(fscanf(fp, "%u", &cooGraph.numEdges));
- cooGraph.nodeIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t));
- cooGraph.neighborIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t));
-
- // Read the neighborIdxs
- for(uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) {
- uint32_t nodeIdx;
- assert(fscanf(fp, "%u", &nodeIdx));
- cooGraph.nodeIdxs[edgeIdx] = nodeIdx;
- uint32_t neighborIdx;
- assert(fscanf(fp, "%u", &neighborIdx));
- cooGraph.neighborIdxs[edgeIdx] = neighborIdx;
- }
-
- return cooGraph;
-
-}
-
-static void freeCOOGraph(struct COOGraph cooGraph) {
- free(cooGraph.nodeIdxs);
- free(cooGraph.neighborIdxs);
-}
-
-static struct CSRGraph coo2csr(struct COOGraph cooGraph) {
-
- struct CSRGraph csrGraph;
-
- // Initialize fields
- csrGraph.numNodes = cooGraph.numNodes;
- csrGraph.numEdges = cooGraph.numEdges;
- csrGraph.nodePtrs = (uint32_t*) calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1), sizeof(uint32_t));
- csrGraph.neighborIdxs = (uint32_t*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrGraph.numEdges*sizeof(uint32_t)));
-
- // Histogram nodeIdxs
- for(uint32_t i = 0; i < cooGraph.numEdges; ++i) {
- uint32_t nodeIdx = cooGraph.nodeIdxs[i];
- csrGraph.nodePtrs[nodeIdx]++;
- }
-
- // Prefix sum nodePtrs
- uint32_t sumBeforeNextNode = 0;
- for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) {
- uint32_t sumBeforeNode = sumBeforeNextNode;
- sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx];
- csrGraph.nodePtrs[nodeIdx] = sumBeforeNode;
- }
- csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode;
-
- // Bin the neighborIdxs
- for(uint32_t i = 0; i < cooGraph.numEdges; ++i) {
- uint32_t nodeIdx = cooGraph.nodeIdxs[i];
- uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++;
- csrGraph.neighborIdxs[neighborListIdx] = cooGraph.neighborIdxs[i];
- }
-
- // Restore nodePtrs
- for(uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) {
- csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1];
- }
- csrGraph.nodePtrs[0] = 0;
-
- return csrGraph;
-
-}
-
-static void freeCSRGraph(struct CSRGraph csrGraph) {
- free(csrGraph.nodePtrs);
- free(csrGraph.neighborIdxs);
-}
-
-#endif
-
diff --git a/BFS/support/params.h b/BFS/support/params.h
deleted file mode 100644
index f4f12e7..0000000
--- a/BFS/support/params.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-#ifndef _PARAMS_H_
-#define _PARAMS_H_
-
-#include "common.h"
-#include "utils.h"
-
-static void usage() {
- PRINT( "\nUsage: ./program [options]"
- "\n"
- "\nBenchmark-specific options:"
- "\n -f <F> input matrix file name (default=data/roadNet-CA.txt)"
- "\n"
- "\nGeneral options:"
- "\n -v <V> verbosity"
- "\n -h help"
- "\n\n");
-}
-
-typedef struct Params {
- const char* fileName;
- unsigned int verbosity;
-} Params;
-
-static struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.fileName = "data/roadNet-CA.txt";
- p.verbosity = 0;
- int opt;
- while((opt = getopt(argc, argv, "f:v:h")) >= 0) {
- switch(opt) {
- case 'f': p.fileName = optarg; break;
- case 'v': p.verbosity = atoi(optarg); break;
- case 'h': usage(); exit(0);
- default:
- PRINT_ERROR("Unrecognized option!");
- usage();
- exit(0);
- }
- }
-
- return p;
-}
-
-#endif
-
diff --git a/BFS/support/timer.h b/BFS/support/timer.h
deleted file mode 100644
index 80719cf..0000000
--- a/BFS/support/timer.h
+++ /dev/null
@@ -1,34 +0,0 @@
-
-#ifndef _TIMER_H_
-#define _TIMER_H_
-
-#include <stdio.h>
-#include <sys/time.h>
-
-typedef struct Timer {
- struct timeval startTime[5];
- struct timeval stopTime[5];
- double time[5];
-} Timer;
-
-static void startTimer(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-static void stopTimer(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-static void printAll(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
-
-#endif
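
Usage note: a minimal sketch of how the removed support/timer.h interface was driven. Each slot accumulates across repetitions because startTimer() only zeroes it when rep == 0; the phase names are illustrative.

#include "timer.h"   /* the removed BFS/support/timer.h shown above */

void time_phases(Timer *timer, int rep) {
	startTimer(timer, 0, rep);
	/* ... phase 0, e.g. host-to-DPU copy ... */
	stopTimer(timer, 0);

	startTimer(timer, 1, rep);
	/* ... phase 1, e.g. kernel launch ... */
	stopTimer(timer, 1);
}

/* After the last repetition, printAll(&timer, 1) emits timer0_us= and timer1_us=. */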