diff options
Diffstat (limited to 'BFS')
-rw-r--r-- | BFS/Makefile | 36 | ||||
-rw-r--r-- | BFS/baselines/cpu/Makefile | 26 | ||||
-rw-r--r-- | BFS/baselines/cpu/app.c | 41 | ||||
-rwxr-xr-x | BFS/benchmark-scripts/ccmcc25-sim.sh | 27 | ||||
-rwxr-xr-x | BFS/benchmark-scripts/ccmcc25.sh | 32 | ||||
-rw-r--r-- | BFS/dpu/dpu-utils.h | 61 | ||||
-rw-r--r-- | BFS/dpu/task.c | 276 | ||||
-rw-r--r-- | BFS/host/app.c | 750 | ||||
-rw-r--r-- | BFS/host/mram-management.h | 48 | ||||
-rw-r--r-- | BFS/include/common.h | 25 | ||||
-rw-r--r-- | BFS/include/dfatool_host.ah | 30 | ||||
-rw-r--r-- | BFS/include/graph.h | 133 | ||||
-rw-r--r-- | BFS/include/params.h | 67 | ||||
-rw-r--r-- | BFS/include/timer.h | 8 | ||||
-rw-r--r-- | BFS/include/utils.h (renamed from BFS/support/utils.h) | 1 | ||||
-rwxr-xr-x | BFS/run-paper-strong-full.sh | 23 | ||||
-rwxr-xr-x | BFS/run-paper-strong-rank.sh | 23 | ||||
-rwxr-xr-x | BFS/run-paper-weak.sh | 24 | ||||
-rwxr-xr-x | BFS/run.sh | 25 | ||||
-rw-r--r-- | BFS/support/common.h | 26 | ||||
-rw-r--r-- | BFS/support/graph.h | 116 | ||||
-rw-r--r-- | BFS/support/params.h | 46 | ||||
-rw-r--r-- | BFS/support/timer.h | 34 |
23 files changed, 1058 insertions, 820 deletions
diff --git a/BFS/Makefile b/BFS/Makefile index a4ea69d..a773b38 100644 --- a/BFS/Makefile +++ b/BFS/Makefile @@ -1,17 +1,37 @@ NR_DPUS ?= 1 NR_TASKLETS ?= 16 +WITH_ALLOC_OVERHEAD ?= 0 +WITH_LOAD_OVERHEAD ?= 0 +WITH_FREE_OVERHEAD ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS := -Wall -Wextra -g -Iinclude +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ -ifdef verbose +ifeq (${verbose}, 1) QUIET = endif @@ -20,11 +40,13 @@ all: bin/host_code bin/dpu_code bin: ${QUIET}mkdir -p bin -bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin +bin/dpu_code: ${DPU_SOURCES} include bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} -bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +bin/host_code: ${HOST_SOURCES} include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah clean: ${QUIET}rm -rf bin diff --git a/BFS/baselines/cpu/Makefile b/BFS/baselines/cpu/Makefile index 1f6ed3c..1efe457 100644 --- a/BFS/baselines/cpu/Makefile +++ b/BFS/baselines/cpu/Makefile @@ -1,8 +1,26 @@ -.PHONY: all -all: bfs +benchmark ?= 1 +debug ?= 0 +native ?= 1 +nop_sync ?= 0 +numa ?= 0 + +LDFLAGS = +CFLAGS = + +ifeq (${debug}, 1) + CFLAGS += -g +endif + +ifeq (${native}, 1) + CFLAGS += -march=native +endif + +ifeq (${numa}, 1) + LDFLAGS += -lnuma +endif bfs: app.c - gcc -Wall -Wextra -pedantic -march=native -O2 -o bfs -fopenmp app.c + gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -o bfs -fopenmp app.c ${LDFLAGS} bfs_O0: app.c gcc -o bfs_O0 -fopenmp app.c @@ -27,3 +45,5 @@ run_O2: bfs_O2 .PHONY: clean clean: rm -f bfs bfs_O0 bfs_O2 + +.PHONY: all diff --git a/BFS/baselines/cpu/app.c b/BFS/baselines/cpu/app.c index caf4cbc..390b1f9 100644 --- a/BFS/baselines/cpu/app.c +++ b/BFS/baselines/cpu/app.c @@ -8,12 +8,30 @@ #include <omp.h> +#if NUMA +#include <numaif.h> +#include <numa.h> + +void* mp_pages[1]; +int mp_status[1]; +int mp_nodes[1]; +struct bitmask* bitmask_in; +int numa_node_in = -1; +int numa_node_cpu = -1; +#endif + #include "../../support/common.h" #include "../../support/graph.h" #include "../../support/params.h" -#include "../../support/timer.h" #include "../../support/utils.h" +#if WITH_BENCHMARK +#include "../../support/timer.h" +#else +#define startTimer(...) +#define stopTimer(...) +#endif + int main(int argc, char** argv) { // Process parameters @@ -24,8 +42,9 @@ int main(int argc, char** argv) { struct COOGraph cooGraph = readCOOGraph(p.fileName); PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges); - +#if WITH_BENCHMARK Timer timer; +#endif for(int rep = 0; rep < 100; rep++) { struct CSRGraph csrGraph = coo2csr(cooGraph); @@ -43,6 +62,12 @@ int main(int argc, char** argv) { uint32_t* prevFrontier = buffer1; uint32_t* currFrontier = buffer2; +#if NOP_SYNC + for(int rep = 0; rep < 200000; rep++) { + asm volatile("nop" ::); + } +#endif + // Calculating result on CPU startTimer(&timer, 0, 0); nodeLevel[srcNode] = 0; @@ -86,6 +111,12 @@ int main(int argc, char** argv) { } stopTimer(&timer, 0); +#if NOP_SYNC + for(int rep = 0; rep < 200000; rep++) { + asm volatile("nop" ::); + } +#endif + freeCSRGraph(csrGraph); free(buffer1); free(buffer2); @@ -135,6 +166,7 @@ int main(int argc, char** argv) { } stopTimer(&timer, 1); +#if WITH_BENCHMARK unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic @@ -158,8 +190,11 @@ int main(int argc, char** argv) { printf(" throughput_seq_MOpps=%f throughput_MOpps=%f", csrGraph.numNodes / timer.time[1], csrGraph.numNodes / timer.time[0]); - printAll(&timer, 1); + printf(" latency_us=%f latency_seq_us=%f\n", + timer.time[0], + timer.time[1]); } +#endif // WITH_BENCHMARK freeCSRGraph(csrGraph); free(nodeLevel); diff --git a/BFS/benchmark-scripts/ccmcc25-sim.sh b/BFS/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..bcbe284 --- /dev/null +++ b/BFS/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -f ${data} 2>&1 +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks BFS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +# BFS does not support repeated kernel invocations → repeat it here +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} \ + ::: i $(seq 0 4) \ + ::: data data/roadNet-CA.txt data/loc-gowalla_edges.txt \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ +>> ${fn}.txt diff --git a/BFS/benchmark-scripts/ccmcc25.sh b/BFS/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..0dcf4bb --- /dev/null +++ b/BFS/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -f ${data} 2>&1 +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks BFS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + # BFS does not support repeated kernel invocations → repeat it here + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any data={data} \ + ::: i $(seq 0 10) \ + ::: data data/roadNet-CA.txt data/loc-gowalla_edges.txt \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + >> ${fn}.txt + +done diff --git a/BFS/dpu/dpu-utils.h b/BFS/dpu/dpu-utils.h index b02c073..dc986d2 100644 --- a/BFS/dpu/dpu-utils.h +++ b/BFS/dpu/dpu-utils.h @@ -6,39 +6,46 @@ #define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m "fmt"\n", ##__VA_ARGS__) -static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) { - mram_read((__mram_ptr void const*)(ptr_m + idx*sizeof(uint64_t)), cache_w, 8); - return cache_w[0]; +static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w) +{ + mram_read((__mram_ptr void const *)(ptr_m + idx * sizeof(uint64_t)), + cache_w, 8); + return cache_w[0]; } -static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) { - cache_w[0] = val; - mram_write(cache_w, (__mram_ptr void*)(ptr_m + idx*sizeof(uint64_t)), 8); +static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx, + uint64_t *cache_w) +{ + cache_w[0] = val; + mram_write(cache_w, (__mram_ptr void *)(ptr_m + idx * sizeof(uint64_t)), + 8); } -static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) { - // Load 8B - uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t); - uint32_t offset = ((uint32_t)ptr_idx_m)%8; - uint32_t ptr_block_m = ptr_idx_m - offset; - mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8); - // Extract 4B - uint32_t* cache_32_w = (uint32_t*) cache_w; - return cache_32_w[offset/4]; +static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w) +{ + // Load 8B + uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t); + uint32_t offset = ((uint32_t) ptr_idx_m) % 8; + uint32_t ptr_block_m = ptr_idx_m - offset; + mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8); + // Extract 4B + uint32_t *cache_32_w = (uint32_t *) cache_w; + return cache_32_w[offset / 4]; } -static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) { - // Load 8B - uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t); - uint32_t offset = ((uint32_t)ptr_idx_m)%8; - uint32_t ptr_block_m = ptr_idx_m - offset; - mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8); - // Modify 4B - uint32_t* cache_32_w = (uint32_t*) cache_w; - cache_32_w[offset/4] = val; - // Write back 8B - mram_write(cache_w, (__mram_ptr void*)ptr_block_m, 8); +static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx, + uint64_t *cache_w) +{ + // Load 8B + uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t); + uint32_t offset = ((uint32_t) ptr_idx_m) % 8; + uint32_t ptr_block_m = ptr_idx_m - offset; + mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8); + // Modify 4B + uint32_t *cache_32_w = (uint32_t *) cache_w; + cache_32_w[offset / 4] = val; + // Write back 8B + mram_write(cache_w, (__mram_ptr void *)ptr_block_m, 8); } #endif - diff --git a/BFS/dpu/task.c b/BFS/dpu/task.c index 43a2d0f..5275047 100644 --- a/BFS/dpu/task.c +++ b/BFS/dpu/task.c @@ -12,7 +12,7 @@ #include <perfcounter.h> #include "dpu-utils.h" -#include "../support/common.h" +#include "common.h" BARRIER_INIT(my_barrier, NR_TASKLETS); @@ -20,127 +20,155 @@ BARRIER_INIT(bfsBarrier, NR_TASKLETS); MUTEX_INIT(nextFrontierMutex); // main -int main() { - - if(me() == 0) { - mem_reset(); // Reset the heap - } - // Barrier - barrier_wait(&my_barrier); - - // Load parameters - uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER; - struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))); - mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))); - - // Extract parameters - uint32_t numGlobalNodes = params_w->numNodes; - uint32_t startNodeIdx = params_w->dpuStartNodeIdx; - uint32_t numNodes = params_w->dpuNumNodes; - uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset; - uint32_t level = params_w->level; - uint32_t nodePtrs_m = params_w->dpuNodePtrs_m; - uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m; - uint32_t nodeLevel_m = params_w->dpuNodeLevel_m; - uint32_t visited_m = params_w->dpuVisited_m; - uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m; - uint32_t nextFrontier_m = params_w->dpuNextFrontier_m; - - if(numNodes > 0) { - - // Sanity check - if(me() == 0) { - if(numGlobalNodes%64 != 0) { - //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!"); - } - if(startNodeIdx%64 != 0 || numNodes%64 != 0) { - //PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!"); - } - } - - // Allocate WRAM cache for each tasklet to use throughout - uint64_t* cache_w = mem_alloc(sizeof(uint64_t)); - - // Update current frontier and visited list based on the next frontier from the previous iteration - for(uint32_t nodeTileIdx = me(); nodeTileIdx < numGlobalNodes/64; nodeTileIdx += NR_TASKLETS) { - - // Get the next frontier tile from MRAM - uint64_t nextFrontierTile = load8B(nextFrontier_m, nodeTileIdx, cache_w); - - // Process next frontier tile if it is not empty - if(nextFrontierTile) { - - // Mark everything that was previously added to the next frontier as visited - uint64_t visitedTile = load8B(visited_m, nodeTileIdx, cache_w); - visitedTile |= nextFrontierTile; - store8B(visitedTile, visited_m, nodeTileIdx, cache_w); - - // Clear the next frontier - store8B(0, nextFrontier_m, nodeTileIdx, cache_w); - - } - - // Extract the current frontier from the previous next frontier and update node levels - uint32_t startTileIdx = startNodeIdx/64; - uint32_t numTiles = numNodes/64; - if(startTileIdx <= nodeTileIdx && nodeTileIdx < startTileIdx + numTiles) { - - // Update current frontier - store8B(nextFrontierTile, currentFrontier_m, nodeTileIdx - startTileIdx, cache_w); - - // Update node levels - if(nextFrontierTile) { - for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) { - if(isSet(nextFrontierTile, node%64)) { - store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks - } - } - } - } - - } - - // Wait until all tasklets have updated the current frontier - barrier_wait(&bfsBarrier); - - // Identify tasklet's nodes - uint32_t numNodesPerTasklet = (numNodes + NR_TASKLETS - 1)/NR_TASKLETS; - uint32_t taskletNodesStart = me()*numNodesPerTasklet; - uint32_t taskletNumNodes; - if(taskletNodesStart > numNodes) { - taskletNumNodes = 0; - } else if(taskletNodesStart + numNodesPerTasklet > numNodes) { - taskletNumNodes = numNodes - taskletNodesStart; - } else { - taskletNumNodes = numNodesPerTasklet; - } - - // Visit neighbors of the current frontier - mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex); - for(uint32_t node = taskletNodesStart; node < taskletNodesStart + taskletNumNodes; ++node) { - uint32_t nodeTileIdx = node/64; - uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile - if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier - // Visit its neighbors - uint32_t nodePtr = load4B(nodePtrs_m, node, cache_w) - nodePtrsOffset; - uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr - for(uint32_t i = nodePtr; i < nextNodePtr; ++i) { - uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader - uint32_t neighborTileIdx = neighbor/64; - uint64_t visitedTile = load8B(visited_m, neighborTileIdx, cache_w); - if(!isSet(visitedTile, neighbor%64)) { // Neighbor not previously visited - // Add neighbor to next frontier - mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention - uint64_t nextFrontierTile = load8B(nextFrontier_m, neighborTileIdx, cache_w); - setBit(nextFrontierTile, neighbor%64); - store8B(nextFrontierTile, nextFrontier_m, neighborTileIdx, cache_w); - mutex_unlock(mutexID); - } - } - } - } - - } - - return 0; +int main() +{ + + if (me() == 0) { + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + + // Load parameters + uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER; + struct DPUParams *params_w = + (struct DPUParams *) + mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))); + mram_read((__mram_ptr void const *)params_m, params_w, + ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))); + + // Extract parameters + uint32_t numGlobalNodes = params_w->numNodes; + uint32_t startNodeIdx = params_w->dpuStartNodeIdx; + uint32_t numNodes = params_w->dpuNumNodes; + uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset; + uint32_t level = params_w->level; + uint32_t nodePtrs_m = params_w->dpuNodePtrs_m; + uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m; + uint32_t nodeLevel_m = params_w->dpuNodeLevel_m; + uint32_t visited_m = params_w->dpuVisited_m; + uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m; + uint32_t nextFrontier_m = params_w->dpuNextFrontier_m; + + if (numNodes > 0) { + + // Sanity check + if (me() == 0) { + if (numGlobalNodes % 64 != 0) { + //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!"); + } + if (startNodeIdx % 64 != 0 || numNodes % 64 != 0) { + //PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!"); + } + } + // Allocate WRAM cache for each tasklet to use throughout + uint64_t *cache_w = mem_alloc(sizeof(uint64_t)); + + // Update current frontier and visited list based on the next frontier from the previous iteration + for (uint32_t nodeTileIdx = me(); + nodeTileIdx < numGlobalNodes / 64; + nodeTileIdx += NR_TASKLETS) { + + // Get the next frontier tile from MRAM + uint64_t nextFrontierTile = + load8B(nextFrontier_m, nodeTileIdx, cache_w); + + // Process next frontier tile if it is not empty + if (nextFrontierTile) { + + // Mark everything that was previously added to the next frontier as visited + uint64_t visitedTile = + load8B(visited_m, nodeTileIdx, cache_w); + visitedTile |= nextFrontierTile; + store8B(visitedTile, visited_m, nodeTileIdx, + cache_w); + + // Clear the next frontier + store8B(0, nextFrontier_m, nodeTileIdx, + cache_w); + + } + // Extract the current frontier from the previous next frontier and update node levels + uint32_t startTileIdx = startNodeIdx / 64; + uint32_t numTiles = numNodes / 64; + if (startTileIdx <= nodeTileIdx + && nodeTileIdx < startTileIdx + numTiles) { + + // Update current frontier + store8B(nextFrontierTile, currentFrontier_m, + nodeTileIdx - startTileIdx, cache_w); + + // Update node levels + if (nextFrontierTile) { + for (uint32_t node = nodeTileIdx * 64; + node < (nodeTileIdx + 1) * 64; + ++node) { + if (isSet + (nextFrontierTile, + node % 64)) { + store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks + } + } + } + } + + } + + // Wait until all tasklets have updated the current frontier + barrier_wait(&bfsBarrier); + + // Identify tasklet's nodes + uint32_t numNodesPerTasklet = + (numNodes + NR_TASKLETS - 1) / NR_TASKLETS; + uint32_t taskletNodesStart = me() * numNodesPerTasklet; + uint32_t taskletNumNodes; + if (taskletNodesStart > numNodes) { + taskletNumNodes = 0; + } else if (taskletNodesStart + numNodesPerTasklet > numNodes) { + taskletNumNodes = numNodes - taskletNodesStart; + } else { + taskletNumNodes = numNodesPerTasklet; + } + + // Visit neighbors of the current frontier + mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex); + for (uint32_t node = taskletNodesStart; + node < taskletNodesStart + taskletNumNodes; ++node) { + uint32_t nodeTileIdx = node / 64; + uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile + if (isSet(currentFrontierTile, node % 64)) { // If the node is in the current frontier + // Visit its neighbors + uint32_t nodePtr = + load4B(nodePtrs_m, node, + cache_w) - nodePtrsOffset; + uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr + for (uint32_t i = nodePtr; i < nextNodePtr; ++i) { + uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader + uint32_t neighborTileIdx = + neighbor / 64; + uint64_t visitedTile = + load8B(visited_m, neighborTileIdx, + cache_w); + if (!isSet(visitedTile, neighbor % 64)) { // Neighbor not previously visited + // Add neighbor to next frontier + mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention + uint64_t nextFrontierTile = + load8B(nextFrontier_m, + neighborTileIdx, + cache_w); + setBit(nextFrontierTile, + neighbor % 64); + store8B(nextFrontierTile, + nextFrontier_m, + neighborTileIdx, + cache_w); + mutex_unlock(mutexID); + } + } + } + } + + } + + return 0; } diff --git a/BFS/host/app.c b/BFS/host/app.c index 54b9cdc..4431193 100644 --- a/BFS/host/app.c +++ b/BFS/host/app.c @@ -3,9 +3,24 @@ * BFS Host Application Source File * */ +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> +#ifndef ENERGY +#define ENERGY 0 +#endif +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #include <assert.h> #include <getopt.h> #include <stdio.h> @@ -14,321 +29,436 @@ #include <unistd.h> #include "mram-management.h" -#include "../support/common.h" -#include "../support/graph.h" -#include "../support/params.h" -#include "../support/timer.h" -#include "../support/utils.h" +#include "common.h" +#include "graph.h" +#include "params.h" +#include "timer.h" +#include "utils.h" -#ifndef ENERGY -#define ENERGY 0 +#define DPU_BINARY "./bin/dpu_code" + +// Main of the Host Application +int main(int argc, char **argv) +{ + + // Process parameters + struct Params p = input_params(argc, argv); + + // Timer and profiling + Timer timer; +#if ENERGY + struct dpu_probe_t probe; + DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); + double tenergy = 0; +#endif + + // Allocate DPUs and load binary + struct dpu_set_t dpu_set, dpu; + uint32_t numDPUs, numRanks; + +#if WITH_ALLOC_OVERHEAD + startTimer(&timer, 0, 0); +#endif + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); +#if WITH_ALLOC_OVERHEAD + stopTimer(&timer, 0); +#else + zeroTimer(&timer, 0); +#endif + +#if WITH_LOAD_OVERHEAD + startTimer(&timer, 1, 0); +#endif + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); +#if WITH_LOAD_OVERHEAD + stopTimer(&timer, 0); +#else + zeroTimer(&timer, 1); +#endif + + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks)); + assert(NR_DPUS == numDPUs); + PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs); + + // Initialize BFS data structures + PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName); + struct COOGraph cooGraph = readCOOGraph(p.fileName); + PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", + cooGraph.numNodes, cooGraph.numEdges); + struct CSRGraph csrGraph = coo2csr(cooGraph); + uint32_t numNodes = csrGraph.numNodes; + uint32_t *nodePtrs = csrGraph.nodePtrs; + uint32_t *neighborIdxs = csrGraph.neighborIdxs; + uint32_t *nodeLevel = (uint32_t*)calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) + uint64_t *visited = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node + uint64_t *currentFrontier = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node + uint64_t *nextFrontier = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node + setBit(nextFrontier[0], 0); // Initialize frontier to first node + uint32_t level = 1; + + // Partition data structure across DPUs + uint32_t numNodesPerDPU = + ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1) / numDPUs + 1); + PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU", + numNodesPerDPU); + struct DPUParams dpuParams[numDPUs]; + uint32_t dpuParams_m[numDPUs]; + unsigned int dpuIdx = 0; + unsigned int t0ini = 0; + unsigned int t1ini = 0; + unsigned int t2ini = 0; + unsigned int t3ini = 0; + DPU_FOREACH(dpu_set, dpu) { + + // Allocate parameters + struct mram_heap_allocator_t allocator; + init_allocator(&allocator); + dpuParams_m[dpuIdx] = + mram_heap_alloc(&allocator, sizeof(struct DPUParams)); + + // Find DPU's nodes + uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU; + uint32_t dpuNumNodes; + if (dpuStartNodeIdx > numNodes) { + dpuNumNodes = 0; + } else if (dpuStartNodeIdx + numNodesPerDPU > numNodes) { + dpuNumNodes = numNodes - dpuStartNodeIdx; + } else { + dpuNumNodes = numNodesPerDPU; + } + dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes; + PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx); + PRINT_INFO(p.verbosity >= 2, " Receives %u nodes", + dpuNumNodes); + + // Partition edges and copy data + if (dpuNumNodes > 0) { + + // Find DPU's CSR graph partition + uint32_t *dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx]; + uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0]; + uint32_t *dpuNeighborIdxs_h = + neighborIdxs + dpuNodePtrsOffset; + uint32_t dpuNumNeighbors = + dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset; + uint32_t *dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx]; + + // Allocate MRAM + uint32_t dpuNodePtrs_m = + mram_heap_alloc(&allocator, + (dpuNumNodes + + 1) * sizeof(uint32_t)); + uint32_t dpuNeighborIdxs_m = + mram_heap_alloc(&allocator, + dpuNumNeighbors * sizeof(uint32_t)); + uint32_t dpuNodeLevel_m = + mram_heap_alloc(&allocator, + dpuNumNodes * sizeof(uint32_t)); + uint32_t dpuVisited_m = + mram_heap_alloc(&allocator, + numNodes / 64 * sizeof(uint64_t)); + uint32_t dpuCurrentFrontier_m = + mram_heap_alloc(&allocator, + dpuNumNodes / 64 * + sizeof(uint64_t)); + uint32_t dpuNextFrontier_m = + mram_heap_alloc(&allocator, + numNodes / 64 * sizeof(uint64_t)); + PRINT_INFO(p.verbosity >= 2, + " Total memory allocated is %d bytes", + allocator.totalAllocated); + + // Set up DPU parameters + dpuParams[dpuIdx].numNodes = numNodes; + dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx; + dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset; + dpuParams[dpuIdx].level = level; + dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m; + dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m; + dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m; + dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m; + dpuParams[dpuIdx].dpuCurrentFrontier_m = + dpuCurrentFrontier_m; + dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m; + + // Send data to DPU + PRINT_INFO(p.verbosity >= 2, + " Copying data to DPU"); + startTimer(&timer, 2, t0ini++); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuNodePtrs_m, (uint8_t *) dpuNodePtrs_h, + ROUND_UP_TO_MULTIPLE_OF_8((dpuNumNodes + 1) * sizeof(uint32_t)))); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuNeighborIdxs_m, (uint8_t *) dpuNeighborIdxs_h, + ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNeighbors * sizeof(uint32_t)))); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuNodeLevel_m, (uint8_t *) dpuNodeLevel_h, + ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNodes * sizeof(uint32_t)))); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuVisited_m, (uint8_t *) visited, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuNextFrontier_m, (uint8_t *) nextFrontier, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); + + // NOTE: No need to copy current frontier because it is written before being read + stopTimer(&timer, 2); + //loadTime += getElapsedTime(timer); + + } + // Send parameters to DPU + PRINT_INFO(p.verbosity >= 2, + " Copying parameters to DPU"); + startTimer(&timer, 2, t1ini++); + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams_m[dpuIdx], (uint8_t *) & dpuParams[dpuIdx], + ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)))); + stopTimer(&timer, 2); + //loadTime += getElapsedTime(timer); + + ++dpuIdx; + + } + + // Iterate until next frontier is empty + uint32_t nextFrontierEmpty = 0; + while (!nextFrontierEmpty) { + + PRINT_INFO(p.verbosity >= 1, + "Processing current frontier for level %u", level); + +#if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); #endif + // Run all DPUs + PRINT_INFO(p.verbosity >= 1, " Booting DPUs"); + startTimer(&timer, 3, t2ini++); + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + stopTimer(&timer, 3); + //dpuTime += getElapsedTime(timer); #if ENERGY -#include <dpu_probe.h> + DPU_ASSERT(dpu_probe_stop(&probe)); + double energy; + DPU_ASSERT(dpu_probe_get + (&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); + tenergy += energy; #endif -#define DPU_BINARY "./bin/dpu_code" + // Copy back next frontier from all DPUs and compute their union as the current frontier + startTimer(&timer, 4, t3ini++); + dpuIdx = 0; + DPU_FOREACH(dpu_set, dpu) { + uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; + if (dpuNumNodes > 0) { + if (dpuIdx == 0) { + DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams[dpuIdx].dpuNextFrontier_m, + (uint8_t *) currentFrontier, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); + } else { + DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams[dpuIdx].dpuNextFrontier_m, + (uint8_t *) nextFrontier, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); + for (uint32_t i = 0; i < numNodes / 64; + ++i) { + currentFrontier[i] |= + nextFrontier[i]; + } + } + ++dpuIdx; + } + } + + // Check if the next frontier is empty, and copy data to DPU if not empty + nextFrontierEmpty = 1; + for (uint32_t i = 0; i < numNodes / 64; ++i) { + if (currentFrontier[i]) { + nextFrontierEmpty = 0; + break; + } + } + if (!nextFrontierEmpty) { + ++level; + dpuIdx = 0; + DPU_FOREACH(dpu_set, dpu) { + uint32_t dpuNumNodes = + dpuParams[dpuIdx].dpuNumNodes; + if (dpuNumNodes > 0) { + // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier) + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams[dpuIdx].dpuNextFrontier_m, + (uint8_t *) currentFrontier, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); + // Copy new level to DPU + dpuParams[dpuIdx].level = level; + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams_m[dpuIdx], (uint8_t *) &dpuParams[dpuIdx], + ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)))); + ++dpuIdx; + } + } + } + stopTimer(&timer, 4); + //hostTime += getElapsedTime(timer); + + } + + // Copy back node levels + PRINT_INFO(p.verbosity >= 1, "Copying back the result"); + startTimer(&timer, 5, 0); + dpuIdx = 0; + DPU_FOREACH(dpu_set, dpu) { + uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; + if (dpuNumNodes > 0) { + uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU; + DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams[dpuIdx].dpuNodeLevel_m, + (uint8_t *) (nodeLevel + dpuStartNodeIdx), + ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNodes * sizeof(float)))); + } + ++dpuIdx; + } + stopTimer(&timer, 5); + //retrieveTime += getElapsedTime(timer); + //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3); + + // Calculating result on CPU + PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); + uint32_t *nodeLevelReference = (uint32_t*) calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) + memset(nextFrontier, 0, numNodes / 64 * sizeof(uint64_t)); + setBit(nextFrontier[0], 0); // Initialize frontier to first node + nextFrontierEmpty = 0; + level = 1; + startTimer(&timer, 6, 0); + while (!nextFrontierEmpty) { + // Update current frontier and visited list based on the next frontier from the previous iteration + for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64; + ++nodeTileIdx) { + uint64_t nextFrontierTile = nextFrontier[nodeTileIdx]; + currentFrontier[nodeTileIdx] = nextFrontierTile; + if (nextFrontierTile) { + visited[nodeTileIdx] |= nextFrontierTile; + nextFrontier[nodeTileIdx] = 0; + for (uint32_t node = nodeTileIdx * 64; + node < (nodeTileIdx + 1) * 64; ++node) { + if (isSet(nextFrontierTile, node % 64)) { + nodeLevelReference[node] = + level; + } + } + } + } + // Visit neighbors of the current frontier + nextFrontierEmpty = 1; + for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64; + ++nodeTileIdx) { + uint64_t currentFrontierTile = + currentFrontier[nodeTileIdx]; + if (currentFrontierTile) { + for (uint32_t node = nodeTileIdx * 64; + node < (nodeTileIdx + 1) * 64; ++node) { + if (isSet(currentFrontierTile, node % 64)) { // If the node is in the current frontier + // Visit its neighbors + uint32_t nodePtr = + nodePtrs[node]; + uint32_t nextNodePtr = + nodePtrs[node + 1]; + for (uint32_t i = nodePtr; + i < nextNodePtr; ++i) { + uint32_t neighbor = + neighborIdxs[i]; + if (!isSet(visited[neighbor / 64], neighbor % 64)) { // Neighbor not previously visited + // Add neighbor to next frontier + setBit + (nextFrontier + [neighbor / + 64], + neighbor % + 64); + nextFrontierEmpty + = 0; + } + } + } + } + } + } + ++level; + } + stopTimer(&timer, 6); + +#if WITH_FREE_OVERHEAD + startTimer(&timer, 7); +#endif + DPU_ASSERT(dpu_free(dpu_set)); +#if WITH_FREE_OVERHEAD + stopTimer(&timer, 7); +#else + zeroTimer(&timer, 7); +#endif -// Main of the Host Application -int main(int argc, char** argv) { - - // Process parameters - struct Params p = input_params(argc, argv); - - // Timer and profiling - Timer timer; - #if ENERGY - struct dpu_probe_t probe; - DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); - double tenergy=0; - #endif - - // Allocate DPUs and load binary - struct dpu_set_t dpu_set, dpu; - uint32_t numDPUs; - DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs)); - PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs); - - // Initialize BFS data structures - PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName); - struct COOGraph cooGraph = readCOOGraph(p.fileName); - PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges); - struct CSRGraph csrGraph = coo2csr(cooGraph); - uint32_t numNodes = csrGraph.numNodes; - uint32_t* nodePtrs = csrGraph.nodePtrs; - uint32_t* neighborIdxs = csrGraph.neighborIdxs; - uint32_t* nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) - uint64_t* visited = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node - uint64_t* currentFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node - uint64_t* nextFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node - setBit(nextFrontier[0], 0); // Initialize frontier to first node - uint32_t level = 1; - - // Partition data structure across DPUs - uint32_t numNodesPerDPU = ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1)/numDPUs + 1); - PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU", numNodesPerDPU); - struct DPUParams dpuParams[numDPUs]; - uint32_t dpuParams_m[numDPUs]; - unsigned int dpuIdx = 0; - unsigned int t0ini = 0; - unsigned int t1ini = 0; - unsigned int t2ini = 0; - unsigned int t3ini = 0; - DPU_FOREACH (dpu_set, dpu) { - - // Allocate parameters - struct mram_heap_allocator_t allocator; - init_allocator(&allocator); - dpuParams_m[dpuIdx] = mram_heap_alloc(&allocator, sizeof(struct DPUParams)); - - // Find DPU's nodes - uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU; - uint32_t dpuNumNodes; - if(dpuStartNodeIdx > numNodes) { - dpuNumNodes = 0; - } else if(dpuStartNodeIdx + numNodesPerDPU > numNodes) { - dpuNumNodes = numNodes - dpuStartNodeIdx; - } else { - dpuNumNodes = numNodesPerDPU; - } - dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes; - PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx); - PRINT_INFO(p.verbosity >= 2, " Receives %u nodes", dpuNumNodes); - - // Partition edges and copy data - if(dpuNumNodes > 0) { - - // Find DPU's CSR graph partition - uint32_t* dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx]; - uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0]; - uint32_t* dpuNeighborIdxs_h = neighborIdxs + dpuNodePtrsOffset; - uint32_t dpuNumNeighbors = dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset; - uint32_t* dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx]; - - // Allocate MRAM - uint32_t dpuNodePtrs_m = mram_heap_alloc(&allocator, (dpuNumNodes + 1)*sizeof(uint32_t)); - uint32_t dpuNeighborIdxs_m = mram_heap_alloc(&allocator, dpuNumNeighbors*sizeof(uint32_t)); - uint32_t dpuNodeLevel_m = mram_heap_alloc(&allocator, dpuNumNodes*sizeof(uint32_t)); - uint32_t dpuVisited_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t)); - uint32_t dpuCurrentFrontier_m = mram_heap_alloc(&allocator, dpuNumNodes/64*sizeof(uint64_t)); - uint32_t dpuNextFrontier_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t)); - PRINT_INFO(p.verbosity >= 2, " Total memory allocated is %d bytes", allocator.totalAllocated); - - // Set up DPU parameters - dpuParams[dpuIdx].numNodes = numNodes; - dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx; - dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset; - dpuParams[dpuIdx].level = level; - dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m; - dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m; - dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m; - dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m; - dpuParams[dpuIdx].dpuCurrentFrontier_m = dpuCurrentFrontier_m; - dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m; - - // Send data to DPU - PRINT_INFO(p.verbosity >= 2, " Copying data to DPU"); - startTimer(&timer, 0, t0ini++); - copyToDPU(dpu, (uint8_t*)dpuNodePtrs_h, dpuNodePtrs_m, (dpuNumNodes + 1)*sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t*)dpuNeighborIdxs_h, dpuNeighborIdxs_m, dpuNumNeighbors*sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t*)dpuNodeLevel_h, dpuNodeLevel_m, dpuNumNodes*sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t*)visited, dpuVisited_m, numNodes/64*sizeof(uint64_t)); - copyToDPU(dpu, (uint8_t*)nextFrontier, dpuNextFrontier_m, numNodes/64*sizeof(uint64_t)); - // NOTE: No need to copy current frontier because it is written before being read - stopTimer(&timer, 0); - //loadTime += getElapsedTime(timer); - - } - - // Send parameters to DPU - PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU"); - startTimer(&timer, 1, t1ini++); - copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams)); - stopTimer(&timer, 1); - //loadTime += getElapsedTime(timer); - - ++dpuIdx; - - } - - // Iterate until next frontier is empty - uint32_t nextFrontierEmpty = 0; - while(!nextFrontierEmpty) { - - PRINT_INFO(p.verbosity >= 1, "Processing current frontier for level %u", level); - - #if ENERGY - DPU_ASSERT(dpu_probe_start(&probe)); - #endif - // Run all DPUs - PRINT_INFO(p.verbosity >= 1, " Booting DPUs"); - startTimer(&timer, 2, t2ini++); - DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - stopTimer(&timer, 2); - //dpuTime += getElapsedTime(timer); - #if ENERGY - DPU_ASSERT(dpu_probe_stop(&probe)); - double energy; - DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); - tenergy += energy; - #endif - - - - // Copy back next frontier from all DPUs and compute their union as the current frontier - startTimer(&timer, 3, t3ini++); - dpuIdx = 0; - DPU_FOREACH (dpu_set, dpu) { - uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; - if(dpuNumNodes > 0) { - if(dpuIdx == 0) { - copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)currentFrontier, numNodes/64*sizeof(uint64_t)); - } else { - copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)nextFrontier, numNodes/64*sizeof(uint64_t)); - for(uint32_t i = 0; i < numNodes/64; ++i) { - currentFrontier[i] |= nextFrontier[i]; - } - } - ++dpuIdx; - } - } - - // Check if the next frontier is empty, and copy data to DPU if not empty - nextFrontierEmpty = 1; - for(uint32_t i = 0; i < numNodes/64; ++i) { - if(currentFrontier[i]) { - nextFrontierEmpty = 0; - break; - } - } - if(!nextFrontierEmpty) { - ++level; - dpuIdx = 0; - DPU_FOREACH (dpu_set, dpu) { - uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; - if(dpuNumNodes > 0) { - // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier) - copyToDPU(dpu, (uint8_t*)currentFrontier, dpuParams[dpuIdx].dpuNextFrontier_m, numNodes/64*sizeof(uint64_t)); - // Copy new level to DPU - dpuParams[dpuIdx].level = level; - copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams)); - ++dpuIdx; - } - } - } - stopTimer(&timer, 3); - //hostTime += getElapsedTime(timer); - - } - - // Copy back node levels - PRINT_INFO(p.verbosity >= 1, "Copying back the result"); - startTimer(&timer, 4, 0); - dpuIdx = 0; - DPU_FOREACH (dpu_set, dpu) { - uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; - if(dpuNumNodes > 0) { - uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU; - copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m, (uint8_t*)(nodeLevel + dpuStartNodeIdx), dpuNumNodes*sizeof(float)); - } - ++dpuIdx; - } - stopTimer(&timer, 4); - //retrieveTime += getElapsedTime(timer); - //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3); - - // Calculating result on CPU - PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); - uint32_t* nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) - memset(nextFrontier, 0, numNodes/64*sizeof(uint64_t)); - setBit(nextFrontier[0], 0); // Initialize frontier to first node - nextFrontierEmpty = 0; - level = 1; - while(!nextFrontierEmpty) { - // Update current frontier and visited list based on the next frontier from the previous iteration - for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) { - uint64_t nextFrontierTile = nextFrontier[nodeTileIdx]; - currentFrontier[nodeTileIdx] = nextFrontierTile; - if(nextFrontierTile) { - visited[nodeTileIdx] |= nextFrontierTile; - nextFrontier[nodeTileIdx] = 0; - for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) { - if(isSet(nextFrontierTile, node%64)) { - nodeLevelReference[node] = level; - } - } - } - } - // Visit neighbors of the current frontier - nextFrontierEmpty = 1; - for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) { - uint64_t currentFrontierTile = currentFrontier[nodeTileIdx]; - if(currentFrontierTile) { - for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) { - if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier - // Visit its neighbors - uint32_t nodePtr = nodePtrs[node]; - uint32_t nextNodePtr = nodePtrs[node + 1]; - for(uint32_t i = nodePtr; i < nextNodePtr; ++i) { - uint32_t neighbor = neighborIdxs[i]; - if(!isSet(visited[neighbor/64], neighbor%64)) { // Neighbor not previously visited - // Add neighbor to next frontier - setBit(nextFrontier[neighbor/64], neighbor%64); - nextFrontierEmpty = 0; - } - } - } - } - } - } - ++level; - } - - // Verify the result - PRINT_INFO(p.verbosity >= 1, "Verifying the result"); - int status = 1; - for(uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) { - if(nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) { - PRINT_ERROR("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", nodeIdx, nodeLevelReference[nodeIdx], nodeLevel[nodeIdx]); - status = 0; - } - } - - if (status) { - printf("[::] BFS NMC | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d " - "| throughput_pim_MBps=%f throughput_MBps=%f", - numDPUs, NR_TASKLETS, "uint32_t", numNodes, - numNodes * sizeof(uint32_t) / (timer.time[2] + timer.time[3]), - numNodes * sizeof(uint32_t) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4])); - printf(" throughput_pim_MOpps=%f throughput_MOpps=%f", - numNodes / (timer.time[2] + timer.time[3]), - numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4])); - printAll(&timer, 4); - } - - // Display DPU Logs - if(p.verbosity >= 2) { - PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:"); - dpuIdx = 0; - DPU_FOREACH (dpu_set, dpu) { - PRINT("DPU %u:", dpuIdx); - DPU_ASSERT(dpu_log_read(dpu, stdout)); - ++dpuIdx; - } - } - - // Deallocate data structures - freeCOOGraph(cooGraph); - freeCSRGraph(csrGraph); - free(nodeLevel); - free(visited); - free(currentFrontier); - free(nextFrontier); - free(nodeLevelReference); - - return 0; + // Verify the result + PRINT_INFO(p.verbosity >= 1, "Verifying the result"); + int status = 1; + for (uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) { + if (nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) { + PRINT_ERROR + ("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", + nodeIdx, nodeLevelReference[nodeIdx], + nodeLevel[nodeIdx]); + status = 0; + } + } + + if (status) { + dfatool_printf + ("[::] BFS-UMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d " + "| throughput_pim_MBps=%f throughput_MBps=%f", numDPUs, numRanks, + NR_TASKLETS, "uint32_t", numNodes, + numNodes * sizeof(uint32_t) / (timer.time[2] + + timer.time[3]), + numNodes * sizeof(uint32_t) / (timer.time[0] + + timer.time[1] + + timer.time[2] + + timer.time[3] + + timer.time[4])); + dfatool_printf(" throughput_pim_MOpps=%f throughput_MOpps=%f", + numNodes / (timer.time[2] + timer.time[3]), + numNodes / (timer.time[0] + timer.time[1] + + timer.time[2] + timer.time[3] + + timer.time[4])); + dfatool_printf + (" latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_cpu_us=%f latency_free_us=%f\n", + timer.time[0], timer.time[1], timer.time[2], timer.time[3], + timer.time[4], timer.time[5], timer.time[6], + timer.time[7]); + } + // Display DPU Logs + if (p.verbosity >= 2) { + PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:"); + dpuIdx = 0; + DPU_FOREACH(dpu_set, dpu) { + PRINT("DPU %u:", dpuIdx); + DPU_ASSERT(dpu_log_read(dpu, stdout)); + ++dpuIdx; + } + } + // Deallocate data structures + freeCOOGraph(cooGraph); + freeCSRGraph(csrGraph); + free(nodeLevel); + free(visited); + free(currentFrontier); + free(nextFrontier); + free(nodeLevelReference); + + return 0; } - diff --git a/BFS/host/mram-management.h b/BFS/host/mram-management.h index 627dfde..a953d6a 100644 --- a/BFS/host/mram-management.h +++ b/BFS/host/mram-management.h @@ -1,37 +1,29 @@ +#pragma once -#ifndef _MRAM_MANAGEMENT_H_ -#define _MRAM_MANAGEMENT_H_ +#include "common.h" +#include "utils.h" -#include "../support/common.h" -#include "../support/utils.h" - -#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB +#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB struct mram_heap_allocator_t { - uint32_t totalAllocated; + uint32_t totalAllocated; }; -static void init_allocator(struct mram_heap_allocator_t* allocator) { - allocator->totalAllocated = 0; -} - -static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) { - uint32_t ret = allocator->totalAllocated; - allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size); - if(allocator->totalAllocated > DPU_CAPACITY) { - PRINT_ERROR(" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY); - exit(0); - } - return ret; +static void init_allocator(struct mram_heap_allocator_t *allocator) +{ + allocator->totalAllocated = 0; } -static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) { - DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size))); +static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator, + uint32_t size) +{ + uint32_t ret = allocator->totalAllocated; + allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size); + if (allocator->totalAllocated > DPU_CAPACITY) { + PRINT_ERROR + (" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", + allocator->totalAllocated, DPU_CAPACITY); + exit(0); + } + return ret; } - -static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) { - DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size))); -} - -#endif - diff --git a/BFS/include/common.h b/BFS/include/common.h new file mode 100644 index 0000000..5f2aa0d --- /dev/null +++ b/BFS/include/common.h @@ -0,0 +1,25 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#define ROUND_UP_TO_MULTIPLE_OF_2(x) ((((x) + 1)/2)*2) +#define ROUND_UP_TO_MULTIPLE_OF_8(x) ((((x) + 7)/8)*8) +#define ROUND_UP_TO_MULTIPLE_OF_64(x) ((((x) + 63)/64)*64) + +#define setBit(val, idx) (val) |= (1 << (idx)) +#define isSet(val, idx) ((val) & (1 << (idx))) + +struct DPUParams { + uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */ + uint32_t numNodes; /* Total number of nodes in the graph */ + uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU */ + uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */ + uint32_t level; /* The current BFS level */ + uint32_t dpuNodePtrs_m; + uint32_t dpuNeighborIdxs_m; + uint32_t dpuNodeLevel_m; + uint32_t dpuVisited_m; + uint32_t dpuCurrentFrontier_m; + uint32_t dpuNextFrontier_m; +}; + +#endif diff --git a/BFS/include/dfatool_host.ah b/BFS/include/dfatool_host.ah new file mode 100644 index 0000000..b2677e1 --- /dev/null +++ b/BFS/include/dfatool_host.ah @@ -0,0 +1,30 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + + unsigned long input_size; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(uint32_t); + } + + advice call("% input_params(...)"): after() { + printf("[>>] BFS | n_dpus=%u\n", NR_DPUS); + } + + advice call("% coo2csr(...)") : after() { + struct CSRGraph *g = tjp->result(); + input_size = g->numNodes; + printf("[--] BFS | n_dpus=%u n_nodes=%lu\n", NR_DPUS, input_size); + } + + advice execution("% main(...)") : after() { + printf("[<<] BFS | n_dpus=%u n_nodes=%lu\n", NR_DPUS, input_size); + } +}; diff --git a/BFS/include/graph.h b/BFS/include/graph.h new file mode 100644 index 0000000..2a19f67 --- /dev/null +++ b/BFS/include/graph.h @@ -0,0 +1,133 @@ + +#ifndef _GRAPH_H_ +#define _GRAPH_H_ + +#include <assert.h> +#include <stdio.h> + +#include "common.h" +#include "utils.h" + +struct COOGraph { + uint32_t numNodes; + uint32_t numEdges; + uint32_t *nodeIdxs; + uint32_t *neighborIdxs; +}; + +struct CSRGraph { + uint32_t numNodes; + uint32_t numEdges; + uint32_t *nodePtrs; + uint32_t *neighborIdxs; +}; + +static struct COOGraph readCOOGraph(const char *fileName) +{ + + struct COOGraph cooGraph; + + // Initialize fields + FILE *fp = fopen(fileName, "r"); + uint32_t numNodes, numCols; + assert(fscanf(fp, "%u", &numNodes)); + assert(fscanf(fp, "%u", &numCols)); + if (numNodes == numCols) { + cooGraph.numNodes = numNodes; + } else { + PRINT_WARNING + (" Adjacency matrix is not square. Padding matrix to be square."); + cooGraph.numNodes = (numNodes > numCols) ? numNodes : numCols; + } + if (cooGraph.numNodes % 64 != 0) { + PRINT_WARNING + (" Adjacency matrix dimension is %u which is not a multiple of 64 nodes.", + cooGraph.numNodes); + cooGraph.numNodes += (64 - cooGraph.numNodes % 64); + PRINT_WARNING + (" Padding to %u which is a multiple of 64 nodes.", + cooGraph.numNodes); + } + assert(fscanf(fp, "%u", &cooGraph.numEdges)); + cooGraph.nodeIdxs = + (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t)); + cooGraph.neighborIdxs = + (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t)); + + // Read the neighborIdxs + for (uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) { + uint32_t nodeIdx; + assert(fscanf(fp, "%u", &nodeIdx)); + cooGraph.nodeIdxs[edgeIdx] = nodeIdx; + uint32_t neighborIdx; + assert(fscanf(fp, "%u", &neighborIdx)); + cooGraph.neighborIdxs[edgeIdx] = neighborIdx; + } + + return cooGraph; + +} + +static void freeCOOGraph(struct COOGraph cooGraph) +{ + free(cooGraph.nodeIdxs); + free(cooGraph.neighborIdxs); +} + +static struct CSRGraph coo2csr(struct COOGraph cooGraph) +{ + + struct CSRGraph csrGraph; + + // Initialize fields + csrGraph.numNodes = cooGraph.numNodes; + csrGraph.numEdges = cooGraph.numEdges; + csrGraph.nodePtrs = + (uint32_t *) + calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1), + sizeof(uint32_t)); + csrGraph.neighborIdxs = + (uint32_t *) + malloc(ROUND_UP_TO_MULTIPLE_OF_8 + (csrGraph.numEdges * sizeof(uint32_t))); + + // Histogram nodeIdxs + for (uint32_t i = 0; i < cooGraph.numEdges; ++i) { + uint32_t nodeIdx = cooGraph.nodeIdxs[i]; + csrGraph.nodePtrs[nodeIdx]++; + } + + // Prefix sum nodePtrs + uint32_t sumBeforeNextNode = 0; + for (uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) { + uint32_t sumBeforeNode = sumBeforeNextNode; + sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx]; + csrGraph.nodePtrs[nodeIdx] = sumBeforeNode; + } + csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode; + + // Bin the neighborIdxs + for (uint32_t i = 0; i < cooGraph.numEdges; ++i) { + uint32_t nodeIdx = cooGraph.nodeIdxs[i]; + uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++; + csrGraph.neighborIdxs[neighborListIdx] = + cooGraph.neighborIdxs[i]; + } + + // Restore nodePtrs + for (uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) { + csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1]; + } + csrGraph.nodePtrs[0] = 0; + + return csrGraph; + +} + +static void freeCSRGraph(struct CSRGraph csrGraph) +{ + free(csrGraph.nodePtrs); + free(csrGraph.neighborIdxs); +} + +#endif diff --git a/BFS/include/params.h b/BFS/include/params.h new file mode 100644 index 0000000..f9169bc --- /dev/null +++ b/BFS/include/params.h @@ -0,0 +1,67 @@ + +#ifndef _PARAMS_H_ +#define _PARAMS_H_ + +#include "common.h" +#include "utils.h" + +static void usage() +{ + PRINT("\nUsage: ./program [options]" + "\n" + "\nBenchmark-specific options:" + "\n -f <F> input matrix file name (default=data/roadNet-CA.txt)" + "\n" + "\nGeneral options:" + "\n -v <V> verbosity" "\n -h help" "\n\n"); +} + +typedef struct Params { + const char *fileName; + unsigned int verbosity; +#if NUMA + struct bitmask *bitmask_in; + int numa_node_cpu; +#endif +} Params; + +static struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.fileName = "data/roadNet-CA.txt"; + p.verbosity = 0; +#if NUMA + p.bitmask_in = NULL; + p.numa_node_cpu = -1; +#endif + int opt; + while ((opt = getopt(argc, argv, "f:v:hA:C:")) >= 0) { + switch (opt) { + case 'f': + p.fileName = optarg; + break; + case 'v': + p.verbosity = atoi(optarg); + break; +#if NUMA + case 'A': + p.bitmask_in = numa_parse_nodestring(optarg); + break; + case 'C': + p.numa_node_cpu = atoi(optarg); + break; +#endif + case 'h': + usage(); + exit(0); + default: + PRINT_ERROR("Unrecognized option!"); + usage(); + exit(0); + } + } + + return p; +} + +#endif diff --git a/BFS/include/timer.h b/BFS/include/timer.h new file mode 100644 index 0000000..e85490f --- /dev/null +++ b/BFS/include/timer.h @@ -0,0 +1,8 @@ +#pragma once + +#define N_TIMERS 8 +#define startTimer start +#define stopTimer stop +#define zeroTimer zero +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/BFS/support/utils.h b/BFS/include/utils.h index ddb1e2c..ccd8fbd 100644 --- a/BFS/support/utils.h +++ b/BFS/include/utils.h @@ -8,4 +8,3 @@ #define PRINT(fmt, ...) printf(fmt "\n", ##__VA_ARGS__) #endif - diff --git a/BFS/run-paper-strong-full.sh b/BFS/run-paper-strong-full.sh deleted file mode 100755 index 42806a2..0000000 --- a/BFS/run-paper-strong-full.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks BFS strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >2048 are not part of upstream -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then - # repetition is not part of upstream setup - for i in `seq 1 50`; do - timeout --foreground -k 1m 5m bin/host_code -f data/loc-gowalla_edges.txt || true - done - fi - done -done -) | tee log-paper-strong-full.txt diff --git a/BFS/run-paper-strong-rank.sh b/BFS/run-paper-strong-rank.sh deleted file mode 100755 index e01d18a..0000000 --- a/BFS/run-paper-strong-rank.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks BFS strong-rank (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >64 are not part of upstream -for nr_dpus in 128 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then - # repetition is not part of upstream setup - for i in `seq 1 50`; do - timeout --foreground -k 1m 5m bin/host_code -f data/loc-gowalla_edges.txt || true - done - fi - done -done -) | tee log-paper-strong-rank.txt diff --git a/BFS/run-paper-weak.sh b/BFS/run-paper-weak.sh deleted file mode 100755 index 121758a..0000000 --- a/BFS/run-paper-weak.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks BFS weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# 256 and 512 are not part of upstream -for nr_dpus in 256 512 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then - # repetition is not part of upstream setup - for i in `seq 1 50`; do - # upstream code uses some kind of generated rMat graphs, but does not provide instructions for reproduction - timeout --foreground -k 1m 3m bin/host_code -f data/loc-gowalla_edges.txt || true - done - fi - done -done | -) tee log-paper-weak.txt diff --git a/BFS/run.sh b/BFS/run.sh deleted file mode 100755 index 8f5bfb8..0000000 --- a/BFS/run.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -e - -# -f: input file (i.e., input size) -# bin/host_code -f data/loc-gowalla_edges.txt - -# input size depends on file -> strong scaling only - -echo "prim-benchmarks BFS (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 1 2 4 8 16 32 64 128 256 512; do - for nr_tasklets in 1 2 3 4 6 8 10 12 16 20 24; do - for f in loc-gowalla_edges roadNet-CA; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then - for i in `seq 1 20`; do - timeout --foreground -k 1m 30m bin/host_code -f data/${f}.txt || true - done - fi - done - done -done diff --git a/BFS/support/common.h b/BFS/support/common.h deleted file mode 100644 index ced324c..0000000 --- a/BFS/support/common.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _COMMON_H_ -#define _COMMON_H_ - -#define ROUND_UP_TO_MULTIPLE_OF_2(x) ((((x) + 1)/2)*2) -#define ROUND_UP_TO_MULTIPLE_OF_8(x) ((((x) + 7)/8)*8) -#define ROUND_UP_TO_MULTIPLE_OF_64(x) ((((x) + 63)/64)*64) - -#define setBit(val, idx) (val) |= (1 << (idx)) -#define isSet(val, idx) ((val) & (1 << (idx))) - -struct DPUParams { - uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */ - uint32_t numNodes; /* Total number of nodes in the graph */ - uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU */ - uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */ - uint32_t level; /* The current BFS level */ - uint32_t dpuNodePtrs_m; - uint32_t dpuNeighborIdxs_m; - uint32_t dpuNodeLevel_m; - uint32_t dpuVisited_m; - uint32_t dpuCurrentFrontier_m; - uint32_t dpuNextFrontier_m; -}; - -#endif - diff --git a/BFS/support/graph.h b/BFS/support/graph.h deleted file mode 100644 index f89ff5c..0000000 --- a/BFS/support/graph.h +++ /dev/null @@ -1,116 +0,0 @@ - -#ifndef _GRAPH_H_ -#define _GRAPH_H_ - -#include <assert.h> -#include <stdio.h> - -#include "common.h" -#include "utils.h" - -struct COOGraph { - uint32_t numNodes; - uint32_t numEdges; - uint32_t* nodeIdxs; - uint32_t* neighborIdxs; -}; - -struct CSRGraph { - uint32_t numNodes; - uint32_t numEdges; - uint32_t* nodePtrs; - uint32_t* neighborIdxs; -}; - -static struct COOGraph readCOOGraph(const char* fileName) { - - struct COOGraph cooGraph; - - // Initialize fields - FILE* fp = fopen(fileName, "r"); - uint32_t numNodes, numCols; - assert(fscanf(fp, "%u", &numNodes)); - assert(fscanf(fp, "%u", &numCols)); - if(numNodes == numCols) { - cooGraph.numNodes = numNodes; - } else { - PRINT_WARNING(" Adjacency matrix is not square. Padding matrix to be square."); - cooGraph.numNodes = (numNodes > numCols)? numNodes : numCols; - } - if(cooGraph.numNodes%64 != 0) { - PRINT_WARNING(" Adjacency matrix dimension is %u which is not a multiple of 64 nodes.", cooGraph.numNodes); - cooGraph.numNodes += (64 - cooGraph.numNodes%64); - PRINT_WARNING(" Padding to %u which is a multiple of 64 nodes.", cooGraph.numNodes); - } - assert(fscanf(fp, "%u", &cooGraph.numEdges)); - cooGraph.nodeIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t)); - cooGraph.neighborIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t)); - - // Read the neighborIdxs - for(uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) { - uint32_t nodeIdx; - assert(fscanf(fp, "%u", &nodeIdx)); - cooGraph.nodeIdxs[edgeIdx] = nodeIdx; - uint32_t neighborIdx; - assert(fscanf(fp, "%u", &neighborIdx)); - cooGraph.neighborIdxs[edgeIdx] = neighborIdx; - } - - return cooGraph; - -} - -static void freeCOOGraph(struct COOGraph cooGraph) { - free(cooGraph.nodeIdxs); - free(cooGraph.neighborIdxs); -} - -static struct CSRGraph coo2csr(struct COOGraph cooGraph) { - - struct CSRGraph csrGraph; - - // Initialize fields - csrGraph.numNodes = cooGraph.numNodes; - csrGraph.numEdges = cooGraph.numEdges; - csrGraph.nodePtrs = (uint32_t*) calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1), sizeof(uint32_t)); - csrGraph.neighborIdxs = (uint32_t*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrGraph.numEdges*sizeof(uint32_t))); - - // Histogram nodeIdxs - for(uint32_t i = 0; i < cooGraph.numEdges; ++i) { - uint32_t nodeIdx = cooGraph.nodeIdxs[i]; - csrGraph.nodePtrs[nodeIdx]++; - } - - // Prefix sum nodePtrs - uint32_t sumBeforeNextNode = 0; - for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) { - uint32_t sumBeforeNode = sumBeforeNextNode; - sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx]; - csrGraph.nodePtrs[nodeIdx] = sumBeforeNode; - } - csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode; - - // Bin the neighborIdxs - for(uint32_t i = 0; i < cooGraph.numEdges; ++i) { - uint32_t nodeIdx = cooGraph.nodeIdxs[i]; - uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++; - csrGraph.neighborIdxs[neighborListIdx] = cooGraph.neighborIdxs[i]; - } - - // Restore nodePtrs - for(uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) { - csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1]; - } - csrGraph.nodePtrs[0] = 0; - - return csrGraph; - -} - -static void freeCSRGraph(struct CSRGraph csrGraph) { - free(csrGraph.nodePtrs); - free(csrGraph.neighborIdxs); -} - -#endif - diff --git a/BFS/support/params.h b/BFS/support/params.h deleted file mode 100644 index f4f12e7..0000000 --- a/BFS/support/params.h +++ /dev/null @@ -1,46 +0,0 @@ - -#ifndef _PARAMS_H_ -#define _PARAMS_H_ - -#include "common.h" -#include "utils.h" - -static void usage() { - PRINT( "\nUsage: ./program [options]" - "\n" - "\nBenchmark-specific options:" - "\n -f <F> input matrix file name (default=data/roadNet-CA.txt)" - "\n" - "\nGeneral options:" - "\n -v <V> verbosity" - "\n -h help" - "\n\n"); -} - -typedef struct Params { - const char* fileName; - unsigned int verbosity; -} Params; - -static struct Params input_params(int argc, char **argv) { - struct Params p; - p.fileName = "data/roadNet-CA.txt"; - p.verbosity = 0; - int opt; - while((opt = getopt(argc, argv, "f:v:h")) >= 0) { - switch(opt) { - case 'f': p.fileName = optarg; break; - case 'v': p.verbosity = atoi(optarg); break; - case 'h': usage(); exit(0); - default: - PRINT_ERROR("Unrecognized option!"); - usage(); - exit(0); - } - } - - return p; -} - -#endif - diff --git a/BFS/support/timer.h b/BFS/support/timer.h deleted file mode 100644 index 80719cf..0000000 --- a/BFS/support/timer.h +++ /dev/null @@ -1,34 +0,0 @@ - -#ifndef _TIMER_H_ -#define _TIMER_H_ - -#include <stdio.h> -#include <sys/time.h> - -typedef struct Timer { - struct timeval startTime[5]; - struct timeval stopTime[5]; - double time[5]; -} Timer; - -static void startTimer(Timer *timer, int i, int rep) { - if(rep == 0) { - timer->time[i] = 0.0; - } - gettimeofday(&timer->startTime[i], NULL); -} - -static void stopTimer(Timer *timer, int i) { - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); -} - -static void printAll(Timer *timer, int maxt) { - for (int i = 0; i <= maxt; i++) { - printf(" timer%d_us=%f", i, timer->time[i]); - } - printf("\n"); -} - -#endif |