diff options
Diffstat (limited to 'BFS')
-rw-r--r-- | BFS/Makefile | 61 | ||||
-rwxr-xr-x | BFS/baselines/cpu/run.sh | 4 | ||||
-rw-r--r-- | BFS/host/app.c | 64 | ||||
-rwxr-xr-x | BFS/run.sh | 23 | ||||
-rw-r--r-- | BFS/support/params.h | 2 | ||||
-rw-r--r-- | BFS/support/timer.h | 6 |
6 files changed, 86 insertions, 74 deletions
diff --git a/BFS/Makefile b/BFS/Makefile index b5068cd..7577724 100644 --- a/BFS/Makefile +++ b/BFS/Makefile @@ -1,60 +1,35 @@ -DPU_DIR := dpu -HOST_DIR := host -CPU_BASE_DIR := baselines/cpu -GPU_BASE_DIR := baselines/gpu -BUILDDIR ?= bin -NR_TASKLETS ?= 16 NR_DPUS ?= 1 - -define conf_filename - ${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2).conf -endef -CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS}) - -HOST_TARGET := ${BUILDDIR}/host_code -DPU_TARGET := ${BUILDDIR}/dpu_code -CPU_BASE_TARGET := ${BUILDDIR}/cpu_baseline -GPU_BASE_TARGET := ${BUILDDIR}/gpu_baseline +NR_TASKLETS ?= 16 COMMON_INCLUDES := support -HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c) -DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c) -CPU_BASE_SOURCES := $(wildcard ${CPU_BASE_DIR}/*.c) -GPU_BASE_SOURCES := $(wildcard ${GPU_BASE_DIR}/*.cu) +HOST_SOURCES := $(wildcard host/*.c) +DPU_SOURCES := $(wildcard dpu/*.c) .PHONY: all clean test -__dirs := $(shell mkdir -p ${BUILDDIR}) - COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -CPU_BASE_FLAGS := -O3 -fopenmp -GPU_BASE_FLAGS := -O3 - -all: ${HOST_TARGET} ${DPU_TARGET} ${CPU_BASE_TARGET} +HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} +DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -gpu: ${GPU_BASE_TARGET} +QUIET = @ -${CONF}: - $(RM) $(call conf_filename,*,*) - touch ${CONF} +ifdef verbose + QUIET = +endif -${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF} - $(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +all: bin/host_code bin/dpu_code -${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF} - dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} +bin: + ${QUIET}mkdir -p bin -${CPU_BASE_TARGET}: ${CPU_BASE_SOURCES} - $(CC) -o $@ ${CPU_BASE_SOURCES} ${CPU_BASE_FLAGS} +bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} + ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} -${GPU_BASE_TARGET}: ${GPU_BASE_SOURCES} - nvcc -o $@ ${GPU_BASE_SOURCES} ${GPU_BASE_FLAGS} +bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} + ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} clean: - $(RM) -r $(BUILDDIR) + rm -rf bin test: all - ./${HOST_TARGET} - + bin/host_code diff --git a/BFS/baselines/cpu/run.sh b/BFS/baselines/cpu/run.sh index cbed050..8d51442 100755 --- a/BFS/baselines/cpu/run.sh +++ b/BFS/baselines/cpu/run.sh @@ -10,5 +10,7 @@ echo "Revision $(git describe --always)" make for nr_threads in 1 2 4 6 8 12 16 20 24 32; do - OMP_NUM_THREADS=${nr_threads} timeout -k 1m 30m ./bfs -f ../../data/loc-gowalla_edges.txt || true + for f in loc-gowalla_edges roadNet-CA; do + OMP_NUM_THREADS=${nr_threads} timeout -k 1m 30m ./bfs -f ../../data/${f}.txt || true + done done diff --git a/BFS/host/app.c b/BFS/host/app.c index c6f8301..5fb7254 100644 --- a/BFS/host/app.c +++ b/BFS/host/app.c @@ -37,7 +37,6 @@ int main(int argc, char** argv) { // Timer and profiling Timer timer; - float loadTime = 0.0f, dpuTime = 0.0f, hostTime = 0.0f, retrieveTime = 0.0f; #if ENERGY struct dpu_probe_t probe; DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); @@ -73,6 +72,10 @@ int main(int argc, char** argv) { struct DPUParams dpuParams[numDPUs]; uint32_t dpuParams_m[numDPUs]; unsigned int dpuIdx = 0; + unsigned int t0ini = 0; + unsigned int t1ini = 0; + unsigned int t2ini = 0; + unsigned int t3ini = 0; DPU_FOREACH (dpu_set, dpu) { // Allocate parameters @@ -127,29 +130,28 @@ int main(int argc, char** argv) { // Send data to DPU PRINT_INFO(p.verbosity >= 2, " Copying data to DPU"); - startTimer(&timer); + startTimer(&timer, 0, t0ini++); copyToDPU(dpu, (uint8_t*)dpuNodePtrs_h, dpuNodePtrs_m, (dpuNumNodes + 1)*sizeof(uint32_t)); copyToDPU(dpu, (uint8_t*)dpuNeighborIdxs_h, dpuNeighborIdxs_m, dpuNumNeighbors*sizeof(uint32_t)); copyToDPU(dpu, (uint8_t*)dpuNodeLevel_h, dpuNodeLevel_m, dpuNumNodes*sizeof(uint32_t)); copyToDPU(dpu, (uint8_t*)visited, dpuVisited_m, numNodes/64*sizeof(uint64_t)); copyToDPU(dpu, (uint8_t*)nextFrontier, dpuNextFrontier_m, numNodes/64*sizeof(uint64_t)); // NOTE: No need to copy current frontier because it is written before being read - stopTimer(&timer); - loadTime += getElapsedTime(timer); + stopTimer(&timer, 0); + //loadTime += getElapsedTime(timer); } // Send parameters to DPU PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU"); - startTimer(&timer); + startTimer(&timer, 1, t1ini++); copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams)); - stopTimer(&timer); - loadTime += getElapsedTime(timer); + stopTimer(&timer, 1); + //loadTime += getElapsedTime(timer); ++dpuIdx; } - PRINT_INFO(p.verbosity >= 1, " CPU-DPU Time: %f ms", loadTime*1e3); // Iterate until next frontier is empty uint32_t nextFrontierEmpty = 0; @@ -162,11 +164,10 @@ int main(int argc, char** argv) { #endif // Run all DPUs PRINT_INFO(p.verbosity >= 1, " Booting DPUs"); - startTimer(&timer); + startTimer(&timer, 2, t2ini++); DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - stopTimer(&timer); - dpuTime += getElapsedTime(timer); - PRINT_INFO(p.verbosity >= 2, " Level DPU Time: %f ms", getElapsedTime(timer)*1e3); + stopTimer(&timer, 2); + //dpuTime += getElapsedTime(timer); #if ENERGY DPU_ASSERT(dpu_probe_stop(&probe)); double energy; @@ -177,7 +178,7 @@ int main(int argc, char** argv) { // Copy back next frontier from all DPUs and compute their union as the current frontier - startTimer(&timer); + startTimer(&timer, 3, t3ini++); dpuIdx = 0; DPU_FOREACH (dpu_set, dpu) { uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; @@ -217,20 +218,14 @@ int main(int argc, char** argv) { } } } - stopTimer(&timer); - hostTime += getElapsedTime(timer); - PRINT_INFO(p.verbosity >= 2, " Level Inter-DPU Time: %f ms", getElapsedTime(timer)*1e3); + stopTimer(&timer, 3); + //hostTime += getElapsedTime(timer); } - PRINT_INFO(p.verbosity >= 1, "DPU Kernel Time: %f ms", dpuTime*1e3); - PRINT_INFO(p.verbosity >= 1, "Inter-DPU Time: %f ms", hostTime*1e3); - #if ENERGY - PRINT_INFO(p.verbosity >= 1, " DPU Energy: %f J", tenergy); - #endif // Copy back node levels PRINT_INFO(p.verbosity >= 1, "Copying back the result"); - startTimer(&timer); + startTimer(&timer, 4, 0); dpuIdx = 0; DPU_FOREACH (dpu_set, dpu) { uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; @@ -240,10 +235,9 @@ int main(int argc, char** argv) { } ++dpuIdx; } - stopTimer(&timer); - retrieveTime += getElapsedTime(timer); - PRINT_INFO(p.verbosity >= 1, " DPU-CPU Time: %f ms", retrieveTime*1e3); - if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3); + stopTimer(&timer, 4); + //retrieveTime += getElapsedTime(timer); + //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3); // Calculating result on CPU PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); @@ -294,12 +288,30 @@ int main(int argc, char** argv) { // Verify the result PRINT_INFO(p.verbosity >= 1, "Verifying the result"); + int status = 1; for(uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) { if(nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) { PRINT_ERROR("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", nodeIdx, nodeLevelReference[nodeIdx], nodeLevel[nodeIdx]); + status = 0; } } + if (status) { + printf("[::] n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d " + "| throughput_pim_MBps=%f throughput_MBps=%f\n", + numDPUs, NR_TASKLETS, "uint32_t", numNodes, + numNodes * sizeof(uint32_t) / (timer.time[2]), + numNodes * sizeof(uint32_t) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4])); + printf("[::] n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d " + "| throughput_pim_MOpps=%f throughput_MOpps=%f\n", + numDPUs, NR_TASKLETS, "uint32_t", numNodes, + numNodes / (timer.time[2]), + numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4])); + printf("[::] n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d | ", + numDPUs, NR_TASKLETS, "uint32_t", numNodes); + printAll(&timer, 4); + } + // Display DPU Logs if(p.verbosity >= 2) { PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:"); diff --git a/BFS/run.sh b/BFS/run.sh new file mode 100755 index 0000000..0ad5af2 --- /dev/null +++ b/BFS/run.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -e + +# -f: input file (i.e., input size) +# bin/host_code -f data/loc-gowalla_edges.txt + +echo "prim-benchmarks BFS (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +for nr_dpus in 1 2 4 8 16 32 64 128 256 512; do + for nr_tasklets in 1 2 3 4 6 8 10 12 16 20 24; do + for f in loc-gowalla_edges roadNet-CA; do + echo + if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then + for i in `seq 1 20`; do + bin/host_code -f data/${f}.txt || true + done + fi + done + done +done diff --git a/BFS/support/params.h b/BFS/support/params.h index 9bf9158..f4f12e7 100644 --- a/BFS/support/params.h +++ b/BFS/support/params.h @@ -25,7 +25,7 @@ typedef struct Params { static struct Params input_params(int argc, char **argv) { struct Params p; p.fileName = "data/roadNet-CA.txt"; - p.verbosity = 1; + p.verbosity = 0; int opt; while((opt = getopt(argc, argv, "f:v:h")) >= 0) { switch(opt) { diff --git a/BFS/support/timer.h b/BFS/support/timer.h index 23116e3..80719cf 100644 --- a/BFS/support/timer.h +++ b/BFS/support/timer.h @@ -6,9 +6,9 @@ #include <sys/time.h> typedef struct Timer { - struct timeval startTime[4]; - struct timeval stopTime[4]; - double time[4]; + struct timeval startTime[5]; + struct timeval stopTime[5]; + double time[5]; } Timer; static void startTimer(Timer *timer, int i, int rep) { |