summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--BFS/Makefile61
-rwxr-xr-xBFS/baselines/cpu/run.sh4
-rw-r--r--BFS/host/app.c64
-rwxr-xr-xBFS/run.sh23
-rw-r--r--BFS/support/params.h2
-rw-r--r--BFS/support/timer.h6
6 files changed, 86 insertions, 74 deletions
diff --git a/BFS/Makefile b/BFS/Makefile
index b5068cd..7577724 100644
--- a/BFS/Makefile
+++ b/BFS/Makefile
@@ -1,60 +1,35 @@
-DPU_DIR := dpu
-HOST_DIR := host
-CPU_BASE_DIR := baselines/cpu
-GPU_BASE_DIR := baselines/gpu
-BUILDDIR ?= bin
-NR_TASKLETS ?= 16
NR_DPUS ?= 1
-
-define conf_filename
- ${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2).conf
-endef
-CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS})
-
-HOST_TARGET := ${BUILDDIR}/host_code
-DPU_TARGET := ${BUILDDIR}/dpu_code
-CPU_BASE_TARGET := ${BUILDDIR}/cpu_baseline
-GPU_BASE_TARGET := ${BUILDDIR}/gpu_baseline
+NR_TASKLETS ?= 16
COMMON_INCLUDES := support
-HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
-DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)
-CPU_BASE_SOURCES := $(wildcard ${CPU_BASE_DIR}/*.c)
-GPU_BASE_SOURCES := $(wildcard ${GPU_BASE_DIR}/*.cu)
+HOST_SOURCES := $(wildcard host/*.c)
+DPU_SOURCES := $(wildcard dpu/*.c)
.PHONY: all clean test
-__dirs := $(shell mkdir -p ${BUILDDIR})
-
COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS}
-DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
-CPU_BASE_FLAGS := -O3 -fopenmp
-GPU_BASE_FLAGS := -O3
-
-all: ${HOST_TARGET} ${DPU_TARGET} ${CPU_BASE_TARGET}
+HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS}
+DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
-gpu: ${GPU_BASE_TARGET}
+QUIET = @
-${CONF}:
- $(RM) $(call conf_filename,*,*)
- touch ${CONF}
+ifdef verbose
+ QUIET =
+endif
-${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
- $(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+all: bin/host_code bin/dpu_code
-${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
- dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
+bin:
+ ${QUIET}mkdir -p bin
-${CPU_BASE_TARGET}: ${CPU_BASE_SOURCES}
- $(CC) -o $@ ${CPU_BASE_SOURCES} ${CPU_BASE_FLAGS}
+bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES}
+ ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
-${GPU_BASE_TARGET}: ${GPU_BASE_SOURCES}
- nvcc -o $@ ${GPU_BASE_SOURCES} ${GPU_BASE_FLAGS}
+bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES}
+ ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
clean:
- $(RM) -r $(BUILDDIR)
+ rm -rf bin
test: all
- ./${HOST_TARGET}
-
+ bin/host_code
diff --git a/BFS/baselines/cpu/run.sh b/BFS/baselines/cpu/run.sh
index cbed050..8d51442 100755
--- a/BFS/baselines/cpu/run.sh
+++ b/BFS/baselines/cpu/run.sh
@@ -10,5 +10,7 @@ echo "Revision $(git describe --always)"
make
for nr_threads in 1 2 4 6 8 12 16 20 24 32; do
- OMP_NUM_THREADS=${nr_threads} timeout -k 1m 30m ./bfs -f ../../data/loc-gowalla_edges.txt || true
+ for f in loc-gowalla_edges roadNet-CA; do
+ OMP_NUM_THREADS=${nr_threads} timeout -k 1m 30m ./bfs -f ../../data/${f}.txt || true
+ done
done
diff --git a/BFS/host/app.c b/BFS/host/app.c
index c6f8301..5fb7254 100644
--- a/BFS/host/app.c
+++ b/BFS/host/app.c
@@ -37,7 +37,6 @@ int main(int argc, char** argv) {
// Timer and profiling
Timer timer;
- float loadTime = 0.0f, dpuTime = 0.0f, hostTime = 0.0f, retrieveTime = 0.0f;
#if ENERGY
struct dpu_probe_t probe;
DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
@@ -73,6 +72,10 @@ int main(int argc, char** argv) {
struct DPUParams dpuParams[numDPUs];
uint32_t dpuParams_m[numDPUs];
unsigned int dpuIdx = 0;
+ unsigned int t0ini = 0;
+ unsigned int t1ini = 0;
+ unsigned int t2ini = 0;
+ unsigned int t3ini = 0;
DPU_FOREACH (dpu_set, dpu) {
// Allocate parameters
@@ -127,29 +130,28 @@ int main(int argc, char** argv) {
// Send data to DPU
PRINT_INFO(p.verbosity >= 2, " Copying data to DPU");
- startTimer(&timer);
+ startTimer(&timer, 0, t0ini++);
copyToDPU(dpu, (uint8_t*)dpuNodePtrs_h, dpuNodePtrs_m, (dpuNumNodes + 1)*sizeof(uint32_t));
copyToDPU(dpu, (uint8_t*)dpuNeighborIdxs_h, dpuNeighborIdxs_m, dpuNumNeighbors*sizeof(uint32_t));
copyToDPU(dpu, (uint8_t*)dpuNodeLevel_h, dpuNodeLevel_m, dpuNumNodes*sizeof(uint32_t));
copyToDPU(dpu, (uint8_t*)visited, dpuVisited_m, numNodes/64*sizeof(uint64_t));
copyToDPU(dpu, (uint8_t*)nextFrontier, dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
// NOTE: No need to copy current frontier because it is written before being read
- stopTimer(&timer);
- loadTime += getElapsedTime(timer);
+ stopTimer(&timer, 0);
+ //loadTime += getElapsedTime(timer);
}
// Send parameters to DPU
PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU");
- startTimer(&timer);
+ startTimer(&timer, 1, t1ini++);
copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
- stopTimer(&timer);
- loadTime += getElapsedTime(timer);
+ stopTimer(&timer, 1);
+ //loadTime += getElapsedTime(timer);
++dpuIdx;
}
- PRINT_INFO(p.verbosity >= 1, " CPU-DPU Time: %f ms", loadTime*1e3);
// Iterate until next frontier is empty
uint32_t nextFrontierEmpty = 0;
@@ -162,11 +164,10 @@ int main(int argc, char** argv) {
#endif
// Run all DPUs
PRINT_INFO(p.verbosity >= 1, " Booting DPUs");
- startTimer(&timer);
+ startTimer(&timer, 2, t2ini++);
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- stopTimer(&timer);
- dpuTime += getElapsedTime(timer);
- PRINT_INFO(p.verbosity >= 2, " Level DPU Time: %f ms", getElapsedTime(timer)*1e3);
+ stopTimer(&timer, 2);
+ //dpuTime += getElapsedTime(timer);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
double energy;
@@ -177,7 +178,7 @@ int main(int argc, char** argv) {
// Copy back next frontier from all DPUs and compute their union as the current frontier
- startTimer(&timer);
+ startTimer(&timer, 3, t3ini++);
dpuIdx = 0;
DPU_FOREACH (dpu_set, dpu) {
uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
@@ -217,20 +218,14 @@ int main(int argc, char** argv) {
}
}
}
- stopTimer(&timer);
- hostTime += getElapsedTime(timer);
- PRINT_INFO(p.verbosity >= 2, " Level Inter-DPU Time: %f ms", getElapsedTime(timer)*1e3);
+ stopTimer(&timer, 3);
+ //hostTime += getElapsedTime(timer);
}
- PRINT_INFO(p.verbosity >= 1, "DPU Kernel Time: %f ms", dpuTime*1e3);
- PRINT_INFO(p.verbosity >= 1, "Inter-DPU Time: %f ms", hostTime*1e3);
- #if ENERGY
- PRINT_INFO(p.verbosity >= 1, " DPU Energy: %f J", tenergy);
- #endif
// Copy back node levels
PRINT_INFO(p.verbosity >= 1, "Copying back the result");
- startTimer(&timer);
+ startTimer(&timer, 4, 0);
dpuIdx = 0;
DPU_FOREACH (dpu_set, dpu) {
uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
@@ -240,10 +235,9 @@ int main(int argc, char** argv) {
}
++dpuIdx;
}
- stopTimer(&timer);
- retrieveTime += getElapsedTime(timer);
- PRINT_INFO(p.verbosity >= 1, " DPU-CPU Time: %f ms", retrieveTime*1e3);
- if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
+ stopTimer(&timer, 4);
+ //retrieveTime += getElapsedTime(timer);
+ //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
// Calculating result on CPU
PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
@@ -294,12 +288,30 @@ int main(int argc, char** argv) {
// Verify the result
PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+ int status = 1;
for(uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
if(nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
PRINT_ERROR("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", nodeIdx, nodeLevelReference[nodeIdx], nodeLevel[nodeIdx]);
+ status = 0;
}
}
+ if (status) {
+ printf("[::] n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d "
+ "| throughput_pim_MBps=%f throughput_MBps=%f\n",
+ numDPUs, NR_TASKLETS, "uint32_t", numNodes,
+ numNodes * sizeof(uint32_t) / (timer.time[2]),
+ numNodes * sizeof(uint32_t) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
+ printf("[::] n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d "
+ "| throughput_pim_MOpps=%f throughput_MOpps=%f\n",
+ numDPUs, NR_TASKLETS, "uint32_t", numNodes,
+ numNodes / (timer.time[2]),
+ numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
+ printf("[::] n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d | ",
+ numDPUs, NR_TASKLETS, "uint32_t", numNodes);
+ printAll(&timer, 4);
+ }
+
// Display DPU Logs
if(p.verbosity >= 2) {
PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
diff --git a/BFS/run.sh b/BFS/run.sh
new file mode 100755
index 0000000..0ad5af2
--- /dev/null
+++ b/BFS/run.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -e
+
+# -f: input file (i.e., input size)
+# bin/host_code -f data/loc-gowalla_edges.txt
+
+echo "prim-benchmarks BFS (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
+
+for nr_dpus in 1 2 4 8 16 32 64 128 256 512; do
+ for nr_tasklets in 1 2 3 4 6 8 10 12 16 20 24; do
+ for f in loc-gowalla_edges roadNet-CA; do
+ echo
+ if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then
+ for i in `seq 1 20`; do
+ bin/host_code -f data/${f}.txt || true
+ done
+ fi
+ done
+ done
+done
diff --git a/BFS/support/params.h b/BFS/support/params.h
index 9bf9158..f4f12e7 100644
--- a/BFS/support/params.h
+++ b/BFS/support/params.h
@@ -25,7 +25,7 @@ typedef struct Params {
static struct Params input_params(int argc, char **argv) {
struct Params p;
p.fileName = "data/roadNet-CA.txt";
- p.verbosity = 1;
+ p.verbosity = 0;
int opt;
while((opt = getopt(argc, argv, "f:v:h")) >= 0) {
switch(opt) {
diff --git a/BFS/support/timer.h b/BFS/support/timer.h
index 23116e3..80719cf 100644
--- a/BFS/support/timer.h
+++ b/BFS/support/timer.h
@@ -6,9 +6,9 @@
#include <sys/time.h>
typedef struct Timer {
- struct timeval startTime[4];
- struct timeval stopTime[4];
- double time[4];
+ struct timeval startTime[5];
+ struct timeval stopTime[5];
+ double time[5];
} Timer;
static void startTimer(Timer *timer, int i, int rep) {