diff options
author | Daniel Friesel <daniel.friesel@uos.de> | 2023-06-02 15:29:25 +0200 |
---|---|---|
committer | Daniel Friesel <daniel.friesel@uos.de> | 2023-06-02 15:29:25 +0200 |
commit | 39d1c972dcea37beca6e20be152a3da78143a7d3 (patch) | |
tree | 7ed0fbde0088a9d330da26db1f6890d903645529 /SpMV | |
parent | ff9304370fdd94e9b7e4c4262c59ac734f1a28fd (diff) |
port SpMV to dfatool; add benchmark scripts
Diffstat (limited to 'SpMV')
-rw-r--r-- | SpMV/Makefile | 63 | ||||
-rw-r--r-- | SpMV/baselines/cpu/Makefile | 25 | ||||
-rw-r--r-- | SpMV/baselines/cpu/app.c | 19 | ||||
-rwxr-xr-x | SpMV/baselines/cpu/run-opti.sh | 15 | ||||
-rwxr-xr-x | SpMV/baselines/cpu/run.sh | 25 | ||||
-rw-r--r-- | SpMV/host/app.c | 22 | ||||
-rwxr-xr-x | SpMV/run-paper-strong-full.sh | 22 | ||||
-rwxr-xr-x | SpMV/run-paper-strong-rank.sh | 23 | ||||
-rwxr-xr-x | SpMV/run-paper-weak.sh | 28 |
9 files changed, 198 insertions, 44 deletions
diff --git a/SpMV/Makefile b/SpMV/Makefile index e517524..0e7a70c 100644 --- a/SpMV/Makefile +++ b/SpMV/Makefile @@ -1,30 +1,11 @@ -DPU_DIR := dpu -HOST_DIR := host -CPU_BASE_DIR := baselines/cpu -GPU_BASE_DIR := baselines/gpu -BUILDDIR ?= bin NR_TASKLETS ?= 16 NR_DPUS ?= 1 -define conf_filename - ${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2).conf -endef -CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS}) - -HOST_TARGET := ${BUILDDIR}/host_code -DPU_TARGET := ${BUILDDIR}/dpu_code -CPU_BASE_TARGET := ${BUILDDIR}/cpu_baseline -GPU_BASE_TARGET := ${BUILDDIR}/gpu_baseline - COMMON_INCLUDES := support -HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c) -DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c) -CPU_BASE_SOURCES := $(wildcard ${CPU_BASE_DIR}/*.c) -GPU_BASE_SOURCES := $(wildcard ${GPU_BASE_DIR}/*.cu) - -.PHONY: all clean test - -__dirs := $(shell mkdir -p ${BUILDDIR}) +HOST_SOURCES := $(wildcard host/*.c) +DPU_SOURCES := $(wildcard dpu/*.c) +CPU_BASE_SOURCES := $(wildcard baselines/cpu/*.c) +GPU_BASE_SOURCES := $(wildcard baselines/gpu/*.cu) COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} @@ -32,29 +13,35 @@ DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} CPU_BASE_FLAGS := -O3 -fopenmp GPU_BASE_FLAGS := -O3 -all: ${HOST_TARGET} ${DPU_TARGET} ${CPU_BASE_TARGET} +QUIET = @ + +ifdef verbose + QUIET = +endif -gpu: ${GPU_BASE_TARGET} +all: bin/host_code bin/dpu_code -${CONF}: - $(RM) $(call conf_filename,*,*) - touch ${CONF} +bin: + ${QUIET}mkdir -p bin -${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF} - $(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +gpu: bin/gpu_baseline -${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF} - dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} +bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin + ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} -${CPU_BASE_TARGET}: 
${CPU_BASE_SOURCES} - $(CC) -o $@ ${CPU_BASE_SOURCES} ${CPU_BASE_FLAGS} +bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin + ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} -${GPU_BASE_TARGET}: ${GPU_BASE_SOURCES} - nvcc -o $@ ${GPU_BASE_SOURCES} ${GPU_BASE_FLAGS} +bin/cpu_baseline: ${CPU_BASE_SOURCES} + ${QUIET}${CC} -o $@ ${CPU_BASE_SOURCES} ${CPU_BASE_FLAGS} + +bin/gpu_baseline: ${GPU_BASE_SOURCES} + ${QUIET}nvcc -o $@ ${GPU_BASE_SOURCES} ${GPU_BASE_FLAGS} clean: - $(RM) -r $(BUILDDIR) + ${QUIET}rm -rf bin test: all - ./${HOST_TARGET} + ${QUIET}bin/host_code +.PHONY: all clean test diff --git a/SpMV/baselines/cpu/Makefile b/SpMV/baselines/cpu/Makefile index 9c63605..64b20db 100644 --- a/SpMV/baselines/cpu/Makefile +++ b/SpMV/baselines/cpu/Makefile @@ -1,7 +1,24 @@ -all: - gcc -o spmv -fopenmp app.c +all: spmv -clean: - rm spmv +spmv: app.c + gcc -O2 -o spmv -fopenmp app.c + +spmv_O0: app.c + gcc -o spmv_O0 -fopenmp app.c + +spmv_O2: app.c + gcc -O2 -o spmv_O2 -fopenmp app.c + +run: spmv + OMP_NUM_THREADS=4 ./spmv -f ../../data/bcsstk30.mtx -v 0 +run_O0: spmv_O0 + OMP_NUM_THREADS=4 ./spmv_O0 -f ../../data/bcsstk30.mtx -v 0 + +run_O2: spmv_O2 + OMP_NUM_THREADS=4 ./spmv_O2 -f ../../data/bcsstk30.mtx -v 0 + +clean: + rm -f spmv spmv_O0 spmv_O2 +.PHONY: all run run_O0 run_O2 clean diff --git a/SpMV/baselines/cpu/app.c b/SpMV/baselines/cpu/app.c index 46db2f0..8d360ee 100644 --- a/SpMV/baselines/cpu/app.c +++ b/SpMV/baselines/cpu/app.c @@ -29,7 +29,7 @@ int main(int argc, char** argv) { // Calculating result on CPU PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); - omp_set_num_threads(4); + //omp_set_num_threads(4); Timer timer; startTimer(&timer); #pragma omp parallel for @@ -43,7 +43,22 @@ int main(int argc, char** argv) { outVector[rowIdx] = sum; } stopTimer(&timer); - if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3); + + + unsigned int nr_threads = 0; +#pragma omp parallel +#pragma omp atomic + nr_threads++; + + + // 
coomatrix / csrmatrix use uint32_t indexes and float values + printf("[::] SpMV CPU | n_threads=%u e_type=float n_elements=%u |" + " throughput_MBps=%f throughput_MOpps=%f timer0_us=%f\n", + nr_threads, csrMatrix.numNonzeros, + csrMatrix.numNonzeros * sizeof(float) / (getElapsedTime(timer)*1e6), + csrMatrix.numNonzeros / (getElapsedTime(timer)*1e6), + getElapsedTime(timer)*1e6); + //if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3); PRINT_INFO(p.verbosity >= 1, " Elapsed time: %f ms", getElapsedTime(timer)*1e3); // Deallocate data structures diff --git a/SpMV/baselines/cpu/run-opti.sh b/SpMV/baselines/cpu/run-opti.sh new file mode 100755 index 0000000..62a3e8b --- /dev/null +++ b/SpMV/baselines/cpu/run-opti.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +HOST="$(hostname)" + +echo $HOST + +make clean + +for i in $(seq 1 50); do + make run_O0 | sed 's/CPU/CPU O0/' +done | tee "${HOST}-O0.txt" + +for i in $(seq 1 50); do + make run_O2 | sed 's/CPU/CPU O2/' +done | tee "${HOST}-O2.txt" diff --git a/SpMV/baselines/cpu/run.sh b/SpMV/baselines/cpu/run.sh new file mode 100755 index 0000000..a993cc0 --- /dev/null +++ b/SpMV/baselines/cpu/run.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +set -e + +HOST="$(hostname)" + +echo $HOST + +( + +echo "prim-benchmarks SpMV CPU (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +# default threads: 4 + +# input size depends on file -> strong scaling only + +make -B +for i in $(seq 1 50); do + for nr_threads in 88 64 44 1 2 4 6 8 12 16 20 24 32; do + OMP_NUM_THREADS=${nr_threads} timeout --foreground -k 1m 30m ./spmv -f ../../data/bcsstk30.mtx -v 0 || true + done +done +) | tee "${HOST}-explore.txt" diff --git a/SpMV/host/app.c b/SpMV/host/app.c index 8887410..c1cf92f 100644 --- a/SpMV/host/app.c +++ b/SpMV/host/app.c @@ -22,6 +22,9 @@ #define DPU_BINARY "./bin/dpu_code" +#define XSTR(x) STR(x) +#define STR(x) #x + #ifndef ENERGY #define ENERGY 0 #endif @@ -187,14 +190,33 @@ int main(int argc, char** argv) { // 
Verify the result PRINT_INFO(p.verbosity >= 1, "Verifying the result"); + int status = 1; for(uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) { float diff = (outVectorReference[rowIdx] - outVector[rowIdx])/outVectorReference[rowIdx]; const float tolerance = 0.00001; if(diff > tolerance || diff < -tolerance) { + status = 0; PRINT_ERROR("Mismatch at index %u (CPU result = %f, DPU result = %f)", rowIdx, outVectorReference[rowIdx], outVector[rowIdx]); } } + if (status) { + printf("[::] SpMV NMC | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d " + "| throughput_pim_MBps=%f throughput_MBps=%f", + // coomatrix / csrmatrix use uint32_t indexes and float values + numDPUs, NR_TASKLETS, "float", csrMatrix.numNonzeros, + csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6), + csrMatrix.numNonzeros * sizeof(uint32_t) / ((loadTime + dpuTime + retrieveTime) * 1e6) + ); + printf(" throughput_pim_MOpps=%f throughput_MOpps=%f", + csrMatrix.numNonzeros / (dpuTime * 1e6), + csrMatrix.numNonzeros / ((loadTime + dpuTime + retrieveTime) * 1e6) + ); + printf(" timer_load_us=%f timer_dpu_us=%f timer_retrieve_us=%f\n", + loadTime * 1e6, dpuTime * 1e6, retrieveTime * 1e6 + ); + } + // Display DPU Logs if(p.verbosity >= 2) { PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:"); diff --git a/SpMV/run-paper-strong-full.sh b/SpMV/run-paper-strong-full.sh new file mode 100755 index 0000000..21f3d25 --- /dev/null +++ b/SpMV/run-paper-strong-full.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -e + +( + +echo "prim-benchmarks SpMV strong-full (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +for nr_dpus in 256 512 1024 2048; do + for nr_tasklets in 1 2 4 8 16; do + echo + if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then + # repetition is not part of upstream setup + for i in `seq 1 50`; do + timeout --foreground -k 1m 3m bin/host_code -v 0 || true + done + fi + done +done +) | tee log-paper-strong-full.txt diff --git 
a/SpMV/run-paper-strong-rank.sh b/SpMV/run-paper-strong-rank.sh new file mode 100755 index 0000000..e0cd45d --- /dev/null +++ b/SpMV/run-paper-strong-rank.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -e + +( + +echo "prim-benchmarks SpMV strong-rank (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +# 256 and 512 are not part of upstream +for nr_dpus in 256 512 1 4 16 64; do + for nr_tasklets in 1 2 4 8 16; do + echo + if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then + # repetition is not part of upstream setup + for i in `seq 1 50`; do + timeout --foreground -k 1m 3m bin/host_code -v 0 || true + done + fi + done +done +) | tee log-paper-strong-rank.txt diff --git a/SpMV/run-paper-weak.sh b/SpMV/run-paper-weak.sh new file mode 100755 index 0000000..d2cc0ed --- /dev/null +++ b/SpMV/run-paper-weak.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -e + +( + +echo "prim-benchmarks SpMV weak (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +# 256 and 512 are not part of upstream +for nr_dpus in 256 512 1 4 16 64; do + cd data/generate + make + ./replicate ../bcsstk30.mtx ${nr_dpus} /tmp/bcsstk30.mtx.${nr_dpus}.mtx + cd ../.. + for nr_tasklets in 1 2 4 8 16; do + echo + if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then + # repetition is not part of upstream setup + for i in `seq 1 50`; do + timeout --foreground -k 1m 3m bin/host_code -v 0 -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx || true + done + fi + done + rm -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx +done +) | tee log-paper-weak.txt |