summary | refs | log | tree | commit | diff
path: root/TS
diff options
context:
space:
mode:
author Daniel Friesel <daniel.friesel@uos.de> 2023-05-31 15:44:51 +0200
committer Daniel Friesel <daniel.friesel@uos.de> 2023-05-31 15:44:51 +0200
commit 20e2656d51a2d13a6d4783b97933e8098a1ff158 (patch)
tree aaefe7575476ab8f8ac3032fa4dbc8cb3bb6304a /TS
parent ea5352505750377a1de36c1fa12e012dc0d3af4c (diff)
TS: port CPU and NMC versions to dfatool
Diffstat (limited to 'TS')
-rw-r--r-- TS/Makefile 51
-rw-r--r-- TS/baselines/cpu/Makefile 6
-rwxr-xr-x TS/baselines/cpu/run-opti.sh 15
-rwxr-xr-x TS/baselines/cpu/run.sh 11
-rw-r--r-- TS/baselines/cpu/streamp_openmp.cpp 26
-rw-r--r-- TS/host/app.c 51
-rwxr-xr-x TS/run-paper-strong-full.sh 27
-rwxr-xr-x TS/run-paper-strong-rank.sh 28
-rwxr-xr-x TS/run-paper-weak.sh 29
-rwxr-xr-x TS/support/timer.h 7
10 files changed, 183 insertions, 68 deletions
diff --git a/TS/Makefile b/TS/Makefile
index bc5d66f..37d296b 100644
--- a/TS/Makefile
+++ b/TS/Makefile
@@ -1,43 +1,36 @@
-DPU_DIR := dpu
-HOST_DIR := host
-BUILDDIR ?= bin
-NR_TASKLETS ?= 16
NR_DPUS ?= 1
-
-define conf_filename
- ${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2).conf
-endef
-CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS})
+NR_TASKLETS ?= 16
+BL ?= 10
COMMON_INCLUDES := support
-HOST_TARGET := ${BUILDDIR}/ts_host
-DPU_TARGET := ${BUILDDIR}/ts_dpu
-
-HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
-DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)
-
-.PHONY: all clean test
+HOST_SOURCES := $(wildcard host/*.c)
+DPU_SOURCES := $(wildcard dpu/*.c)
-__dirs := $(shell mkdir -p ${BUILDDIR})
-
-COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
+COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -DBL=${BL}
HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -lm
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
-all: ${HOST_TARGET} ${DPU_TARGET}
+QUIET = @
+
+ifdef verbose
+ QUIET =
+endif
+
+all: bin/ts_host bin/ts_dpu
-${CONF}:
- $(RM) $(call conf_filename,*,*)
- touch ${CONF}
+bin:
+ ${QUIET}mkdir -p bin
-${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
- $(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+bin/ts_host: ${HOST_SOURCES} ${COMMON_INCLUDES} bin
+ ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
-${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
- dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
+bin/ts_dpu: ${DPU_SOURCES} ${COMMON_INCLUDES} bin
+ ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
clean:
- $(RM) -r $(BUILDDIR)
+ ${QUIET}rm -rf bin
test: all
- ./${HOST_TARGET} -n 131072
+ ${QUIET}bin/ts_host -n 131072
+
+.PHONY: all clean test
diff --git a/TS/baselines/cpu/Makefile b/TS/baselines/cpu/Makefile
index 69121ed..b0b9a86 100644
--- a/TS/baselines/cpu/Makefile
+++ b/TS/baselines/cpu/Makefile
@@ -1,4 +1,3 @@
-
all: streamp_openmp
streamp_openmp: streamp_openmp.cpp tools.cpp
@@ -13,10 +12,15 @@ streamp_openmp_O2: streamp_openmp.cpp tools.cpp
run: streamp_openmp
./streamp_openmp inputs/randomlist33M.txt 256
+# may need OMP_NUM_THREADS=32 (≈ tinos) -- does not work with 88 threads @ ios
+
run_O0: streamp_openmp_O0
./streamp_openmp_O0 inputs/randomlist33M.txt 256
run_O2: streamp_openmp_O2
./streamp_openmp_O2 inputs/randomlist33M.txt 256
+clean:
+ rm -f streamp_openmp streamp_openmp_O0 streamp_openmp_O2
+
.PHONY: all run run_O0 run_O2 clean
diff --git a/TS/baselines/cpu/run-opti.sh b/TS/baselines/cpu/run-opti.sh
new file mode 100755
index 0000000..9135393
--- /dev/null
+++ b/TS/baselines/cpu/run-opti.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+HOST="$(hostname)"
+
+echo $HOST
+
+make clean
+
+for i in $(seq 1 50); do
+ ( make run_O0 || OMP_NUM_THREADS=32 make run_O0 ) | sed 's/CPU/CPU O0/'
+done | tee "${HOST}-O0.txt"
+
+for i in $(seq 1 50); do
+ ( make run_O2 || OMP_NUM_THREADS=32 make run_O2 ) | sed 's/CPU/CPU O2/'
+done | tee "${HOST}-O2.txt"
diff --git a/TS/baselines/cpu/run.sh b/TS/baselines/cpu/run.sh
index 7d8fe37..254ca15 100755
--- a/TS/baselines/cpu/run.sh
+++ b/TS/baselines/cpu/run.sh
@@ -2,15 +2,22 @@
set -e
+HOST="$(hostname)"
+
+echo $HOST
+
+(
+
echo "prim-benchmarks TS CPU (dfatool edition)"
echo "Started at $(date)"
echo "Revision $(git describe --always)"
# input size depends on file -> strong scaling only
-make
+make -B
for i in $(seq 1 10); do
- for nr_threads in 1 2 4 6 8 12 16 20 24 32; do
+ for nr_threads in 88 64 44 1 2 4 6 8 12 16 20 24 32; do
OMP_NUM_THREADS=${nr_threads} timeout --foreground -k 1m 30m ./streamp_openmp inputs/randomlist33M.txt 256 || true
done
done
+) | tee "${HOST}-explore.txt"
diff --git a/TS/baselines/cpu/streamp_openmp.cpp b/TS/baselines/cpu/streamp_openmp.cpp
index 94f110f..cc970e2 100644
--- a/TS/baselines/cpu/streamp_openmp.cpp
+++ b/TS/baselines/cpu/streamp_openmp.cpp
@@ -290,7 +290,7 @@ int main(int argc, char* argv[])
std::cout << "///////////////////////// STREAMP //////////////////////////" << std::endl;
std::cout << "############################################################" << std::endl;
std::cout << std::endl;
- std::cout << "[>>] Reading File..." << std::endl;
+ //std::cout << "[>>] Reading File..." << std::endl;
/* Read time series file */
tstart = std::chrono::high_resolution_clock::now();
@@ -329,7 +329,7 @@ int main(int argc, char* argv[])
std::cout << std::endl;
// Preprocess, statistics, get the mean and standard deviation of every subsequence in the time series
- std::cout << "[>>] Preprocessing..." << std::endl;
+ //std::cout << "[>>] Preprocessing..." << std::endl;
tstart = std::chrono::high_resolution_clock::now();
tprogstart = tstart;
@@ -337,11 +337,11 @@ int main(int argc, char* argv[])
tend = std::chrono::high_resolution_clock::now();
time_elapsed = tend - tstart;
- std::cout << "[OK] Preprocess Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf("[::] n_threads=%d e_type=%s n_elements=%d | throughput_preproc_MBps=%f throughput_preproc_MOpps=%f\n", numThreads, XSTR(DTYPE), timeSeriesLength, timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ //std::cout << "[OK] Preprocess Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
+ printf("[::] TS CPU | n_threads=%d e_type=%s n_elements=%d | throughput_preproc_MBps=%f throughput_preproc_MOpps=%f", numThreads, XSTR(DTYPE), timeSeriesLength, timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
//Initialize Matrix Profile and Matrix Profile Index
- std::cout << "[>>] Initializing Profile..." << std::endl;
+ //std::cout << "[>>] Initializing Profile..." << std::endl;
tstart = std::chrono::high_resolution_clock::now();
profile = new DTYPE[ProfileLength];
@@ -354,8 +354,8 @@ int main(int argc, char* argv[])
tend = std::chrono::high_resolution_clock::now();
time_elapsed = tend - tstart;
- std::cout << "[OK] Initialize Profile Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf("[::] n_threads=%d e_type=%s n_elements=%d | throughput_init_MBps=%f throughput_init_MOpps=%f\n", numThreads, XSTR(DTYPE), timeSeriesLength, timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ //std::cout << "[OK] Initialize Profile Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
+ printf(" throughput_init_MBps=%f throughput_init_MOpps=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
// Random shuffle the diagonals
idx.clear();
@@ -366,15 +366,15 @@ int main(int argc, char* argv[])
std::random_shuffle(idx.begin(), idx.end());
/******************** SCRIMP ********************/
- std::cout << "[>>] Performing STREAMP..." << std::endl;
+ //std::cout << "[>>] Performing STREAMP..." << std::endl;
tstart = std::chrono::high_resolution_clock::now();
streamp();
tend = std::chrono::high_resolution_clock::now();
time_elapsed = tend - tstart;
- std::cout << "[OK] STREAMP Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf("[::] n_threads=%d e_type=%s n_elements=%d | throughput_streamp_MBps=%f throughput_streamp_MOpps=%f\n", numThreads, XSTR(DTYPE), timeSeriesLength, timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ //std::cout << "[OK] STREAMP Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
+ printf(" throughput_streamp_MBps=%f throughput_streamp_MOpps=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
// Save profile to file
//std::cout << "[>>] Saving Profile..." << std::endl;
@@ -388,9 +388,9 @@ int main(int argc, char* argv[])
// Calculate total time
time_elapsed = tend - tprogstart;
- std::cout << "[OK] Total Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf("[::] n_threads=%d e_type=%s n_elements=%d | throughput_total_MBps=%f throughput_total_MOpps=%f\n", numThreads, XSTR(DTYPE), timeSeriesLength, timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
- std::cout << std::endl;
+ //std::cout << "[OK] Total Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
+ printf(" throughput_MBps=%f throughput_MOpps=%f\n", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ //std::cout << std::endl;
delete profile;
delete profileIndex;
diff --git a/TS/host/app.c b/TS/host/app.c
index 7b26a26..9454aea 100644
--- a/TS/host/app.c
+++ b/TS/host/app.c
@@ -25,6 +25,9 @@
// Define the DPU Binary path as DPU_BINARY here
#define DPU_BINARY "./bin/ts_dpu"
+#define XSTR(x) STR(x)
+#define STR(x) #x
+
#define MAX_DATA_VAL 127
static DTYPE tSeries[1 << 26];
@@ -185,7 +188,7 @@ int main(int argc, char **argv) {
for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
if (rep >= p.n_warmup)
- start(&timer, 1, rep - p.n_warmup);
+ start(&timer, 1, 0);
uint32_t i = 0;
DPU_FOREACH(dpu_set, dpu) {
@@ -238,7 +241,7 @@ int main(int argc, char **argv) {
// Run kernel on DPUs
if (rep >= p.n_warmup)
{
- start(&timer, 2, rep - p.n_warmup);
+ start(&timer, 2, 0);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
#endif
@@ -257,7 +260,7 @@ int main(int argc, char **argv) {
dpu_result_t* results_retrieve[nr_of_dpus];
if (rep >= p.n_warmup)
- start(&timer, 3, rep - p.n_warmup);
+ start(&timer, 3, 0);
DPU_FOREACH(dpu_set, dpu, i) {
results_retrieve[i] = (dpu_result_t*)malloc(NR_TASKLETS * sizeof(dpu_result_t));
@@ -295,10 +298,30 @@ int main(int argc, char **argv) {
#endif
if (rep >= p.n_warmup)
- start(&timer, 4, rep - p.n_warmup);
+ start(&timer, 0, 0);
streamp(tSeries, AMean, ASigma, ts_size - query_length - 1, query, query_length, query_mean, query_std);
if(rep >= p.n_warmup)
- stop(&timer, 4);
+ stop(&timer, 0);
+
+ int status = (minHost == result.minValue);
+ if (status) {
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n");
+ if (rep >= p.n_warmup) {
+ printf("[::] TS NMC | n_dpus=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu "
+ "| throughput_cpu_MBps=%f throughput_pim_MBps=%f throughput_MBps=%f",
+ nr_of_dpus, NR_TASKLETS, XSTR(DTYPE), BLOCK_SIZE, ts_size,
+ ts_size * sizeof(DTYPE) / timer.time[0],
+ ts_size * sizeof(DTYPE) / (timer.time[2]),
+ ts_size * sizeof(DTYPE) / (timer.time[1] + timer.time[2] + timer.time[3]));
+ printf(" throughput_cpu_MOpps=%f throughput_pim_MOpps=%f throughput_MOpps=%f",
+ ts_size / timer.time[0],
+ ts_size / (timer.time[2]),
+ ts_size / (timer.time[1] + timer.time[2] + timer.time[3]));
+ printall(&timer, 3);
+ }
+ } else {
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] results differ!\n");
+ }
}
#if ENERGY
@@ -309,28 +332,10 @@ int main(int argc, char **argv) {
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
#endif
- // Print timing results
- printf("CPU Version Time (ms): ");
- print(&timer, 4, p.n_reps);
- printf("Inter-DPU Time (ms): ");
- print(&timer, 0, p.n_reps);
- printf("CPU-DPU Time (ms): ");
- print(&timer, 1, p.n_reps);
- printf("DPU Kernel Time (ms): ");
- print(&timer, 2, p.n_reps);
- printf("DPU-CPU Time (ms): ");
- print(&timer, 3, p.n_reps);
-
#if ENERGY
printf("Energy (J): %f J\t", avg_energy);
#endif
- int status = (minHost == result.minValue);
- if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n");
- } else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] results differ!\n");
- }
DPU_ASSERT(dpu_free(dpu_set));
diff --git a/TS/run-paper-strong-full.sh b/TS/run-paper-strong-full.sh
new file mode 100755
index 0000000..029ea4f
--- /dev/null
+++ b/TS/run-paper-strong-full.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+set -e
+
+# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
+# T: data type
+# -w: number of un-timed warmup iterations
+# -e: number of timed iterations
+# -i: ignored, always uses 262144 elements (NOTE(review): this script passes -n, not -i -- confirm)
+
+(
+
+echo "prim-benchmarks TS strong-full (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
+
+for nr_dpus in 256 512 1024 2048; do
+ for nr_tasklets in 1 2 4 8 16; do
+ echo
+ # upstream code did not respect $BL in the makefile and used 256B (BL=8) instead.
+ # BL=10 appears to be slightly faster.
+ if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
+ timeout --foreground -k 1m 30m bin/ts_host -w 0 -e 100 -n 33554432 || true
+ fi
+ done
+done
+) | tee log-paper-strong-full.txt
diff --git a/TS/run-paper-strong-rank.sh b/TS/run-paper-strong-rank.sh
new file mode 100755
index 0000000..ec07fab
--- /dev/null
+++ b/TS/run-paper-strong-rank.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+set -e
+
+# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
+# T: data type
+# -w: number of un-timed warmup iterations
+# -e: number of timed iterations
+# -i: ignored, always uses 262144 elements (NOTE(review): this script passes -n, not -i -- confirm)
+
+(
+
+echo "prim-benchmarks TS strong-rank (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
+
+# 256 and 512 are not part of upstream config space
+for nr_dpus in 512 256 1 4 16 64; do
+ for nr_tasklets in 1 2 4 8 16; do
+ echo
+ # upstream code did not respect $BL in the makefile and used 256B (BL=8) instead.
+ # BL=10 appears to be slightly faster.
+ if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
+ timeout --foreground -k 1m 30m bin/ts_host -w 0 -e 100 -n 524288 || true
+ fi
+ done
+done
+) | tee log-paper-strong-rank.txt
diff --git a/TS/run-paper-weak.sh b/TS/run-paper-weak.sh
new file mode 100755
index 0000000..f97d2bd
--- /dev/null
+++ b/TS/run-paper-weak.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+set -e
+
+# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
+# T: data type
+# -w: number of un-timed warmup iterations
+# -e: number of timed iterations
+# -i: ignored, always uses 262144 elements (NOTE(review): this script passes -n, not -i -- confirm)
+
+(
+
+echo "prim-benchmarks TS weak (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
+
+# 256 and 512 are not part of upstream config space
+for nr_dpus in 512 256 1 4 16 64; do
+ for nr_tasklets in 1 2 4 8 16; do
+ echo
+ # upstream code did not respect $BL in the makefile and used 256B (BL=8) instead.
+ # BL=10 appears to be slightly faster.
+ if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
+ i=$(( nr_dpus * 524288 ))
+ timeout --foreground -k 1m 30m bin/ts_host -w 0 -e 100 -n $i || true
+ fi
+ done
+done
+) | tee log-paper-weak.txt
diff --git a/TS/support/timer.h b/TS/support/timer.h
index 0ea7739..a0747b2 100755
--- a/TS/support/timer.h
+++ b/TS/support/timer.h
@@ -57,3 +57,10 @@ void stop(Timer *timer, int i) {
}
void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
+
+void printall(Timer *timer, int maxt) { // print timer->time[0..maxt] as " timer<i>_us=<value>" fields on the current output line
+ for (int i = 0; i <= maxt; i++) { // inclusive bound: maxt is the last index printed, not a count
+ printf(" timer%d_us=%f", i, timer->time[i]);
+ }
+ printf("\n"); // terminate the line that the fields were appended to
+}