summaryrefslogtreecommitdiff
path: root/TS
diff options
context:
space:
mode:
author Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-10 15:16:49 +0200
committer Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-10 15:16:49 +0200
commit c8ab0212139f55f3b0d45331a2185bfb0c7b784b (patch)
tree 02b41d84eda3341b4b1c9bff8fabbbb5491a3347 /TS
parent 56a71a9bc8b7ec6157596739d9dab634dcb10bf6 (diff)
TS: NUMA and variable data type support; add benchmark scripts
Diffstat (limited to 'TS')
-rw-r--r-- TS/baselines/cpu/Makefile 17
-rw-r--r-- TS/baselines/cpu/mprofile.h 5
-rw-r--r-- TS/baselines/cpu/streamp_openmp.cpp 64
-rwxr-xr-x TS/dimes-hetsim-hbm.sh 43
-rwxr-xr-x TS/dimes-hetsim-nmc.sh 76
5 files changed, 197 insertions, 8 deletions
diff --git a/TS/baselines/cpu/Makefile b/TS/baselines/cpu/Makefile
index b0b9a86..223f755 100644
--- a/TS/baselines/cpu/Makefile
+++ b/TS/baselines/cpu/Makefile
@@ -1,7 +1,16 @@
+NUMA ?= 0
+FLAGS =
+
+ifeq (${NUMA}, 1)
+ FLAGS += -lnuma
+endif
+
+DTYPE ?= double
+
all: streamp_openmp
streamp_openmp: streamp_openmp.cpp tools.cpp
- g++ -O2 streamp_openmp.cpp tools.cpp -o streamp_openmp -std=c++11 -fopenmp
+ g++ -ggdb -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DDTYPE=${DTYPE} streamp_openmp.cpp tools.cpp -o streamp_openmp -std=c++11 -fopenmp ${FLAGS}
streamp_openmp_O0: streamp_openmp.cpp tools.cpp
g++ streamp_openmp.cpp tools.cpp -o streamp_openmp_O0 -std=c++11 -fopenmp
@@ -23,4 +32,10 @@ run_O2: streamp_openmp_O2
clean:
rm -f streamp_openmp streamp_openmp_O0 streamp_openmp_O2
+inputs/randomlistDPU.txt: inputs/randomlist33M.txt
+ head -n 33554432 $^ > $@
+
+inputs/randomlist10M.txt: inputs/randomlist33M.txt
+ head -n 10000000 $^ > $@
+
.PHONY: all run run_O0 run_O2 clean
diff --git a/TS/baselines/cpu/mprofile.h b/TS/baselines/cpu/mprofile.h
index c25ed65..120c225 100644
--- a/TS/baselines/cpu/mprofile.h
+++ b/TS/baselines/cpu/mprofile.h
@@ -1,9 +1,12 @@
#define PATH_TIME_SERIES "./"
#define PATH_RESULTS "./"
-#define DTYPE double
#define ARIT_FACT 32
+#ifndef DTYPE
+#define DTYPE double
+#endif
+
//#define HBM_ALOC
//#define RANDOM_DIAGS
diff --git a/TS/baselines/cpu/streamp_openmp.cpp b/TS/baselines/cpu/streamp_openmp.cpp
index cc970e2..3bc6296 100644
--- a/TS/baselines/cpu/streamp_openmp.cpp
+++ b/TS/baselines/cpu/streamp_openmp.cpp
@@ -42,6 +42,18 @@ The second column of the output file is the matrix profile index.
#define XSTR(x) STR(x)
#define STR(x) #x
+#if NUMA
+#include <numa.h>
+#include <numaif.h>
+
+// Single-entry buffers for the move_pages() call in main(). That call passes
+// a NULL node list, so it only reports the node of the page in mp_status.
+void* mp_pages[1];
+int mp_status[1];
+int mp_nodes[1]; // NOTE(review): not referenced in the visible code - confirm it is needed
+
+// Memory node mask parsed from argv[3]; applied with numa_set_membind().
+struct bitmask* bitmask_in = nullptr;
+
+// NUMA node holding the input data; stays -1 if the lookup fails.
+int numa_node_in = -1;
+// NUMA node to run on, taken from argv[4]; -1 skips numa_run_on_node().
+int numa_node_cpu = -1;
+#endif
+
#include "mprofile.h"
bool interrupt = false;
@@ -109,7 +121,8 @@ void streamp()
{
DTYPE lastz, distance, windowSizeDTYPE;
DTYPE * distances, * lastzs;
- int diag, my_offset, i, j, ri;
+ int diag, my_offset, i, j;
+ size_t ri;
distances = new DTYPE[ARIT_FACT];
lastzs = new DTYPE[ARIT_FACT];
@@ -271,6 +284,16 @@ int main(int argc, char* argv[])
// Set window size
windowSize = atoi(argv[2]);
+#if NUMA
+ bitmask_in = numa_parse_nodestring(argv[3]);
+ numa_node_cpu = atoi(argv[4]);
+
+ if (bitmask_in) {
+ numa_set_membind(bitmask_in);
+ numa_free_nodemask(bitmask_in);
+ }
+#endif
+
// Set the exclusion zone
exclusionZone = (int) (windowSize * 0.25);
@@ -308,6 +331,26 @@ int main(int argc, char* argv[])
time_elapsed = tend - tstart;
std::cout << "[OK] Read File Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
+#if NUMA
+ mp_pages[0] = static_cast<void*>(A.data());
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ }
+ else {
+ numa_node_in = mp_status[0];
+ }
+
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+#endif
+
// Set Matrix Profile Length
ProfileLength = timeSeriesLength - windowSize + 1;
@@ -338,12 +381,21 @@ int main(int argc, char* argv[])
tend = std::chrono::high_resolution_clock::now();
time_elapsed = tend - tstart;
//std::cout << "[OK] Preprocess Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf("[::] TS CPU | n_threads=%d e_type=%s n_elements=%d | throughput_preproc_MBps=%f throughput_preproc_MOpps=%f", numThreads, XSTR(DTYPE), timeSeriesLength, timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ printf("[::] TS CPU | n_threads=%d e_type=%s n_elements=%d"
+#if NUMA
+ " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d"
+#endif
+ " | throughput_preproc_MBps=%f throughput_preproc_MOpps=%f latency_preproc_s=%f",
+ numThreads, XSTR(DTYPE), timeSeriesLength,
+#if NUMA
+ numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
+#endif
+ timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
//Initialize Matrix Profile and Matrix Profile Index
//std::cout << "[>>] Initializing Profile..." << std::endl;
- tstart = std::chrono::high_resolution_clock::now();
+ tstart = std::chrono::high_resolution_clock::now();
profile = new DTYPE[ProfileLength];
profileIndex = new int[ProfileLength];
@@ -355,7 +407,7 @@ int main(int argc, char* argv[])
tend = std::chrono::high_resolution_clock::now();
time_elapsed = tend - tstart;
//std::cout << "[OK] Initialize Profile Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf(" throughput_init_MBps=%f throughput_init_MOpps=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ printf(" throughput_init_MBps=%f throughput_init_MOpps=%f latency_init_s=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
// Random shuffle the diagonals
idx.clear();
@@ -374,7 +426,7 @@ int main(int argc, char* argv[])
tend = std::chrono::high_resolution_clock::now();
time_elapsed = tend - tstart;
//std::cout << "[OK] STREAMP Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf(" throughput_streamp_MBps=%f throughput_streamp_MOpps=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ printf(" throughput_streamp_MBps=%f throughput_streamp_MOpps=%f latency_streamp_s=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
// Save profile to file
//std::cout << "[>>] Saving Profile..." << std::endl;
@@ -389,7 +441,7 @@ int main(int argc, char* argv[])
// Calculate total time
time_elapsed = tend - tprogstart;
//std::cout << "[OK] Total Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf(" throughput_MBps=%f throughput_MOpps=%f\n", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ printf(" throughput_MBps=%f throughput_MOpps=%f latency_s=%f\n", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
//std::cout << std::endl;
delete profile;
diff --git a/TS/dimes-hetsim-hbm.sh b/TS/dimes-hetsim-hbm.sh
new file mode 100755
index 0000000..192a705
--- /dev/null
+++ b/TS/dimes-hetsim-hbm.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Build the NUMA-enabled benchmark and generate its input file. Abort on any
+# failure: otherwise every run below would use a missing or stale binary.
+cd baselines/cpu || exit 1
+make -B NUMA=1 || exit 1
+make inputs/randomlist10M.txt || exit 1
+
+mkdir -p log/$(hostname)
+fn=log/$(hostname)/$(date +%Y%m%d)
+
+# Upstream DPU version uses ts_size = 33554432 elements and query_length = 256 with int32_t data
+# Upstream CPU version uses inputs/randomlist33M.txt with 33618177 elements and query_length = 256 with double
+# However, this does not work with ~64 or more threads due to an internal tmp array allocation failure in 'profile_tmp new DTYPE[ProfileLength * numThreads]' → use 10M elements instead.
+
+run_benchmark() {
+ local "$@"
+ OMP_NUM_THREADS=${nr_threads} ./streamp_openmp inputs/randomlist10M.txt 256 ${ram} ${cpu}
+}
+
+export -f run_benchmark
+
+(
+
+echo "single-node execution (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --header : \
+ run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
+ ::: i $(seq 0 5) \
+ ::: nr_threads 1 2 4 8 12 16 \
+ ::: cpu $(seq 0 7) \
+ ::: ram $(seq 0 15)
+
+echo "multi-node execution (2/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --header : \
+ run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
+ ::: i $(seq 0 20) \
+ ::: nr_threads 32 48 64 96 128 \
+ ::: cpu -1 \
+ ::: ram $(seq 0 15)
+
+) > ${fn}.txt
+
+xz -f -v -9 -M 800M ${fn}.txt
diff --git a/TS/dimes-hetsim-nmc.sh b/TS/dimes-hetsim-nmc.sh
new file mode 100755
index 0000000..bf8eafe
--- /dev/null
+++ b/TS/dimes-hetsim-nmc.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname) baselines/cpu/log/$(hostname)
+fn=log/$(hostname)/$(date +%Y%m%d)
+
+# Upstream DPU version uses ts_size = 33554432 elements and query_length = 256 with int32_t data
+# Upstream CPU version uses inputs/randomlist33M.txt with 33618177 elements and query_length = 256 with double
+# This benchmark uses int32 and 33554432 elements for both.
+
+run_benchmark_nmc() {
+ local "$@"
+ sudo limit_ranks_to_numa_node ${numa_rank}
+ if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1; then
+ bin/ts_host -w 0 -e 100 -n ${input_size}
+ fi
+ return $?
+}
+
+export -f run_benchmark_nmc
+
+run_benchmark() {
+ local "$@"
+ OMP_NUM_THREADS=${nr_threads} ./streamp_openmp inputs/randomlistDPU.txt 256 ${ram} ${cpu}
+}
+
+export -f run_benchmark
+
+(
+
+echo "NMC single-node operation (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 input_size={input_size} numa_rank={numa_rank} \
+ ::: numa_rank 0 1 \
+ ::: nr_dpus 64 128 256 512 768 1024 \
+ ::: input_size 33554432
+
+echo "NMC multi-node operation (2/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 input_size={input_size} numa_rank={numa_rank} \
+ ::: numa_rank any \
+ ::: nr_dpus 1536 2048 2304 \
+ ::: input_size 33554432
+
+) > ${fn}.txt
+
+xz -f -v -9 -M 800M ${fn}.txt
+
+# Build the int32_t NUMA-enabled CPU baseline and generate its input file.
+# Abort on any failure: otherwise every CPU run below would use a missing or
+# stale binary.
+cd baselines/cpu || exit 1
+make -B NUMA=1 DTYPE=int32_t || exit 1
+make inputs/randomlistDPU.txt || exit 1
+
+(
+
+echo "CPU single-node operation (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --header : \
+ run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
+ ::: i $(seq 0 20) \
+ ::: ram 0 1 \
+ ::: cpu 0 1 \
+ ::: nr_threads 1 2 4 8 12 16
+
+# Phase 2 of 2. cpu=-1 makes the benchmark skip numa_run_on_node, so the
+# process is not pinned to a single CPU node.
+echo "CPU multi-node operation (2/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --header : \
+	run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
+	::: i $(seq 0 20) \
+	::: ram 0 1 \
+	::: cpu -1 \
+	::: nr_threads 24 32
+
+) > ${fn}.txt
+
+xz -f -v -9 -M 800M ${fn}.txt