diff options
author | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-07-10 15:16:49 +0200 |
---|---|---|
committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-07-10 15:16:49 +0200 |
commit | c8ab0212139f55f3b0d45331a2185bfb0c7b784b (patch) | |
tree | 02b41d84eda3341b4b1c9bff8fabbbb5491a3347 | |
parent | 56a71a9bc8b7ec6157596739d9dab634dcb10bf6 (diff) |
TS: NUMA and variable data type support; add benchmark scripts
-rw-r--r-- | TS/baselines/cpu/Makefile | 17 | ||||
-rw-r--r-- | TS/baselines/cpu/mprofile.h | 5 | ||||
-rw-r--r-- | TS/baselines/cpu/streamp_openmp.cpp | 64 | ||||
-rwxr-xr-x | TS/dimes-hetsim-hbm.sh | 43 | ||||
-rwxr-xr-x | TS/dimes-hetsim-nmc.sh | 76 |
5 files changed, 197 insertions, 8 deletions
diff --git a/TS/baselines/cpu/Makefile b/TS/baselines/cpu/Makefile index b0b9a86..223f755 100644 --- a/TS/baselines/cpu/Makefile +++ b/TS/baselines/cpu/Makefile @@ -1,7 +1,16 @@ +NUMA ?= 0 +FLAGS = + +ifeq (${NUMA}, 1) + FLAGS += -lnuma +endif + +DTYPE ?= double + all: streamp_openmp streamp_openmp: streamp_openmp.cpp tools.cpp - g++ -O2 streamp_openmp.cpp tools.cpp -o streamp_openmp -std=c++11 -fopenmp + g++ -ggdb -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DDTYPE=${DTYPE} streamp_openmp.cpp tools.cpp -o streamp_openmp -std=c++11 -fopenmp ${FLAGS} streamp_openmp_O0: streamp_openmp.cpp tools.cpp g++ streamp_openmp.cpp tools.cpp -o streamp_openmp_O0 -std=c++11 -fopenmp @@ -23,4 +32,10 @@ run_O2: streamp_openmp_O2 clean: rm -f streamp_openmp streamp_openmp_O0 streamp_openmp_O2 +inputs/randomlistDPU.txt: inputs/randomlist33M.txt + head -n 33554432 $^ > $@ + +inputs/randomlist10M.txt: inputs/randomlist33M.txt + head -n 10000000 $^ > $@ + .PHONY: all run run_O0 run_O2 clean diff --git a/TS/baselines/cpu/mprofile.h b/TS/baselines/cpu/mprofile.h index c25ed65..120c225 100644 --- a/TS/baselines/cpu/mprofile.h +++ b/TS/baselines/cpu/mprofile.h @@ -1,9 +1,12 @@ #define PATH_TIME_SERIES "./" #define PATH_RESULTS "./" -#define DTYPE double #define ARIT_FACT 32 +#ifndef DTYPE +#define DTYPE double +#endif + //#define HBM_ALOC //#define RANDOM_DIAGS diff --git a/TS/baselines/cpu/streamp_openmp.cpp b/TS/baselines/cpu/streamp_openmp.cpp index cc970e2..3bc6296 100644 --- a/TS/baselines/cpu/streamp_openmp.cpp +++ b/TS/baselines/cpu/streamp_openmp.cpp @@ -42,6 +42,18 @@ The second column of the output file is the matrix profile index. #define XSTR(x) STR(x)
#define STR(x) #x
+#if NUMA
+#include <numaif.h>
+#include <numa.h>
+
+void* mp_pages[1];
+int mp_status[1];
+int mp_nodes[1];
+struct bitmask* bitmask_in = NULL;
+int numa_node_in = -1;
+int numa_node_cpu = -1;
+#endif
+
#include "mprofile.h"
bool interrupt = false;
@@ -109,7 +121,8 @@ void streamp() {
DTYPE lastz, distance, windowSizeDTYPE;
DTYPE * distances, * lastzs;
- int diag, my_offset, i, j, ri;
+ int diag, my_offset, i, j;
+ size_t ri;
distances = new DTYPE[ARIT_FACT];
lastzs = new DTYPE[ARIT_FACT];
@@ -271,6 +284,16 @@ int main(int argc, char* argv[]) // Set window size
windowSize = atoi(argv[2]);
+#if NUMA
+ bitmask_in = numa_parse_nodestring(argv[3]);
+ numa_node_cpu = atoi(argv[4]);
+
+ if (bitmask_in) {
+ numa_set_membind(bitmask_in);
+ numa_free_nodemask(bitmask_in);
+ }
+#endif
+
// Set the exclusion zone
exclusionZone = (int) (windowSize * 0.25);
@@ -308,6 +331,26 @@ int main(int argc, char* argv[]) time_elapsed = tend - tstart;
std::cout << "[OK] Read File Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
+#if NUMA
+ mp_pages[0] = static_cast<void*>(A.data());
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ }
+ else {
+ numa_node_in = mp_status[0];
+ }
+
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+#endif
+
// Set Matrix Profile Length
ProfileLength = timeSeriesLength - windowSize + 1;
@@ -338,12 +381,21 @@ int main(int argc, char* argv[]) tend = std::chrono::high_resolution_clock::now();
time_elapsed = tend - tstart;
//std::cout << "[OK] Preprocess Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf("[::] TS CPU | n_threads=%d e_type=%s n_elements=%d | throughput_preproc_MBps=%f throughput_preproc_MOpps=%f", numThreads, XSTR(DTYPE), timeSeriesLength, timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ printf("[::] TS CPU | n_threads=%d e_type=%s n_elements=%d"
+#if NUMA
+ " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d"
+#endif
+ " | throughput_preproc_MBps=%f throughput_preproc_MOpps=%f latency_preproc_s=%f",
+ numThreads, XSTR(DTYPE), timeSeriesLength,
+#if NUMA
+ numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
+#endif
+ timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
//Initialize Matrix Profile and Matrix Profile Index
//std::cout << "[>>] Initializing Profile..." << std::endl;
- tstart = std::chrono::high_resolution_clock::now();
+ tstart = std::chrono::high_resolution_clock::now();
profile = new DTYPE[ProfileLength];
profileIndex = new int[ProfileLength];
@@ -355,7 +407,7 @@ int main(int argc, char* argv[]) tend = std::chrono::high_resolution_clock::now();
time_elapsed = tend - tstart;
//std::cout << "[OK] Initialize Profile Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf(" throughput_init_MBps=%f throughput_init_MOpps=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ printf(" throughput_init_MBps=%f throughput_init_MOpps=%f latency_init_s=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
// Random shuffle the diagonals
idx.clear();
@@ -374,7 +426,7 @@ int main(int argc, char* argv[]) tend = std::chrono::high_resolution_clock::now();
time_elapsed = tend - tstart;
//std::cout << "[OK] STREAMP Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf(" throughput_streamp_MBps=%f throughput_streamp_MOpps=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ printf(" throughput_streamp_MBps=%f throughput_streamp_MOpps=%f latency_streamp_s=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
// Save profile to file
//std::cout << "[>>] Saving Profile..." << std::endl;
@@ -389,7 +441,7 @@ int main(int argc, char* argv[]) // Calculate total time
time_elapsed = tend - tprogstart;
//std::cout << "[OK] Total Time: " << std::setprecision(std::numeric_limits<DTYPE>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
- printf(" throughput_MBps=%f throughput_MOpps=%f\n", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+ printf(" throughput_MBps=%f throughput_MOpps=%f latency_s=%f\n", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
//std::cout << std::endl;
delete profile;
diff --git a/TS/dimes-hetsim-hbm.sh b/TS/dimes-hetsim-hbm.sh new file mode 100755 index 0000000..192a705 --- /dev/null +++ b/TS/dimes-hetsim-hbm.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +cd baselines/cpu +make -B NUMA=1 +make inputs/randomlist10M.txt + +mkdir -p log/$(hostname) +fn=log/$(hostname)/$(date +%Y%m%d) + +# Upstream DPU version uses ts_size = 33554432 elements and query_length = 256 with int32_t data +# Upstream CPU version uses inputs/randomlist33M.txt with 33618177 elements and query_length = 256 with double +# However, this does not work with ~64 or more threads due to an internal tmp array allocation failure in 'profile_tmp new DTYPE[ProfileLength * numThreads]' → use 10M elements instead. + +run_benchmark() { + local "$@" + OMP_NUM_THREADS=${nr_threads} ./streamp_openmp inputs/randomlist10M.txt 256 ${ram} ${cpu} +} + +export -f run_benchmark + +( + +echo "single-node execution (1/2)" >&2 + +parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ + run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \ + ::: i $(seq 0 5) \ + ::: nr_threads 1 2 4 8 12 16 \ + ::: cpu $(seq 0 7) \ + ::: ram $(seq 0 15) + +echo "multi-node execution (2/2)" >&2 + +parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ + run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \ + ::: i $(seq 0 20) \ + ::: nr_threads 32 48 64 96 128 \ + ::: cpu -1 \ + ::: ram $(seq 0 15) + +) > ${fn}.txt + +xz -f -v -9 -M 800M ${fn}.txt diff --git a/TS/dimes-hetsim-nmc.sh b/TS/dimes-hetsim-nmc.sh new file mode 100755 index 0000000..bf8eafe --- /dev/null +++ b/TS/dimes-hetsim-nmc.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +mkdir -p log/$(hostname) baselines/cpu/log/$(hostname) +fn=log/$(hostname)/$(date +%Y%m%d) + +# Upstream DPU version uses ts_size = 33554432 elements and query_length = 256 with int32_t data +# Upstream CPU version uses inputs/randomlist33M.txt with 33618177 elements and query_length = 256 with double +# This benchmark uses int32 and 33554432 elements for both. + +run_benchmark_nmc() { + local "$@" + sudo limit_ranks_to_numa_node ${numa_rank} + if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1; then + bin/ts_host -w 0 -e 100 -n ${input_size} + fi + return $? +} + +export -f run_benchmark_nmc + +run_benchmark() { + local "$@" + OMP_NUM_THREADS=${nr_threads} ./streamp_openmp inputs/randomlistDPU.txt 256 ${ram} ${cpu} +} + +export -f run_benchmark + +( + +echo "NMC single-node operation (1/2)" >&2 + +parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 input_size={input_size} numa_rank={numa_rank} \ + ::: numa_rank 0 1 \ + ::: nr_dpus 64 128 256 512 768 1024 \ + ::: input_size 33554432 + +echo "NMC multi-node operation (2/2)" >&2 + +parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 input_size={input_size} numa_rank={numa_rank} \ + ::: numa_rank any \ + ::: nr_dpus 1536 2048 2304 \ + ::: input_size 33554432 + +) > ${fn}.txt + +xz -f -v -9 -M 800M ${fn}.txt + +cd baselines/cpu +make -B NUMA=1 DTYPE=int32_t +make inputs/randomlistDPU.txt + +( + +echo "CPU single-node operation (1/2)" >&2 + +parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ + run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \ + ::: i $(seq 0 20) \ + ::: ram 0 1 \ + ::: cpu 0 1 \ + ::: nr_threads 1 2 4 8 12 16 + +echo "CPU multi-node operation (1/2)" >&2 + +parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ + run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \ + ::: i $(seq 0 20) \ + ::: ram 0 1 \ + ::: cpu -1 \ + ::: nr_threads 24 32 + +) > ${fn}.txt + +xz -f -v -9 -M 800M ${fn}.txt |