From c8ab0212139f55f3b0d45331a2185bfb0c7b784b Mon Sep 17 00:00:00 2001
From: Birte Kristina Friesel
Date: Wed, 10 Jul 2024 15:16:49 +0200
Subject: TS: NUMA and variable data type support; add benchmark scripts

---
NOTE(review): this patch text was reconstructed from a cgit export in which
all newlines/indentation were collapsed and every <...> token was stripped.
  * Restored by inference, TODO confirm against the repository:
    "#include <numaif.h>", "#include <numa.h>", "static_cast<void*>",
    "std::numeric_limits<double>". The author e-mail address is still missing.
  * Indentation (tabs), the position of blank context lines, and the exact
    form of the whitespace-only "tstart" change are inferred; line counts
    match the hunk headers and the diffstat.
  * Follow-ups seen in the added lines (left unchanged so that the patch still
    describes this commit):
    - dimes-hetsim-nmc.sh: "return $?" after "fi" yields 0 when "make" fails,
      because an if-compound without a taken branch has exit status 0.
    - streamp_openmp.cpp: with NUMA=1, argv[3] and argv[4] are read; no argc
      check is visible in this hunk.
    - dimes-hetsim-nmc.sh: the second CPU banner says "(1/2)"; presumably
      "(2/2)" was intended.

 TS/baselines/cpu/Makefile           | 17 ++++++++-
 TS/baselines/cpu/mprofile.h         |  5 ++-
 TS/baselines/cpu/streamp_openmp.cpp | 64 ++++++++++++++++++++++++++++---
 TS/dimes-hetsim-hbm.sh              | 43 +++++++++++++++++++++
 TS/dimes-hetsim-nmc.sh              | 76 +++++++++++++++++++++++++++++++++++++
 5 files changed, 197 insertions(+), 8 deletions(-)
 create mode 100755 TS/dimes-hetsim-hbm.sh
 create mode 100755 TS/dimes-hetsim-nmc.sh

(limited to 'TS')

diff --git a/TS/baselines/cpu/Makefile b/TS/baselines/cpu/Makefile
index b0b9a86..223f755 100644
--- a/TS/baselines/cpu/Makefile
+++ b/TS/baselines/cpu/Makefile
@@ -1,7 +1,16 @@
+NUMA ?= 0
+FLAGS =
+
+ifeq (${NUMA}, 1)
+	FLAGS += -lnuma
+endif
+
+DTYPE ?= double
+
 all: streamp_openmp
 
 streamp_openmp: streamp_openmp.cpp tools.cpp
-	g++ -O2 streamp_openmp.cpp tools.cpp -o streamp_openmp -std=c++11 -fopenmp
+	g++ -ggdb -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DDTYPE=${DTYPE} streamp_openmp.cpp tools.cpp -o streamp_openmp -std=c++11 -fopenmp ${FLAGS}
 
 streamp_openmp_O0: streamp_openmp.cpp tools.cpp
 	g++ streamp_openmp.cpp tools.cpp -o streamp_openmp_O0 -std=c++11 -fopenmp
@@ -23,4 +32,10 @@ run_O2: streamp_openmp_O2
 clean:
 	rm -f streamp_openmp streamp_openmp_O0 streamp_openmp_O2
 
+inputs/randomlistDPU.txt: inputs/randomlist33M.txt
+	head -n 33554432 $^ > $@
+
+inputs/randomlist10M.txt: inputs/randomlist33M.txt
+	head -n 10000000 $^ > $@
+
 .PHONY: all run run_O0 run_O2 clean
diff --git a/TS/baselines/cpu/mprofile.h b/TS/baselines/cpu/mprofile.h
index c25ed65..120c225 100644
--- a/TS/baselines/cpu/mprofile.h
+++ b/TS/baselines/cpu/mprofile.h
@@ -1,9 +1,12 @@
 #define PATH_TIME_SERIES "./"
 #define PATH_RESULTS "./"
-#define DTYPE double
 #define ARIT_FACT 32
 
 
+#ifndef DTYPE
+#define DTYPE double
+#endif
+
 //#define HBM_ALOC
 //#define RANDOM_DIAGS
 
diff --git a/TS/baselines/cpu/streamp_openmp.cpp b/TS/baselines/cpu/streamp_openmp.cpp
index cc970e2..3bc6296 100644
--- a/TS/baselines/cpu/streamp_openmp.cpp
+++ b/TS/baselines/cpu/streamp_openmp.cpp
@@ -42,6 +42,18 @@ The second column of the output file is the matrix profile index.
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
+#if NUMA
+#include <numaif.h>
+#include <numa.h>
+
+void* mp_pages[1];
+int mp_status[1];
+int mp_nodes[1];
+struct bitmask* bitmask_in = NULL;
+int numa_node_in = -1;
+int numa_node_cpu = -1;
+#endif
+
 #include "mprofile.h"
 
 bool interrupt = false;
@@ -109,7 +121,8 @@ void streamp()
 	{
 		DTYPE lastz, distance, windowSizeDTYPE;
 		DTYPE * distances, * lastzs;
-		int diag, my_offset, i, j, ri;
+		int diag, my_offset, i, j;
+		size_t ri;
 		distances = new DTYPE[ARIT_FACT];
 		lastzs = new DTYPE[ARIT_FACT];
 
@@ -271,6 +284,16 @@ int main(int argc, char* argv[])
 	// Set window size
 	windowSize = atoi(argv[2]);
 
+#if NUMA
+	bitmask_in = numa_parse_nodestring(argv[3]);
+	numa_node_cpu = atoi(argv[4]);
+
+	if (bitmask_in) {
+		numa_set_membind(bitmask_in);
+		numa_free_nodemask(bitmask_in);
+	}
+#endif
+
 	// Set the exclusion zone
 	exclusionZone = (int) (windowSize * 0.25);
 
@@ -308,6 +331,26 @@ int main(int argc, char* argv[])
 	time_elapsed = tend - tstart;
 	std::cout << "[OK] Read File Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
 
+#if NUMA
+	mp_pages[0] = static_cast<void*>(A.data());
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(A)");
+	}
+	else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	}
+	else {
+		numa_node_in = mp_status[0];
+	}
+
+	if (numa_node_cpu != -1) {
+		if (numa_run_on_node(numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
+#endif
+
 	// Set Matrix Profile Length
 	ProfileLength = timeSeriesLength - windowSize + 1;
 
@@ -338,12 +381,21 @@ int main(int argc, char* argv[])
 	tend = std::chrono::high_resolution_clock::now();
 	time_elapsed = tend - tstart;
 	//std::cout << "[OK] Preprocess Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
-	printf("[::] TS CPU | n_threads=%d e_type=%s n_elements=%d | throughput_preproc_MBps=%f throughput_preproc_MOpps=%f", numThreads, XSTR(DTYPE), timeSeriesLength, timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+	printf("[::] TS CPU | n_threads=%d e_type=%s n_elements=%d"
+#if NUMA
+		" numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d"
+#endif
+		" | throughput_preproc_MBps=%f throughput_preproc_MOpps=%f latency_preproc_s=%f",
+		numThreads, XSTR(DTYPE), timeSeriesLength,
+#if NUMA
+		numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
+#endif
+		timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
 
 
 	//Initialize Matrix Profile and Matrix Profile Index
 	//std::cout << "[>>] Initializing Profile..." << std::endl;
-    tstart = std::chrono::high_resolution_clock::now();
+	tstart = std::chrono::high_resolution_clock::now();
 	profile = new DTYPE[ProfileLength];
 	profileIndex = new int[ProfileLength];
 
@@ -355,7 +407,7 @@ int main(int argc, char* argv[])
 	tend = std::chrono::high_resolution_clock::now();
 	time_elapsed = tend - tstart;
 	//std::cout << "[OK] Initialize Profile Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
-	printf(" throughput_init_MBps=%f throughput_init_MOpps=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+	printf(" throughput_init_MBps=%f throughput_init_MOpps=%f latency_init_s=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
 
 	// Random shuffle the diagonals
 	idx.clear();
@@ -374,7 +426,7 @@ int main(int argc, char* argv[])
 	tend = std::chrono::high_resolution_clock::now();
 	time_elapsed = tend - tstart;
 	//std::cout << "[OK] STREAMP Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
-	printf(" throughput_streamp_MBps=%f throughput_streamp_MOpps=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+	printf(" throughput_streamp_MBps=%f throughput_streamp_MOpps=%f latency_streamp_s=%f", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
 
 	// Save profile to file
 	//std::cout << "[>>] Saving Profile..." << std::endl;
@@ -389,7 +441,7 @@ int main(int argc, char* argv[])
 	// Calculate total time
 	time_elapsed = tend - tprogstart;
 	//std::cout << "[OK] Total Time: " << std::setprecision(std::numeric_limits<double>::digits10 + 2) << time_elapsed.count() << " seconds." << std::endl;
-	printf(" throughput_MBps=%f throughput_MOpps=%f\n", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6));
+	printf(" throughput_MBps=%f throughput_MOpps=%f latency_s=%f\n", timeSeriesLength * sizeof(DTYPE) / (time_elapsed.count() * 1e6), timeSeriesLength / (time_elapsed.count() * 1e6), time_elapsed.count());
 	//std::cout << std::endl;
 
 	delete profile;
diff --git a/TS/dimes-hetsim-hbm.sh b/TS/dimes-hetsim-hbm.sh
new file mode 100755
index 0000000..192a705
--- /dev/null
+++ b/TS/dimes-hetsim-hbm.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+cd baselines/cpu
+make -B NUMA=1
+make inputs/randomlist10M.txt
+
+mkdir -p log/$(hostname)
+fn=log/$(hostname)/$(date +%Y%m%d)
+
+# Upstream DPU version uses ts_size = 33554432 elements and query_length = 256 with int32_t data
+# Upstream CPU version uses inputs/randomlist33M.txt with 33618177 elements and query_length = 256 with double
+# However, this does not work with ~64 or more threads due to an internal tmp array allocation failure in 'profile_tmp new DTYPE[ProfileLength * numThreads]' → use 10M elements instead.
+
+run_benchmark() {
+	local "$@"
+	OMP_NUM_THREADS=${nr_threads} ./streamp_openmp inputs/randomlist10M.txt 256 ${ram} ${cpu}
+}
+
+export -f run_benchmark
+
+(
+
+echo "single-node execution (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --header : \
+	run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
+	::: i $(seq 0 5) \
+	::: nr_threads 1 2 4 8 12 16 \
+	::: cpu $(seq 0 7) \
+	::: ram $(seq 0 15)
+
+echo "multi-node execution (2/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --header : \
+	run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
+	::: i $(seq 0 20) \
+	::: nr_threads 32 48 64 96 128 \
+	::: cpu -1 \
+	::: ram $(seq 0 15)
+
+) > ${fn}.txt
+
+xz -f -v -9 -M 800M ${fn}.txt
diff --git a/TS/dimes-hetsim-nmc.sh b/TS/dimes-hetsim-nmc.sh
new file mode 100755
index 0000000..bf8eafe
--- /dev/null
+++ b/TS/dimes-hetsim-nmc.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname) baselines/cpu/log/$(hostname)
+fn=log/$(hostname)/$(date +%Y%m%d)
+
+# Upstream DPU version uses ts_size = 33554432 elements and query_length = 256 with int32_t data
+# Upstream CPU version uses inputs/randomlist33M.txt with 33618177 elements and query_length = 256 with double
+# This benchmark uses int32 and 33554432 elements for both.
+
+run_benchmark_nmc() {
+	local "$@"
+	sudo limit_ranks_to_numa_node ${numa_rank}
+	if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1; then
+		bin/ts_host -w 0 -e 100 -n ${input_size}
+	fi
+	return $?
+}
+
+export -f run_benchmark_nmc
+
+run_benchmark() {
+	local "$@"
+	OMP_NUM_THREADS=${nr_threads} ./streamp_openmp inputs/randomlistDPU.txt 256 ${ram} ${cpu}
+}
+
+export -f run_benchmark
+
+(
+
+echo "NMC single-node operation (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 input_size={input_size} numa_rank={numa_rank} \
+	::: numa_rank 0 1 \
+	::: nr_dpus 64 128 256 512 768 1024 \
+	::: input_size 33554432
+
+echo "NMC multi-node operation (2/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --header : \
+	run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 input_size={input_size} numa_rank={numa_rank} \
+	::: numa_rank any \
+	::: nr_dpus 1536 2048 2304 \
+	::: input_size 33554432
+
+) > ${fn}.txt
+
+xz -f -v -9 -M 800M ${fn}.txt
+
+cd baselines/cpu
+make -B NUMA=1 DTYPE=int32_t
+make inputs/randomlistDPU.txt
+
+(
+
+echo "CPU single-node operation (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --header : \
+	run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
+	::: i $(seq 0 20) \
+	::: ram 0 1 \
+	::: cpu 0 1 \
+	::: nr_threads 1 2 4 8 12 16
+
+echo "CPU multi-node operation (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --header : \
+	run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
+	::: i $(seq 0 20) \
+	::: ram 0 1 \
+	::: cpu -1 \
+	::: nr_threads 24 32
+
+) > ${fn}.txt
+
+xz -f -v -9 -M 800M ${fn}.txt
-- 
cgit v1.2.3