diff options
author | Birte Kristina Friesel <birte.friesel@uos.de> | 2025-05-16 14:15:10 +0200 |
---|---|---|
committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2025-05-16 14:15:39 +0200 |
commit | 3e9a46a8b16668ed69f2b8a155246e8878e1ca2a (patch) | |
tree | c44677aa5945458463a80348f9129d821d293d7f | |
parent | 20f20b7bd24a70fde8074af42bb5d1c18996081c (diff) |
CPU-DPU: determine overhead of data transformation
-rwxr-xr-x | Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh | 35 | ||||
-rw-r--r-- | Microbenchmarks/CPU-DPU/util/upvec-read.pdf | bin | 0 -> 106003 bytes | |||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance | 41 | ||||
-rw-r--r-- | Microbenchmarks/CPU-DPU/util/upvec-write.tex | 38 |
4 files changed, 114 insertions, 0 deletions
diff --git a/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh b/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh new file mode 100755 index 0000000..ee5ee99 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +./make-size.sh 0 + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1 + bin/host_code -a $numa_in -b $numa_out -c $numa_cpu -w 0 -e 20 -x 0 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i ${input_size} + return $? +} + +export -f run_benchmark_nmc + +# The benchmark allocates 3 * 64 * nr_ranks * 8B * input_size (one array for input, one array for output). +# With 1048576 elements (8 MiB per DPU), this gives a maximum allocation of 60 GiB, which will fit comfortably into system memory (128 GiB). +# With 2097152 elements (16 MiB per DPU), we may encounter OoM conditions, since the UPMEM SDK also allocates some memory. 
+ +for sdk in 2025.1.0-orig 2025.1.0-notransform; do + source /opt/upmem/transformation-benchmarks/${sdk}/upmem_env.sh + fn=log/$(hostname)/upvec-${sdk} + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \ + ::: numa_rank any \ + ::: numa_in 1 \ + ::: numa_out 1 \ + ::: numa_cpu 1 \ + ::: nr_ranks $(seq 1 40) \ + ::: input_size 1048576 \ + >> ${fn}.txt +done diff --git a/Microbenchmarks/CPU-DPU/util/upvec-read.pdf b/Microbenchmarks/CPU-DPU/util/upvec-read.pdf Binary files differ new file mode 100644 index 0000000..63af6cc --- /dev/null +++ b/Microbenchmarks/CPU-DPU/util/upvec-read.pdf diff --git a/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance b/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance new file mode 100755 index 0000000..b175b8d --- /dev/null +++ b/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance @@ -0,0 +1,41 @@ +#!/bin/sh + +data=$(mktemp -d) + +echo +echo SDK with transformation +echo + +DFATOOL_ULS_MIN_BOUND=0 DFATOOL_PARAM_RELEVANCE_THRESHOLD=0.9 DFATOOL_ULS_FUNCTIONS=roofline,linear \ +analyze-log.py \ +--filter-param='n_elements_per_dpu=1048576' \ +--ignore-param='n_dpus,n_nops,numa_node_cpu,numa_node_in,numa_node_out,numa_node_rank,n_elements' \ +--filter-observation='NMC-transfer:throughput_dram_mram_Bps,NMC-transfer:throughput_mram_dram_Bps' \ +--normalize-nfp='writeThroughputGBps=throughput_dram_mram_Bps=/1e9;readThroughputGBps=throughput_mram_dram_Bps=/1e9' \ +--export-pgf-unparam ${data}/orig- \ +--cross-validate=kfold:10 --progress \ +--show-model=param --show-model-error --show-model-precision=6 \ +log/tinos/upvec-2025.1.0-orig.txt + +echo +echo SDK without transformation +echo + +DFATOOL_ULS_MIN_BOUND=0 DFATOOL_PARAM_RELEVANCE_THRESHOLD=0.9 DFATOOL_ULS_FUNCTIONS=roofline,linear \ +analyze-log.py \ +--filter-param='n_elements_per_dpu=1048576' \ 
+--ignore-param='n_dpus,n_nops,numa_node_cpu,numa_node_in,numa_node_out,numa_node_rank,n_elements' \ +--filter-observation='NMC-transfer:throughput_dram_mram_Bps,NMC-transfer:throughput_mram_dram_Bps' \ +--normalize-nfp='writeThroughputGBps=throughput_dram_mram_Bps=/1e9;readThroughputGBps=throughput_mram_dram_Bps=/1e9' \ +--export-pgf-unparam ${data}/notransform- \ +--cross-validate=kfold:10 --progress \ +--show-model=param --show-model-error --show-model-precision=6 \ +log/tinos/upvec-2025.1.0-notransform.txt + +for op in read write; do + cp util/upvec-${op}.tex ${data} + lualatex -output-directory ${data} ${data}/upvec-${op} + cp ${data}/upvec-${op}.pdf util +done + +rm -rf ${data} diff --git a/Microbenchmarks/CPU-DPU/util/upvec-write.tex b/Microbenchmarks/CPU-DPU/util/upvec-write.tex new file mode 100644 index 0000000..f6d7bf5 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/util/upvec-write.tex @@ -0,0 +1,38 @@ +\documentclass{standalone} + +\usepackage[T1]{fontenc} +\usepackage[default]{opensans} +\usepackage[scaled]{beramono} + +\usepackage{tikz} +\usepackage{pgfplots} +\pgfplotsset{compat=1.18} +\usepgfplotslibrary{statistics} + +\begin{document} + \begin{tikzpicture} + \begin{axis}[ + ylabel={write [GB/s]}, + xlabel={\# Ranks}, + x label style={font=\footnotesize, yshift=2mm}, + y label style={font=\footnotesize}, + tick label style={/pgf/number format/assume math mode=true}, + title={Benchmark Data}, + title style={yshift=-3mm}, + legend style={font=\footnotesize, legend columns=-1, column sep=1ex}, + legend pos=south east, + legend entries={upstream,,{no transformation},}, + reverse legend, + ymin=0,ymax=14, + xmin=0,xmax=41, + width=90mm,height=45mm + ] + \addplot[thick, color=red, domain=1:40] {5.042768 + 0.258673 * min(x, 31.536102)}; + \addplot[color=red,only marks,mark=*,mark size=0.9,opacity=.05] + table[x=n_ranks, y=value] {orig-NMC-transfer-writeThroughputGBps.txt}; + \addplot[thick, color=blue, domain=1:40] {5.049962 + 0.308594 * min(x, 25.657012)}; + 
\addplot[color=blue,only marks,mark=*,mark size=0.9,opacity=.05] + table[x=n_ranks, y=value] {notransform-NMC-transfer-writeThroughputGBps.txt}; + \end{axis} + \end{tikzpicture} +\end{document} |