CPU-DPU: determine overhead of data transformation

author: Birte Kristina Friesel <birte.friesel@uos.de> 2025-05-16 14:15:10 +0200
committer: Birte Kristina Friesel <birte.friesel@uos.de> 2025-05-16 14:15:39 +0200
commit: 3e9a46a8b16668ed69f2b8a155246e8878e1ca2a (patch)
tree: c44677aa5945458463a80348f9129d821d293d7f
parent: 20f20b7bd24a70fde8074af42bb5d1c18996081c (diff)
4 files changed, 114 insertions, 0 deletions
diff --git a/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh b/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh
new file mode 100755
index 0000000..ee5ee99
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Run the CPU-DPU microbenchmark for 1..40 ranks with each SDK variant; results are appended to log/<hostname>/upvec-<sdk>.txt.
+mkdir -p "log/$(hostname)"
+
+./make-size.sh 0
+
+# Rebuild and run one configuration. Arguments are name=value pairs: nr_ranks numa_rank numa_in numa_out numa_cpu input_size.
+run_benchmark_nmc() {
+	local "$@" text_div8
+	set -eo pipefail # pipefail: a failing size(1) must abort the run instead of silently producing an empty -I value
+	sudo limit_ranks_to_numa_node "${numa_rank}"
+	make -B NR_RANKS="${nr_ranks}" NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1
+	text_div8=$(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') # size of the .text section divided by 8
+	bin/host_code -a "${numa_in}" -b "${numa_out}" -c "${numa_cpu}" -w 0 -e 20 -x 0 -N 0 -I "${text_div8:?no .text section found in bin/dpu_code}" -i "${input_size}"
+}
+export -f run_benchmark_nmc # the shells started by parallel must be able to call the function
+
+# The benchmark allocates 3 * 64 * nr_ranks * 8 B * input_size (one array for input, one array for output; NOTE(review): that lists two arrays but the factor is 3 -- confirm).
+# With 1048576 elements (8 MiB per DPU), this gives a maximum allocation of 60 GiB at 40 ranks, which will fit comfortably into system memory (128 GiB).
+# With 2097152 elements (16 MiB per DPU), we may encounter OoM conditions, since the UPMEM SDK also allocates some memory.
+
+for sdk in 2025.1.0-orig 2025.1.0-notransform; do
+	source "/opt/upmem/transformation-benchmarks/${sdk}/upmem_env.sh"
+	fn="log/$(hostname)/upvec-${sdk}"
+
+	parallel -j1 --eta --joblog "${fn}.joblog" --resume --header : \
+		run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \
+		::: numa_rank any \
+		::: numa_in 1 \
+		::: numa_out 1 \
+		::: numa_cpu 1 \
+		::: nr_ranks $(seq 1 40) \
+		::: input_size 1048576 \
+	>> "${fn}.txt"
+done
diff --git a/Microbenchmarks/CPU-DPU/util/upvec-read.pdf b/Microbenchmarks/CPU-DPU/util/upvec-read.pdf
new file mode 100644
index 0000000..63af6cc
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/util/upvec-read.pdf
diff --git a/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance b/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance
new file mode 100755
index 0000000..b175b8d
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance
@@ -0,0 +1,41 @@
+#!/bin/sh
+# Model the upvec benchmark results of both SDK variants with analyze-log.py
+# and rebuild util/upvec-read.pdf and util/upvec-write.pdf from the exported data.
+# All paths are relative: run this from the directory that contains log/ and util/.
+
+data=$(mktemp -d) || exit 1
+# Remove the scratch directory on every exit path, including an interrupted analysis.
+trap 'rm -rf "${data}"' EXIT
+trap 'exit 1' HUP INT TERM
+
+# analyze VARIANT
+# Runs analyze-log.py on log/tinos/upvec-2025.1.0-VARIANT.txt; exported data files get the prefix ${data}/VARIANT-.
+analyze() {
+	DFATOOL_ULS_MIN_BOUND=0 DFATOOL_PARAM_RELEVANCE_THRESHOLD=0.9 DFATOOL_ULS_FUNCTIONS=roofline,linear \
+	analyze-log.py \
+	--filter-param='n_elements_per_dpu=1048576' \
+	--ignore-param='n_dpus,n_nops,numa_node_cpu,numa_node_in,numa_node_out,numa_node_rank,n_elements' \
+	--filter-observation='NMC-transfer:throughput_dram_mram_Bps,NMC-transfer:throughput_mram_dram_Bps' \
+	--normalize-nfp='writeThroughputGBps=throughput_dram_mram_Bps=/1e9;readThroughputGBps=throughput_mram_dram_Bps=/1e9' \
+	--export-pgf-unparam "${data}/$1-" \
+	--cross-validate=kfold:10 --progress \
+	--show-model=param --show-model-error --show-model-precision=6 \
+	"log/tinos/upvec-2025.1.0-$1.txt"
+}
+
+echo
+echo SDK with transformation
+echo
+analyze orig
+
+echo
+echo SDK without transformation
+echo
+analyze notransform
+
+# Compile each plot inside the scratch directory and copy only the resulting PDF back to util/.
+for op in read write; do
+	cp "util/upvec-${op}.tex" "${data}"
+	lualatex -output-directory "${data}" "${data}/upvec-${op}"
+	cp "${data}/upvec-${op}.pdf" util
+done
diff --git a/Microbenchmarks/CPU-DPU/util/upvec-write.tex b/Microbenchmarks/CPU-DPU/util/upvec-write.tex
new file mode 100644
index 0000000..f6d7bf5
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/util/upvec-write.tex
@@ -0,0 +1,38 @@
+\documentclass{standalone}
+% Write throughput over the number of ranks: upstream SDK (red) versus the SDK without transformation (blue).
+\usepackage[T1]{fontenc}
+\usepackage[default]{opensans}
+\usepackage[scaled]{beramono}
+% Each data set is drawn twice: a fitted curve (thick line) and the individual measurements (faint marks).
+\usepackage{tikz}
+\usepackage{pgfplots}
+\pgfplotsset{compat=1.18}
+\usepgfplotslibrary{statistics}
+% NOTE(review): the two table files are presumably the --export-pgf-unparam output of util/upvec-transformation-relevance -- confirm.
+\begin{document}
+	\begin{tikzpicture}
+		\begin{axis}[
+				ylabel={write [GB/s]},
+				xlabel={\# Ranks},
+				x label style={font=\footnotesize, yshift=2mm},
+				y label style={font=\footnotesize},
+				tick label style={/pgf/number format/assume math mode=true},
+				title={Benchmark Data},
+				title style={yshift=-3mm},
+				legend style={font=\footnotesize, legend columns=-1, column sep=1ex},
+				legend pos=south east,
+				legend entries={upstream,,{no transformation},}, % entries follow plot order; the empty ones line up with the two marks-only plots
+				reverse legend,
+				ymin=0,ymax=14,
+				xmin=0,xmax=41,
+				width=90mm,height=45mm
+			]
+			\addplot[thick, color=red, domain=1:40] {5.042768 + 0.258673 * min(x, 31.536102)}; % fitted curve, upstream SDK -- NOTE(review): coefficients look hand-copied from the model output; re-check after new measurements
+			\addplot[color=red,only marks,mark=*,mark size=0.9,opacity=.05]
+				table[x=n_ranks, y=value] {orig-NMC-transfer-writeThroughputGBps.txt};
+			\addplot[thick, color=blue, domain=1:40] {5.049962 + 0.308594 * min(x, 25.657012)}; % fitted curve, SDK without transformation -- same caveat as above
+			\addplot[color=blue,only marks,mark=*,mark size=0.9,opacity=.05]
+				table[x=n_ranks, y=value] {notransform-NMC-transfer-writeThroughputGBps.txt};
+		\end{axis}
+	\end{tikzpicture}
+\end{document}
author	Birte Kristina Friesel <birte.friesel@uos.de>	2025-05-16 14:15:10 +0200
committer	Birte Kristina Friesel <birte.friesel@uos.de>	2025-05-16 14:15:39 +0200
commit	3e9a46a8b16668ed69f2b8a155246e8878e1ca2a (patch)
tree	c44677aa5945458463a80348f9129d821d293d7f
parent	20f20b7bd24a70fde8074af42bb5d1c18996081c (diff)