2 files changed, 71 insertions, 0 deletions
diff --git a/Microbenchmarks/CPU-DPU/splc25-alloc.sh b/Microbenchmarks/CPU-DPU/splc25-alloc.sh
new file mode 100755
index 0000000..6f4f055
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/splc25-alloc.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Per-host log directory; the joblogs and result files of the sweep below are written into it.
+mkdir -p "log/$(hostname)"
+
+run_benchmark_nmc() { # One sweep point: pin ranks, build a DPU binary for ${size}, run host_code 20 times.
+	local "$@" l n_nops text_div8 # key=value arguments from parallel: nr_ranks numa_rank numa_cpu size
+	set -e # exit on the first failing command (the make status is handled explicitly below)
+	sudo limit_ranks_to_numa_node "${numa_rank}"
+	./make-size.sh "${size}"
+	n_nops=$((size * 256))
+	# Propagate a failed build: an `if make ...; fi` without else yields status 0 when make fails.
+	make -B "NR_RANKS=${nr_ranks}" NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\' NUMA=1 || return
+	text_div8=$(size -A bin/dpu_size | awk '($1 == ".text") {print $2/8}') # .text section size / 8; constant across runs
+	for l in $(seq 1 20); do
+		bin/host_code -c "${numa_cpu}" -w 1 -e 0 -x 1 -i 65536 -N "${n_nops}" -I "${text_div8}"
+	done
+}
+
+export -f run_benchmark_nmc # exported so that the shells started by parallel can call it
+# Sweep every parameter combination once per SDK release; output is appended to ${fn}.txt.
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+	# Skip SDKs that are not installed; otherwise the sweep would run with the previous environment under this SDK's name.
+	[[ -r "/opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh" ]] || { printf 'skipping SDK %s: upmem_env.sh not readable\n' "${sdk}" >&2; continue; }
+	source "/opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh"
+	fn="log/$(hostname)/splc25-alloc-${sdk}"
+	# --header : names each input source after its first value; `i` is not passed on and only repeats each combination.
+	parallel -j1 --eta --joblog "${fn}.1.joblog" --resume --header : \
+		run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_cpu={numa_cpu} size={size} \
+		::: i $(seq 1 5) \
+		::: numa_rank -1 \
+		::: numa_cpu 0 1 \
+		::: nr_ranks $(seq 1 40) \
+		::: size $(seq 0 15) \
+	>> "${fn}.txt"
+done
diff --git a/Microbenchmarks/CPU-DPU/splc25-transfer.sh b/Microbenchmarks/CPU-DPU/splc25-transfer.sh
new file mode 100755
index 0000000..0227cab
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/splc25-transfer.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Per-host log directory; the joblogs and result files of the sweep below are written into it.
+mkdir -p "log/$(hostname)"
+# One-time setup for size 0; stop here if it fails instead of benchmarking a stale setup (TODO confirm what make-size.sh generates).
+./make-size.sh 0 || exit 1
+
+run_benchmark_nmc() { # One sweep point: pin ranks, rebuild with TRANSFER=PUSH, run host_code once.
+	local "$@" text_div8 # key=value arguments from parallel: nr_ranks numa_rank numa_in numa_out numa_cpu input_size
+	set -e # exit on the first failing command; the status of host_code is the status of the job
+	sudo limit_ranks_to_numa_node "${numa_rank}"
+	make -B "NR_RANKS=${nr_ranks}" NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1
+	text_div8=$(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') # .text section size of the DPU binary / 8
+	bin/host_code -a "${numa_in}" -b "${numa_out}" -c "${numa_cpu}" -w 0 -e 5 -x 0 -N 0 -I "${text_div8}" -i "${input_size}"
+}
+
+export -f run_benchmark_nmc # exported so that the shells started by parallel can call it
+
+# The benchmark allocates 3 * 64 * nr_ranks * input_size * 8 B of host memory (64 DPUs per rank, 8 B per element; one array for input, one array for output -- the factor 3 suggests a third buffer, TODO confirm).
+# With 1048576 elements (8 MiB per DPU) and up to 40 ranks, this gives a maximum allocation of 60 GiB, which will fit comfortably into system memory (128 GiB).
+# With 2097152 elements (16 MiB per DPU), we may encounter out-of-memory (OOM) conditions, since the UPMEM SDK also allocates some memory.
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do # one joblog/result pair per SDK release
+	[[ -r "/opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh" ]] || { printf 'skipping SDK %s: upmem_env.sh not readable\n' "${sdk}" >&2; continue; } # never run with a stale environment
+	source "/opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh"
+	fn="log/$(hostname)/splc25-transfer-${sdk}" # `i` below is not passed on; it only repeats each combination
+	parallel -j1 --eta --joblog "${fn}.1.joblog" --resume --header : \
+		run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \
+		::: i $(seq 1 10) \
+		::: numa_rank -1 \
+		::: numa_in 1 \
+		::: numa_out 1 \
+		::: numa_cpu 1 \
+		::: nr_ranks $(seq 1 40) \
+		::: input_size 1 1048576 \
+	>> "${fn}.txt"
+done