summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xHST-L/benchmark-scripts/ccmcc25-sim.sh25
-rwxr-xr-xHST-L/benchmark-scripts/ccmcc25.sh31
-rwxr-xr-xMicrobenchmarks/CPU-DPU/splc25-alloc.sh35
-rwxr-xr-xMicrobenchmarks/CPU-DPU/splc25-transfer.sh36
-rw-r--r--VA/baselines/cpu/Makefile2
-rw-r--r--VA/baselines/cpu/app_baseline.c10
6 files changed, 134 insertions, 5 deletions
diff --git a/HST-L/benchmark-scripts/ccmcc25-sim.sh b/HST-L/benchmark-scripts/ccmcc25-sim.sh
new file mode 100755
index 0000000..34e841a
--- /dev/null
+++ b/HST-L/benchmark-scripts/ccmcc25-sim.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+# Rebuild the benchmark for one configuration and run bin/host_code once; arguments are key=value pairs.
+run_benchmark_nmc() {
+ local "$@" # each key=value argument becomes a function-local variable
+ set -e # stop at the first failing command (make or host_code)
+ make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+ dfatool_timing=0 aspectc=1 aspectc_timing=1
+ bin/host_code -w 0 -e 5 -b ${bin_size} -i ${input_size}
+}
+# Driver: run every nr_dpus x input_size combination sequentially, appending all output to one log file.
+export -f run_benchmark_nmc # parallel runs the function in a child shell
+# Nothing in this script used to set sdk, so ${sdk} in the log name expanded to nothing; pin it to the SDK sourced below.
+sdk=2025.1.0
+fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim
+source ~/lib/local/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh simulator
+# Provenance header: ref description, commit hash and timestamp of this run.
+echo "prim-benchmarks HST-L $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt
+# -j1 runs one configuration at a time; --resume skips jobs already recorded in the joblog.
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} bin_size=256 \
+ ::: nr_dpus 1 2 4 8 16 32 48 64 \
+ ::: input_size $((256 * 256)) $((512 * 512)) $((768 * 768)) $((1024 * 1024)) \
+>> ${fn}.txt
diff --git a/HST-L/benchmark-scripts/ccmcc25.sh b/HST-L/benchmark-scripts/ccmcc25.sh
new file mode 100755
index 0000000..1c939f1
--- /dev/null
+++ b/HST-L/benchmark-scripts/ccmcc25.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+# Rebuild the benchmark for one configuration and run bin/host_code once; arguments are key=value pairs.
+run_benchmark_nmc() {
+ local "$@" # each key=value argument becomes a function-local variable
+ set -e # stop at the first failing command
+ sudo limit_ranks_to_numa_node ${numa_rank} # external helper, not defined in this script
+ make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \
+ dfatool_timing=0 aspectc=1 aspectc_timing=1
+ bin/host_code -w 0 -e 50 -b ${bin_size} -i ${input_size}
+}
+# Driver: repeat the full nr_dpus x input_size sweep once per listed UPMEM SDK version.
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+ # One log file and one joblog per SDK version.
+ fn=log/$(hostname)/ccmcc25-sdk${sdk}
+ # Source the environment script shipped with this SDK version.
+ source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+ # Provenance header: ref description, commit hash and timestamp of this run.
+ echo "prim-benchmarks HST-L $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt
+ # -j1 runs one configuration at a time; --resume skips jobs already recorded in the joblog.
+ parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} bin_size=256 numa_rank={numa_rank} \
+ ::: numa_rank any \
+ ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+ ::: input_size $((1024 * 1024)) $((1536 * 1024)) $((2048 * 1024)) \
+ >> ${fn}.txt
+
+done
diff --git a/Microbenchmarks/CPU-DPU/splc25-alloc.sh b/Microbenchmarks/CPU-DPU/splc25-alloc.sh
new file mode 100755
index 0000000..6f4f055
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/splc25-alloc.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+# Build the DPU binary for one configuration and, if the build succeeds, run bin/host_code 20 times.
+run_benchmark_nmc() {
+ local "$@" # each key=value argument becomes a function-local variable
+ set -e
+ sudo limit_ranks_to_numa_node ${numa_rank} # external helper, not defined in this script
+ ./make-size.sh ${size} # external helper; presumably generates the DPU program for this size step -- verify
+ n_nops=$((size * 256)) # not declared local; handed to host_code as -N
+ if make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\' NUMA=1; then # as the if-condition, a failing make does not trigger set -e; the runs are skipped
+ for l in $(seq 1 20); do # $(size -A ...) below runs the size(1) command, unrelated to ${size}; -I receives the .text size divided by 8
+ bin/host_code -c ${numa_cpu} -w 1 -e 0 -x 1 -i 65536 -N $n_nops -I $(size -A bin/dpu_size | awk '($1 == ".text") {print $2/8}')
+ done
+ fi
+ return $? # NOTE(review): status of the if statement, which is 0 when make failed and no branch ran
+}
+# Driver: repeat the full parameter sweep once per listed UPMEM SDK version.
+export -f run_benchmark_nmc
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do
+ # Source the environment script shipped with this SDK version; output and joblog are kept per SDK.
+ source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+ fn=log/$(hostname)/splc25-alloc-${sdk}
+ # "i" is not forwarded to run_benchmark_nmc; it only makes parallel run each configuration 5 times.
+ parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_cpu={numa_cpu} size={size} \
+ ::: i $(seq 1 5) \
+ ::: numa_rank -1 \
+ ::: numa_cpu 0 1 \
+ ::: nr_ranks $(seq 1 40) \
+ ::: size $(seq 0 15) \
+ >> ${fn}.txt
+
+done
diff --git a/Microbenchmarks/CPU-DPU/splc25-transfer.sh b/Microbenchmarks/CPU-DPU/splc25-transfer.sh
new file mode 100755
index 0000000..0227cab
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/splc25-transfer.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+
+./make-size.sh 0
+# Rebuild the benchmark for one configuration and run bin/host_code once; arguments are key=value pairs.
+run_benchmark_nmc() {
+ local "$@" # each key=value argument becomes a function-local variable
+ set -e # stop at the first failing command
+ sudo limit_ranks_to_numa_node ${numa_rank} # external helper, not defined in this script
+ make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1
+ bin/host_code -a $numa_in -b $numa_out -c $numa_cpu -w 0 -e 5 -x 0 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i ${input_size}
+ return $? # exit status of host_code; under set -e this line is normally reached only after it succeeded
+}
+
+export -f run_benchmark_nmc
+
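+# The benchmark allocates 3 * 64 * nr_ranks * 8 B * input_size bytes in total (one array for input, one array for output; NOTE(review): the factor 3 implies a third array that is not named here -- confirm).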
+# With 1048576 elements (8 MiB per DPU), this gives a maximum allocation of 60 GiB, which will fit comfortably into system memory (128 GiB).
+# With 2097152 elements (16 MiB per DPU), we may encounter OoM conditions, since the UPMEM SDK also allocates some memory.
+
+for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do # one full sweep per listed SDK version
+ source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh
+ fn=log/$(hostname)/splc25-transfer-${sdk}
+ # "i" is not forwarded to run_benchmark_nmc; it only makes parallel run each configuration 10 times.
+ parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \
+ ::: i $(seq 1 10) \
+ ::: numa_rank -1 \
+ ::: numa_in 1 \
+ ::: numa_out 1 \
+ ::: numa_cpu 1 \
+ ::: nr_ranks $(seq 1 40) \
+ ::: input_size 1 1048576 \
+ >> ${fn}.txt
+done
diff --git a/VA/baselines/cpu/Makefile b/VA/baselines/cpu/Makefile
index 04aacb6..279b0f3 100644
--- a/VA/baselines/cpu/Makefile
+++ b/VA/baselines/cpu/Makefile
@@ -5,7 +5,7 @@ nop_sync ?= 0
numa ?= 0
numa_memcpy ?= 0
-CFLAGS =
+CFLAGS = -DDFATOOL_TIMING=1
LDFLAGS =
ifeq (${debug}, 1)
diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c
index 7975200..fe5125d 100644
--- a/VA/baselines/cpu/app_baseline.c
+++ b/VA/baselines/cpu/app_baseline.c
@@ -15,7 +15,7 @@
#include <omp.h>
#if WITH_BENCHMARK
-#include "../../support/timer.h"
+#include "../../include/timer.h"
#else
#define start(...)
#define stop(...)
@@ -109,7 +109,7 @@ struct Params input_params(int argc, char **argv)
p.n_warmup = 1;
p.n_reps = 3;
p.exp = 1;
- p.n_threads = 5;
+ p.n_threads = 8;
#if NUMA
p.bitmask_in = NULL;
p.bitmask_out = NULL;
@@ -213,9 +213,11 @@ int main(int argc, char **argv)
C = (T *) malloc(input_size * sizeof(T));
#endif
+ omp_set_num_threads(p.n_threads);
+ #pragma omp parallel for
for (unsigned long i = 0; i < input_size; i++) {
- A[i] = (T) (rand());
- B[i] = (T) (rand());
+ A[i] = (T) i % (1<<31) + 5;
+ B[i] = (T) i % (1<<31) + 6;
}
#if NUMA