diff options
| -rwxr-xr-x | HST-L/benchmark-scripts/ccmcc25-sim.sh | 25 | ||||
| -rwxr-xr-x | HST-L/benchmark-scripts/ccmcc25.sh | 31 | ||||
| -rwxr-xr-x | Microbenchmarks/CPU-DPU/splc25-alloc.sh | 35 | ||||
| -rwxr-xr-x | Microbenchmarks/CPU-DPU/splc25-transfer.sh | 36 | ||||
| -rw-r--r-- | VA/baselines/cpu/Makefile | 2 | ||||
| -rw-r--r-- | VA/baselines/cpu/app_baseline.c | 10 |
6 files changed, 134 insertions, 5 deletions
diff --git a/HST-L/benchmark-scripts/ccmcc25-sim.sh b/HST-L/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..34e841a --- /dev/null +++ b/HST-L/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + dfatool_timing=0 aspectc=1 aspectc_timing=1 + bin/host_code -w 0 -e 5 -b ${bin_size} -i ${input_size} +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks HST-L $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} bin_size=256 \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: input_size $((256 * 256)) $((512 * 512)) $((768 * 768)) $((1024 * 1024)) \ +>> ${fn}.txt diff --git a/HST-L/benchmark-scripts/ccmcc25.sh b/HST-L/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..1c939f1 --- /dev/null +++ b/HST-L/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + dfatool_timing=0 aspectc=1 aspectc_timing=1 + bin/host_code -w 0 -e 50 -b ${bin_size} -i ${input_size} +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks HST-L $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} bin_size=256 numa_rank={numa_rank} \ + ::: numa_rank any \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: input_size $((1024 * 1024)) $((1536 * 1024)) $((2048 * 1024)) \ + >> ${fn}.txt + +done diff --git a/Microbenchmarks/CPU-DPU/splc25-alloc.sh b/Microbenchmarks/CPU-DPU/splc25-alloc.sh new file mode 100755 index 0000000..6f4f055 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/splc25-alloc.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + ./make-size.sh ${size} + n_nops=$((size * 256)) + if make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\' NUMA=1; then + for l in $(seq 1 20); do + bin/host_code -c ${numa_cpu} -w 1 -e 0 -x 1 -i 65536 -N $n_nops -I $(size -A bin/dpu_size | awk '($1 == ".text") {print $2/8}') + done + fi + return $? +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + fn=log/$(hostname)/splc25-alloc-${sdk} + + parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_cpu={numa_cpu} size={size} \ + ::: i $(seq 1 5) \ + ::: numa_rank -1 \ + ::: numa_cpu 0 1 \ + ::: nr_ranks $(seq 1 40) \ + ::: size $(seq 0 15) \ + >> ${fn}.txt + +done diff --git a/Microbenchmarks/CPU-DPU/splc25-transfer.sh b/Microbenchmarks/CPU-DPU/splc25-transfer.sh new file mode 100755 index 0000000..0227cab --- /dev/null +++ b/Microbenchmarks/CPU-DPU/splc25-transfer.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +./make-size.sh 0 + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1 + bin/host_code -a $numa_in -b $numa_out -c $numa_cpu -w 0 -e 5 -x 0 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i ${input_size} + return $? +} + +export -f run_benchmark_nmc + +# The benchmark allocates 3 * 64 * nr_ranks * 8B * input_size (one array for input, one array for output). +# With 1048576 elements (8 MiB per DPU), this gives a maximum allocation of 60 GiB, which will fit comfortably into system memory (128 GiB). +# With 2097152 elements (16 MiB per DPU), we may encounter OoM conditions, since the UPMEM SDK also allocates some memory. + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + fn=log/$(hostname)/splc25-transfer-${sdk} + + parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \ + ::: i $(seq 1 10) \ + ::: numa_rank -1 \ + ::: numa_in 1 \ + ::: numa_out 1 \ + ::: numa_cpu 1 \ + ::: nr_ranks $(seq 1 40) \ + ::: input_size 1 1048576 \ + >> ${fn}.txt +done diff --git a/VA/baselines/cpu/Makefile b/VA/baselines/cpu/Makefile index 04aacb6..279b0f3 100644 --- a/VA/baselines/cpu/Makefile +++ b/VA/baselines/cpu/Makefile @@ -5,7 +5,7 @@ nop_sync ?= 0 numa ?= 0 numa_memcpy ?= 0 -CFLAGS = +CFLAGS = -DDFATOOL_TIMING=1 LDFLAGS = ifeq (${debug}, 1) diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c index 7975200..fe5125d 100644 --- a/VA/baselines/cpu/app_baseline.c +++ b/VA/baselines/cpu/app_baseline.c @@ -15,7 +15,7 @@ #include <omp.h> #if WITH_BENCHMARK -#include "../../support/timer.h" +#include "../../include/timer.h" #else #define start(...) #define stop(...) @@ -109,7 +109,7 @@ struct Params input_params(int argc, char **argv) p.n_warmup = 1; p.n_reps = 3; p.exp = 1; - p.n_threads = 5; + p.n_threads = 8; #if NUMA p.bitmask_in = NULL; p.bitmask_out = NULL; @@ -213,9 +213,11 @@ int main(int argc, char **argv) C = (T *) malloc(input_size * sizeof(T)); #endif + omp_set_num_threads(p.n_threads); + #pragma omp parallel for for (unsigned long i = 0; i < input_size; i++) { - A[i] = (T) (rand()); - B[i] = (T) (rand()); + A[i] = (T) i % (1<<31) + 5; + B[i] = (T) i % (1<<31) + 6; } #if NUMA |
