summaryrefslogtreecommitdiff
path: root/GEMV/dimes-hetsim-hbm.sh
diff options
context:
space:
mode:
Diffstat (limited to 'GEMV/dimes-hetsim-hbm.sh')
-rwxr-xr-xGEMV/dimes-hetsim-hbm.sh47
1 files changed, 30 insertions, 17 deletions
diff --git a/GEMV/dimes-hetsim-hbm.sh b/GEMV/dimes-hetsim-hbm.sh
index ad6895e..bbba9e0 100755
--- a/GEMV/dimes-hetsim-hbm.sh
+++ b/GEMV/dimes-hetsim-hbm.sh
@@ -1,10 +1,9 @@
#!/bin/bash
cd baselines/cpu
-make -B NUMA=1
mkdir -p log/$(hostname)
-fn=log/$(hostname)/$(date +%Y%m%d)
+fn=log/$(hostname)/dimes-hetsim-hbm
# upstream DPU version uses -m 163840 -n 4096
# → (163840 * 4096 + 4096) * uint32 ≈ 2.5 GiB
@@ -14,7 +13,7 @@ fn=log/$(hostname)/$(date +%Y%m%d)
run_benchmark() {
local "$@"
- OMP_NUM_THREADS=$nr_threads ./gemv $ram $ram $cpu
+ OMP_NUM_THREADS=$nr_threads ./gemv $ram_in $ram_out $cpu $ram_local $cpu_memcpy
return $?
}
@@ -22,22 +21,36 @@ export -f run_benchmark
(
-echo "CPU single-node (1/2)" >&2
+make -B NUMA=1 NUMA_MEMCPY=1
-parallel -j1 --eta --joblog ${fn}.1.joblog --header : \
- run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
- ::: cpu $(seq 0 7) \
- ::: ram $(seq 0 15) \
- ::: nr_threads 1 2 4 8 12 16
+echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/3)" >&2
-echo "CPU multi-node (2/2)" >&2
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ run_benchmark nr_threads={nr_threads} ram_in={ram_in} ram_out={ram_out} ram_local={ram_local} cpu={cpu} cpu_memcpy={cpu_memcpy} \
+ ::: nr_threads 1 2 4 8 12 16 \
+ ::: ram_in $(seq 0 15) \
+ :::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \
+ ::: ram_local $(seq 0 15) \
+ :::+ cpu $(seq 0 7) $(seq 0 7) \
+ :::+ ram_out $(seq 0 15)
-parallel -j1 --eta --joblog ${fn}.2.joblog --header : \
- run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \
- ::: cpu -1 \
- ::: ram $(seq 0 15) \
- ::: nr_threads 32 48 64 96 128
+make -B NUMA=1
-) > ${fn}.txt
+echo "single-node execution, cpu/out on same node (2/3)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
+ run_benchmark nr_threads={nr_threads} ram_in={ram_in} ram_out={ram_out} cpu={cpu} \
+ ::: nr_threads 1 2 4 8 12 16 \
+ ::: ram_in $(seq 0 15) \
+ ::: cpu $(seq 0 7) $(seq 0 7) \
+ :::+ ram_out $(seq 0 15)
+
+echo "multi-node execution (3/3)" >&2
+parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \
+ run_benchmark nr_threads={nr_threads} ram_in={ram_in} ram_out={ram_out} cpu={cpu} \
+ ::: nr_threads 32 48 64 96 128 \
+ ::: cpu -1 \
+ ::: ram_in $(seq 0 15) \
+ ::: ram_out $(seq 0 15)
-xz -f -v -9 -M 800M ${fn}.txt
+) >> ${fn}.txt