diff options
Diffstat (limited to 'GEMV/dimes-hetsim-hbm.sh')
-rwxr-xr-x | GEMV/dimes-hetsim-hbm.sh | 47 |
1 files changed, 30 insertions, 17 deletions
diff --git a/GEMV/dimes-hetsim-hbm.sh b/GEMV/dimes-hetsim-hbm.sh index ad6895e..bbba9e0 100755 --- a/GEMV/dimes-hetsim-hbm.sh +++ b/GEMV/dimes-hetsim-hbm.sh @@ -1,10 +1,9 @@ #!/bin/bash cd baselines/cpu -make -B NUMA=1 mkdir -p log/$(hostname) -fn=log/$(hostname)/$(date +%Y%m%d) +fn=log/$(hostname)/dimes-hetsim-hbm # upstream DPU version uses -m 163840 -n 4096 # → (163840 * 4096 + 4096) * uint32 ≈ 2.5 GiB @@ -14,7 +13,7 @@ fn=log/$(hostname)/$(date +%Y%m%d) run_benchmark() { local "$@" - OMP_NUM_THREADS=$nr_threads ./gemv $ram $ram $cpu + OMP_NUM_THREADS=$nr_threads ./gemv $ram_in $ram_out $cpu $ram_local $cpu_memcpy return $? } @@ -22,22 +21,36 @@ export -f run_benchmark ( -echo "CPU single-node (1/2)" >&2 +make -B NUMA=1 NUMA_MEMCPY=1 -parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ - run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \ - ::: cpu $(seq 0 7) \ - ::: ram $(seq 0 15) \ - ::: nr_threads 1 2 4 8 12 16 +echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/3)" >&2 -echo "CPU multi-node (2/2)" >&2 +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark nr_threads={nr_threads} ram_in={ram_in} ram_out={ram_out} ram_local={ram_local} cpu={cpu} cpu_memcpy={cpu_memcpy} \ + ::: nr_threads 1 2 4 8 12 16 \ + ::: ram_in $(seq 0 15) \ + :::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \ + ::: ram_local $(seq 0 15) \ + :::+ cpu $(seq 0 7) $(seq 0 7) \ + :::+ ram_out $(seq 0 15) -parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ - run_benchmark nr_threads={nr_threads} ram={ram} cpu={cpu} \ - ::: cpu -1 \ - ::: ram $(seq 0 15) \ - ::: nr_threads 32 48 64 96 128 +make -B NUMA=1 -) > ${fn}.txt +echo "single-node execution, cpu/out on same node (2/3)" >&2 + +parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ + run_benchmark nr_threads={nr_threads} ram_in={ram_in} ram_out={ram_out} cpu={cpu} \ + ::: nr_threads 1 2 4 8 12 16 \ + ::: ram_in $(seq 0 15) \ + ::: cpu $(seq 0 7) $(seq 0 7) \ + :::+ ram_out $(seq 0 15) + +echo "multi-node execution (3/3)" >&2 +parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \ + run_benchmark nr_threads={nr_threads} ram_in={ram_in} ram_out={ram_out} cpu={cpu} \ + ::: nr_threads 32 48 64 96 128 \ + ::: cpu -1 \ + ::: ram_in $(seq 0 15) \ + ::: ram_out $(seq 0 15) -xz -f -v -9 -M 800M ${fn}.txt +) >> ${fn}.txt |