summary | refs | log | tree | commit | diff
path: root/HST-S/dimes-hetsim-hbm.sh
diff options
context:
space:
mode:
Diffstat (limited to 'HST-S/dimes-hetsim-hbm.sh')
-rwxr-xr-x HST-S/dimes-hetsim-hbm.sh 60
1 files changed, 30 insertions, 30 deletions
diff --git a/HST-S/dimes-hetsim-hbm.sh b/HST-S/dimes-hetsim-hbm.sh
index 08842b7..44c191d 100755
--- a/HST-S/dimes-hetsim-hbm.sh
+++ b/HST-S/dimes-hetsim-hbm.sh
@@ -1,51 +1,51 @@
-#!/bin/bash
+#!/bin/sh
cd baselines/cpu
-make -B NUMA=1
mkdir -p log/$(hostname)
-fn=log/$(hostname)/$(date +%Y%m%d)
-
-# Upstream DPU version uses 256 bins and 1536 * 1024 * 64 uint32 elements == 384 MiB total (-x 2 with implicit -z 64)
-input_size_upstream=$((1536 * 1024 * 64))
+fn=log/$(hostname)/dimes-hetsim-hbm
-# Here: 2 GiB
-input_size_dpu=$((2**29))
+# Input: (2^29 == 536870912) * int32 == 2 GiB
(
-echo "single-node execution, upstream ref (1/4)" >&2
+make -B NUMA=1 NUMA_MEMCPY=1
+
+echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/3)" >&2
parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
- ./hist -i ${input_size_upstream} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 5 -x 1 \
+ ./hist -i {input_size} -b 256 -A {ram_in} -B {ram_out} -C {cpu} -D {ram_local} -M {cpu_memcpy} -t {nr_threads} \
+ ::: i $(seq 1 5) \
 ::: nr_threads 1 2 4 8 12 16 \
- ::: cpu $(seq 0 7) \
- ::: ram $(seq 0 15)
+ ::: ram_in $(seq 0 15) \
+ :::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \
+ ::: ram_local $(seq 0 15) \
+ :::+ cpu $(seq 0 7) $(seq 0 7) \
+ :::+ ram_out $(seq 0 15) \
+ ::: input_size 536870912
+
+make -B NUMA=1
-echo "single-node execution, DPU ref (2/4)" >&2
+echo "single-node execution, cpu/out on same node (2/3)" >&2
parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
- ./hist -i ${input_size_dpu} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 5 -x 1 \
+ ./hist -i {input_size} -b 256 -A {ram_in} -B {ram_out} -C {cpu} -t {nr_threads} \
+ ::: i $(seq 1 5) \
::: nr_threads 1 2 4 8 12 16 \
- ::: cpu $(seq 0 7) \
- ::: ram $(seq 0 15)
+ ::: ram_in $(seq 0 15) \
+ ::: cpu $(seq 0 7) $(seq 0 7) \
+ :::+ ram_out $(seq 0 15) \
+ ::: input_size 536870912
-echo "multi-node execution, upstream ref (3/4)" >&2
+echo "multi-node execution (3/3)" >&2
parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \
- ./hist -i ${input_size_upstream} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 40 -x 1 \
+ ./hist -i {input_size} -b 256 -A {ram_in} -B {ram_out} -C {cpu} -t {nr_threads} \
+ ::: i $(seq 1 20) \
::: nr_threads 32 48 64 96 128 \
::: cpu -1 \
- ::: ram $(seq 0 15)
-
-echo "multi-node execution, DPU ref (4/4)" >&2
-
-parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \
- ./hist -i ${input_size_dpu} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 40 -x 1 \
- ::: nr_threads 32 48 64 96 128 \
- ::: cpu -1 \
- ::: ram $(seq 0 15)
-
-) > ${fn}.txt
+ ::: ram_in $(seq 0 15) \
+ ::: ram_out $(seq 0 15) \
+ ::: input_size 536870912
-xz -f -v -9 -M 800M ${fn}.txt
+) >> ${fn}.txt