diff options
-rw-r--r-- | HST-S/baselines/cpu/app_baseline.c | 2 | ||||
-rwxr-xr-x | HST-S/dimes-hetsim-hbm.sh | 51 | ||||
-rwxr-xr-x | HST-S/dimes-hetsim-nmc.sh | 103 | ||||
-rwxr-xr-x | HST-S/dimes-hetsim.sh | 20 |
4 files changed, 155 insertions, 21 deletions
diff --git a/HST-S/baselines/cpu/app_baseline.c b/HST-S/baselines/cpu/app_baseline.c index 0c766e0..0e16944 100644 --- a/HST-S/baselines/cpu/app_baseline.c +++ b/HST-S/baselines/cpu/app_baseline.c @@ -288,7 +288,7 @@ int main(int argc, char **argv) { #pragma omp atomic nr_threads++; - printf("[::] HST-S CPU | n_threads=%d e_type=%s n_elements=%d n_bins=%d" + printf("[::] HST-S-CPU | n_threads=%d e_type=%s n_elements=%d n_bins=%d" #if NUMA " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" #endif diff --git a/HST-S/dimes-hetsim-hbm.sh b/HST-S/dimes-hetsim-hbm.sh new file mode 100755 index 0000000..08842b7 --- /dev/null +++ b/HST-S/dimes-hetsim-hbm.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +cd baselines/cpu +make -B NUMA=1 + +mkdir -p log/$(hostname) +fn=log/$(hostname)/$(date +%Y%m%d) + +# Upstream DPU version uses 256 bins and 1536 * 1024 * 64 uint32 elements == 384 MiB total (-x 2 with implicit -z 64) +input_size_upstream=$((1536 * 1024 * 64)) + +# Here: 2 GiB +input_size_dpu=$((2**29)) + +( + +echo "single-node execution, upstream ref (1/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + ./hist -i ${input_size_upstream} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 5 -x 1 \ + ::: nr_threads 1 2 4 8 12 16 \ + ::: cpu $(seq 0 7) \ + ::: ram $(seq 0 15) + +echo "single-node execution, DPU ref (2/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ + ./hist -i ${input_size_dpu} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 5 -x 1 \ + ::: nr_threads 1 2 4 8 12 16 \ + ::: cpu $(seq 0 7) \ + ::: ram $(seq 0 15) + +echo "multi-node execution, upstream ref (3/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \ + ./hist -i ${input_size_upstream} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 40 -x 1 \ + ::: nr_threads 32 48 64 96 128 \ + ::: cpu -1 \ + ::: ram $(seq 0 15) + +echo "multi-node execution, DPU ref (4/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \ + ./hist -i ${input_size_dpu} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 40 -x 1 \ + ::: nr_threads 32 48 64 96 128 \ + ::: cpu -1 \ + ::: ram $(seq 0 15) + +) > ${fn}.txt + +xz -f -v -9 -M 800M ${fn}.txt diff --git a/HST-S/dimes-hetsim-nmc.sh b/HST-S/dimes-hetsim-nmc.sh new file mode 100755 index 0000000..b3f0669 --- /dev/null +++ b/HST-S/dimes-hetsim-nmc.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +mkdir -p log/$(hostname) baselines/cpu/log/$(hostname) +fn=log/$(hostname)/$(date +%Y%m%d) + +# Upstream DPU version uses 256 bins and 1536 * 1024 * 64 uint32 elements == 384 MiB total (-x 2 with implicit -z 64) +input_size_upstream=$((1536 * 1024 * 64)) + +# Here: 2 GiB +input_size_dpu=$((2**29)) + +run_benchmark_nmc() { + local "$@" + sudo limit_ranks_to_numa_node ${numa_rank} + if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1; then + bin/host_code -w 0 -e 100 -b 256 -x 1 -i ${input_size} + fi + return $? +} + +export -f run_benchmark_nmc + +( + +echo "NMC single-node upstream-ref (1/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} \ + input_size=${input_size_upstream} \ + ::: numa_rank 0 1 \ + ::: nr_dpus 64 128 256 512 768 1024 + +echo "NMC multi-node upstream-ref (2/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} \ + input_size=${input_size_upstream} \ + ::: numa_rank -1 \ + ::: nr_dpus 1536 2048 2304 + +echo "NMC single-node DPU-ref (3/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} \ + input_size=${input_size_dpu} \ + ::: numa_rank 0 1 \ + ::: nr_dpus 64 128 256 512 768 1024 + +echo "NMC multi-node DPU-ref (4/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} \ + input_size=${input_size_dpu} \ + ::: numa_rank -1 \ + ::: nr_dpus 1536 2048 2304 + +) > ${fn}.txt + +xz -f -v -9 -M 800M ${fn}.txt + +cd baselines/cpu +make -B NUMA=1 + +( + +echo "CPU single-node upstream-ref (1/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + ./hist -i ${input_size_upstream} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 20 -x 1 \ + ::: cpu 0 1 \ + ::: ram 0 1 \ + ::: nr_threads 1 2 4 8 12 16 32 + +echo "CPU single-node DPU-ref (2/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ + ./hist -i ${input_size_dpu} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 20 -x 1 \ + ::: i $(seq 1 20) \ + ::: cpu 0 1 \ + ::: ram 0 1 \ + ::: nr_threads 1 2 4 8 12 16 32 + +echo "CPU multi-node upstream-ref (3/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \ + ./hist -i ${input_size_upstream} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 20 -x 1 \ + ::: i $(seq 1 20) \ + ::: cpu -1 \ + ::: ram 0 1 \ + ::: nr_threads 48 64 + +echo "CPU multi-node DPU-ref (4/4)" >&2 + +parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \ + ./hist -i ${input_size_dpu} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 20 -x 1 \ + ::: i $(seq 1 20) \ + ::: cpu -1 \ + ::: ram 0 1 \ + ::: nr_threads 48 64 + +) > ${fn}.txt + +xz -f -v -9 -M 800M ${fn}.txt diff --git a/HST-S/dimes-hetsim.sh b/HST-S/dimes-hetsim.sh deleted file mode 100755 index 54b660d..0000000 --- a/HST-S/dimes-hetsim.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh - -cd baselines/cpu -make -B NUMA=1 - -# upstream uses 1006632960 uint32 elements == 3.75 GiB -# This is way larger than the input file... - -for i in `seq 1 10`; do - for nr_threads in 1 2 4 8 12 16; do - for cpu in 0 1 2 3 4 5 6 7; do - for ram in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do - for j in `seq 1 10`; do - ./hist -A $ram -B $ram -C $cpu -t $nr_threads -w 0 -e 100 -x 2 - done - ./hist -i 1006632960 -A $ram -B $ram -C $cpu -t $nr_threads -w 0 -e 20 -x 2 - done - done - done -done |