summary | refs | log | tree | commit | diff
path: root/Microbenchmarks
diff options
context:
space:
mode:
author Birte Kristina Friesel <birte.friesel@uos.de> 2024-03-04 07:50:24 +0100
committer Birte Kristina Friesel <birte.friesel@uos.de> 2024-03-04 07:50:24 +0100
commit cb66c09298004a6dc6ac88300bee8441d4046a29 (patch)
tree bf9a74dc6391d29b87b824b3e825918d40235a09 /Microbenchmarks
parent a03a2f9fcc8bf83eb2ea0118d3abbfdfdbf5fb87 (diff)
CPU-DPU
Diffstat (limited to 'Microbenchmarks')
-rw-r--r-- Microbenchmarks/CPU-DPU/host/app.c 4
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-transfer-idle.sh 9
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-transfer-stress.sh 20
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-transfer.sh 108
4 files changed, 53 insertions, 88 deletions
diff --git a/Microbenchmarks/CPU-DPU/host/app.c b/Microbenchmarks/CPU-DPU/host/app.c
index 504d0de..7ec4884 100644
--- a/Microbenchmarks/CPU-DPU/host/app.c
+++ b/Microbenchmarks/CPU-DPU/host/app.c
@@ -172,7 +172,11 @@ int main(int argc, char **argv) {
if (rep >= p.n_warmup) {
printf("[::] transfer UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d n_instr=%d e_type=%s n_elements=%u n_elements_per_dpu=%u e_mode=%s"
" | latency_dram_mram_ns=%lu latency_mram_dram_ns=%lu throughput_dram_mram_Bps=%f throughput_mram_dram_Bps=%f",
+#ifdef BROADCAST
+ nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, p.n_instr, XSTR(T), transfer_size, transfer_size, transfer_mode,
+#else
nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, p.n_instr, XSTR(T), transfer_size, transfer_size / NR_DPUS, transfer_mode,
+#endif
timer.nanoseconds[1], timer.nanoseconds[3],
transfer_size * sizeof(T) * 1e9 / timer.nanoseconds[1],
transfer_size * sizeof(T) * 1e9 / timer.nanoseconds[3]);
diff --git a/Microbenchmarks/CPU-DPU/run-transfer-idle.sh b/Microbenchmarks/CPU-DPU/run-transfer-idle.sh
new file mode 100755
index 0000000..58751c7
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/run-transfer-idle.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+ts="$(date +%Y%m%d)"
+
+mkdir -p "$(hostname)-transfer"
+
+./run-transfer-rank.sh | tee "$(hostname)-transfer/${ts}-idle.txt"
+
+xz -f -v -9 -M 800M "$(hostname)-transfer/${ts}-idle.txt"
diff --git a/Microbenchmarks/CPU-DPU/run-transfer-stress.sh b/Microbenchmarks/CPU-DPU/run-transfer-stress.sh
new file mode 100755
index 0000000..a508bd6
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/run-transfer-stress.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+ts="$(date +%Y%m%d)"
+
+mkdir -p "$(hostname)-transfer"
+
+NCORES=$(grep -c '^processor' /proc/cpuinfo)
+cleanexit() {
+ pkill -f "stress -c ${NCORES}"
+}
+
+trap cleanexit TERM INT
+
+stress -c ${NCORES} &
+
+./run-transfer-rank.sh | tee "$(hostname)-transfer/${ts}-stress-c${NCORES}.txt"
+
+cleanexit
+
+xz -f -v -9 -M 800M "$(hostname)-transfer/${ts}-stress-c${NCORES}.txt"
diff --git a/Microbenchmarks/CPU-DPU/run-transfer.sh b/Microbenchmarks/CPU-DPU/run-transfer.sh
index e247105..d029c8c 100755
--- a/Microbenchmarks/CPU-DPU/run-transfer.sh
+++ b/Microbenchmarks/CPU-DPU/run-transfer.sh
@@ -1,100 +1,32 @@
-#!/bin/bash
-
-NCORES=$(grep -c '^processor' /proc/cpuinfo)
-
-cleanexit() {
- pkill -f "stress -c ${NCORES}"
- pkill -f mpstat
-}
-
-trap cleanexit TERM INT
+#!/bin/sh
set -e
-mkdir -p $(hostname)-transfer
-rm -f $(hostname)-transfer/*
-
-for f in $(hostname)-transfer/idle.txt $(hostname)-transfer/stress-c${NCORES}.txt $(hostname)-transfer/nice-stress-c${NCORES}.txt; do
- echo "prim-benchmarks CPU-DPU transfer (dfatool edition)" >> $f
- echo "Started at $(date)" >> $f
- echo "Revision $(git describe --always)" >> $f
-done
+echo "prim-benchmarks CPU-DPU transfer (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
./make-size.sh 0
-# runtime exclusive of host_code execution time: 25 seconds per l loop
-# *18 -> about 8 minutes per k loop
-# *3 -> about 23 minutes per i loop
-# *24 -> about 9 hours total
-for i in 1 2 4 8 16 32 48 64 80 96 112 $(seq 128 32 512); do
- for k in SERIAL PUSH BROADCAST; do
- # 8 B ... 64 MB
- for l in 1 16 256 4096 16384 65536 262144 1048576 2097152 4194304 6291456 8388608; do
- echo $i/512 $k $l/8388608
+completion=0
+for i in 1 4 8 $(seq 16 16 2543 | shuf); do
+ for k in BROADCAST; do
+ completion=$((completion+1))
+ echo "Running ${completion}/161 at $(date)"
+ # BROADCAST sends the same data to all DPUs, so data size must not exceed the amount of MRAM available on a single DPU (i.e., 64 MB)
+ for l in 4194304 6291456 8388608; do
make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k
-
- uptime
- S_TIME_FORMAT=ISO mpstat -P ALL -N ALL -n -o JSON 1 > $(hostname)-transfer/n_dpus=${i}-e_mode=$k-n_elements=${l}.json &
- bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/idle.txt || true
- pkill -f mpstat
-
- stress -c ${NCORES} &
- sleep 2
- uptime
- bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/stress-c${NCORES}.txt || true
- pkill -f "stress -c ${NCORES}"
-
- sleep 10
-
- stress -c ${NCORES} &
- sleep 2
- uptime
- bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/nice-stress-c${NCORES}.txt || true
- pkill -f "stress -c ${NCORES}"
-
- sleep 10
+ bin/host_code -w 0 -e 100 -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $l
done
done
-done
-
-# runtime exclusive of host_code execution time: 25 seconds per l loop
-# *9 -> about 4 minutes per k loop
-# *3 -> about 12 minutes per i loop
-# *65 -> about 13 hours total
-for i in $(seq 512 32 2542) 2542; do
- for k in SERIAL PUSH BROADCAST; do
- # 1 MB ... 1024 MB
- for l in 1048576 2097152 4194304 6291456 838868 1677736 3355472 6710944 13421888; do
- echo $i/2542 $k $l/13421888
- make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k
-
- uptime
- S_TIME_FORMAT=ISO mpstat -P ALL -N ALL -n -o JSON 1 > $(hostname)-transfer/n_dpus=${i}-e_mode=$k-n_elements=${l}.json &
- bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/idle.txt || true
- pkill -f mpstat
-
- stress -c ${NCORES} &
- sleep 2
- uptime
- bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/stress-c${NCORES}.txt || true
- pkill -f "stress -c ${NCORES}"
-
- sleep 10
-
- stress -c ${NCORES} &
- sleep 2
- uptime
- bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/nice-stress-c${NCORES}.txt || true
- pkill -f "stress -c ${NCORES}"
-
- sleep 10
- done
+ for k in SERIAL PUSH; do
+ echo "Running at $(date)"
+ make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k
+ # utilize 50% to 100% of per-DPU MRAM capacity
+ bin/host_code -w 0 -e 100 -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $(( 4194304 * i ))
+ bin/host_code -w 0 -e 100 -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $(( 6291456 * i ))
+ bin/host_code -w 0 -e 100 -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $(( 8388608 * i ))
done
-d
-for f in $(hostname)-transfer/idle.txt $(hostname)-transfer/stress-c${NCORES}.txt $(hostname)-transfer/nice-stress-c${NCORES}.txt; do
- echo "Completed at $(date)" >> $f
done
-for f in $(hostname)-transfer/idle.txt $(hostname)-transfer/stress-c${NCORES}.txt $(hostname)-transfer/nice-stress-c${NCORES}.txt; do
- xz -v -9 -M 800M $f
-done
+echo "Completed at $(date)"