summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBirte Kristina Friesel <birte.friesel@uos.de>2023-12-08 12:03:27 +0100
committerBirte Kristina Friesel <birte.friesel@uos.de>2023-12-08 12:03:27 +0100
commit101150ace6c587ad8a9c17b4bc22ca2fbba37495 (patch)
tree5f6a50cb31675d334cc14866aee0f7e009613e3a
parent8834dec80ce31b44bcc280453f761c5be3fd7116 (diff)
CPU-DPU alloc and transfer microbenchmarks: -search space +cpu load
-rw-r--r--Microbenchmarks/CPU-DPU/host/app.c4
-rwxr-xr-xMicrobenchmarks/CPU-DPU/make-size.sh37
-rwxr-xr-xMicrobenchmarks/CPU-DPU/run-alloc.sh59
-rwxr-xr-xMicrobenchmarks/CPU-DPU/run-load.sh65
-rwxr-xr-xMicrobenchmarks/CPU-DPU/run-transfer.sh95
-rwxr-xr-xMicrobenchmarks/CPU-DPU/run.sh29
6 files changed, 140 insertions, 149 deletions
diff --git a/Microbenchmarks/CPU-DPU/host/app.c b/Microbenchmarks/CPU-DPU/host/app.c
index 853cb6a..07431ee 100644
--- a/Microbenchmarks/CPU-DPU/host/app.c
+++ b/Microbenchmarks/CPU-DPU/host/app.c
@@ -103,9 +103,9 @@ int main(int argc, char **argv) {
read_input(A, B, input_size);
//printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);
- printf("[::] NMC reconfiguration | n_dpus=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s"
+ printf("[::] NMC reconfiguration | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s"
" | latency_dpu_alloc_us=%f latency_dpu_load_us=%f latency_dpu_get_us=%f\n",
- nr_of_dpus, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode,
+ nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode,
timer.time[4], timer.time[5], timer.time[6]);
// Loop over main kernel
diff --git a/Microbenchmarks/CPU-DPU/make-size.sh b/Microbenchmarks/CPU-DPU/make-size.sh
index 203d1e4..6166d55 100755
--- a/Microbenchmarks/CPU-DPU/make-size.sh
+++ b/Microbenchmarks/CPU-DPU/make-size.sh
@@ -1,44 +1,9 @@
#!/bin/sh
-: > dpu/size.inc
: > dpu/nop.inc
for i in $(seq 1 $1); do
- for i in $(seq 1 128); do
+ for i in $(seq 1 256); do
echo 'asm volatile("nop");' >> dpu/nop.inc
done
- cat >>dpu/size.inc <<EOF
-volatile unsigned int data${i}a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32};
-volatile unsigned int data${i}b[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32};
-volatile unsigned int data${i}c[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32};
-volatile unsigned int data${i}d[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32};
-EOF
done
diff --git a/Microbenchmarks/CPU-DPU/run-alloc.sh b/Microbenchmarks/CPU-DPU/run-alloc.sh
index d3e509e..9c553b1 100755
--- a/Microbenchmarks/CPU-DPU/run-alloc.sh
+++ b/Microbenchmarks/CPU-DPU/run-alloc.sh
@@ -1,39 +1,64 @@
#!/bin/bash
-trap 'pkill -f "stress -c 32"' INT
+NCORES=$(grep -c '^processor' /proc/cpuinfo)
+
+trap "pkill -f 'stress -c ${NCORES}'" INT
set -e
-: > tinos-idle.txt
-: > tinos-stress-c32.txt
-: > tinos-nice-stress-c32.txt
+mkdir -p $(hostname)-alloc
+rm -f $(hostname)-alloc/*
+
+for f in $(hostname)-alloc/idle.txt $(hostname)-alloc/stress-c${NCORES}.txt $(hostname)-alloc/nice-stress-c${NCORES}.txt; do
+ echo "prim-benchmarks CPU-DPU alloc (dfatool edition)" >> $f
+ echo "Started at $(date)" >> $f
+ echo "Revision $(git describe --always)" >> $f
+done
-for i in 1 2 4 8 16 32 48 64 80 96 112 $(seq 128 32 2543) 2543; do
- for j in $(seq 0 32); do
- echo $i/2543 $j/32
+# runtime exclusive of host_code execution time: 25 seconds per inner loop
+# *16 -> about 7 minutes per outer loop
+# *163 -> about 18 hours total
+for i in 1 2 4 8 16 32 48 64 80 96 112 $(seq 128 16 2542) 2542; do
+ for j in $(seq 0 16); do
+ echo $i/2542 $j/16
./make-size.sh $j
- n_nops=$((j * 128))
+ n_nops=$((j * 256))
if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\'; then
+
uptime
+ S_TIME_FORMAT=ISO mpstat -P ALL -N ALL -n -o JSON 1 > $(hostname)-alloc/n_dpus=${i}-n_nops=${n_nops}.json &
for l in $(seq 1 40); do
- bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> tinos-idle.txt || true
+ bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> $(hostname)-alloc/idle.txt || true
done
- stress -c 32 &
+ pkill -f mpstat
+
+ stress -c ${NCORES} &
sleep 2
uptime
for l in $(seq 1 40); do
- bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> tinos-stress-c32.txt || true
+ bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> $(hostname)-alloc/stress-c${NCORES}.txt || true
done
- pkill -f 'stress -c 32'
- sleep 30
- nice stress -c 32 &
+ pkill -f "stress -c ${NCORES}"
+
+ sleep 10
+
+ nice stress -c ${NCORES} &
sleep 2
uptime
for l in $(seq 1 40); do
- bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> tinos-nice-stress-c32.txt || true
+ bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> $(hostname)-alloc/nice-stress-c${NCORES}.txt || true
done
- pkill -f 'stress -c 32'
- sleep 30
+ pkill -f "stress -c ${NCORES}"
+
+ sleep 10
fi
done
done
+
+for f in $(hostname)-alloc/idle.txt $(hostname)-alloc/stress-c${NCORES}.txt $(hostname)-alloc/nice-stress-c${NCORES}.txt; do
+ echo "Completed at $(date)" >> $f
+done
+
+for f in $(hostname)-alloc/idle.txt $(hostname)-alloc/stress-c${NCORES}.txt $(hostname)-alloc/nice-stress-c${NCORES}.txt; do
+ xz -v -9 -M 800M $f
+done
diff --git a/Microbenchmarks/CPU-DPU/run-load.sh b/Microbenchmarks/CPU-DPU/run-load.sh
deleted file mode 100755
index e4a3460..0000000
--- a/Microbenchmarks/CPU-DPU/run-load.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-trap 'pkill -f "stress -c 32"' INT
-
-set -e
-
-: > tinos-transfer-idle.txt
-: > tinos-transfer-stress-c32.txt
-: > tinos-transfer-nice-stress-c32.txt
-
-./make-size.sh 0
-
-for i in 1 2 4 8 16 32 48 64 80 96 112 $(seq 128 32 512); do
- for j in 1; do
- for k in SERIAL PUSH BROADCAST; do
- # 8 B ... 64 MB
- for l in 1 4 16 64 256 1024 4096 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 6291456 838868; do
- echo $i $j $k $l
- make -B NR_DPUS=$i NR_TASKLETS=$j BL=10 TRANSFER=$k
- bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-idle.txt || true
-
- stress -c 32 &
- sleep 2
- uptime
- bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-stress-c32.txt || true
- pkill -f 'stress -c 32'
- sleep 30
-
- nice stress -c 32 &
- sleep 2
- uptime
- bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-nice-stress-c32.txt || true
- pkill -f 'stress -c 32'
- sleep 30
- done
- done
- done
-done
-
-for i in $(seq 512 32 2543) 2543; do
- for j in 1; do
- for k in SERIAL PUSH BROADCAST; do
- # 1 MB ... 1024 MB
- for l in 1048576 2097152 4194304 6291456 838868 1677736 3355472 6710944 13421888; do
- echo $i $j $k $l
- make -B NR_DPUS=$i NR_TASKLETS=$j BL=10 TRANSFER=$k
- bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-idle.txt || true
-
- stress -c 32 &
- sleep 2
- uptime
- bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-stress-c32.txt || true
- pkill -f 'stress -c 32'
- sleep 30
-
- nice stress -c 32 &
- sleep 2
- uptime
- bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-nice-stress-c32.txt || true
- pkill -f 'stress -c 32'
- sleep 30
- done
- done
- done
-done
diff --git a/Microbenchmarks/CPU-DPU/run-transfer.sh b/Microbenchmarks/CPU-DPU/run-transfer.sh
new file mode 100755
index 0000000..f835f0c
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/run-transfer.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Runs bin/host_code for every (NR_DPUS, TRANSFER mode, input size) combination under three host load conditions (idle, stress, nice stress) and appends its output to one log per condition in $(hostname)-transfer/.
+NCORES=$(grep -c '^processor' /proc/cpuinfo)
+# On Ctrl-C, kill the background stress workers so they do not outlive this script.
+trap "pkill -f 'stress -c ${NCORES}'" INT
+
+set -e
+
+mkdir -p $(hostname)-transfer
+rm -f $(hostname)-transfer/*
+
+for f in $(hostname)-transfer/idle.txt $(hostname)-transfer/stress-c${NCORES}.txt $(hostname)-transfer/nice-stress-c${NCORES}.txt; do
+ echo "prim-benchmarks CPU-DPU transfer (dfatool edition)" >> $f
+ echo "Started at $(date)" >> $f
+ echo "Revision $(git describe --always)" >> $f
+done
+
+./make-size.sh 0
+
+# runtime exclusive of host_code execution time: 25 seconds per l loop
+# *12 -> about 5 minutes per k loop
+# *3 -> about 15 minutes per i loop
+# *24 -> about 6 hours total
+for i in 1 2 4 8 16 32 48 64 80 96 112 $(seq 128 32 512); do
+ for k in SERIAL PUSH BROADCAST; do
+ # 8 B ... 64 MB
+ for l in 1 16 256 4096 16384 65536 262144 1048576 2097152 4194304 6291456 8388608; do
+ echo $i/512 $k $l/8388608
+ make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k
+ # Phase 1: no artificial load; mpstat logs per-CPU statistics at 1 s intervals in the background.
+ uptime
+ S_TIME_FORMAT=ISO mpstat -P ALL -N ALL -n -o JSON 1 > $(hostname)-transfer/n_dpus=${i}-e_mode=$k-n_elements=${l}.json &
+ bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/idle.txt || true
+ pkill -f mpstat
+ # Phase 2: one stress worker per core at default priority.
+ stress -c ${NCORES} &
+ sleep 2
+ uptime
+ bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/stress-c${NCORES}.txt || true
+ pkill -f "stress -c ${NCORES}"
+
+ sleep 10
+ # Phase 3: same load, but the stress workers run under nice (reduced priority).
+ nice stress -c ${NCORES} &
+ sleep 2
+ uptime
+ bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/nice-stress-c${NCORES}.txt || true
+ pkill -f "stress -c ${NCORES}"
+
+ sleep 10
+ done
+ done
+done
+
+# runtime exclusive of host_code execution time: 25 seconds per l loop
+# *9 -> about 4 minutes per k loop
+# *3 -> about 12 minutes per i loop
+# *65 -> about 13 hours total
+for i in $(seq 512 32 2542) 2542; do
+ for k in SERIAL PUSH BROADCAST; do
+ # 1 MB ... 1024 MB -- NOTE(review): 838868 ... 13421888 look like powers of two with a dropped digit (8388608 ... 134217728); confirm before changing
+ for l in 1048576 2097152 4194304 6291456 838868 1677736 3355472 6710944 13421888; do
+ echo $i/2542 $k $l/13421888
+ make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k
+ # Phase 1: no artificial load; mpstat logs per-CPU statistics at 1 s intervals in the background.
+ uptime
+ S_TIME_FORMAT=ISO mpstat -P ALL -N ALL -n -o JSON 1 > $(hostname)-transfer/n_dpus=${i}-e_mode=$k-n_elements=${l}.json &
+ bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/idle.txt || true
+ pkill -f mpstat
+ # Phase 2: one stress worker per core at default priority.
+ stress -c ${NCORES} &
+ sleep 2
+ uptime
+ bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/stress-c${NCORES}.txt || true
+ pkill -f "stress -c ${NCORES}"
+
+ sleep 10
+ # Phase 3: same load, but the stress workers run under nice (reduced priority).
+ nice stress -c ${NCORES} &
+ sleep 2
+ uptime
+ bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/nice-stress-c${NCORES}.txt || true
+ pkill -f "stress -c ${NCORES}"
+
+ sleep 10
+ done
+ done
+done
+for f in $(hostname)-transfer/idle.txt $(hostname)-transfer/stress-c${NCORES}.txt $(hostname)-transfer/nice-stress-c${NCORES}.txt; do
+ echo "Completed at $(date)" >> $f
+done
+
+for f in $(hostname)-transfer/idle.txt $(hostname)-transfer/stress-c${NCORES}.txt $(hostname)-transfer/nice-stress-c${NCORES}.txt; do
+ xz -v -9 -M 800M $f
+done
diff --git a/Microbenchmarks/CPU-DPU/run.sh b/Microbenchmarks/CPU-DPU/run.sh
deleted file mode 100755
index ced1e76..0000000
--- a/Microbenchmarks/CPU-DPU/run.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-set -e
-
-for i in 1 2 4 8 16 32 48 64 80 96 112 128 160 192 224 256 320 384 448 512; do
- for j in $(seq 0 32); do
- ./make-size.sh $j
- n_nops=$((j * 128))
- if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\'; then
- for l in $(seq 1 40); do
- bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops || true
- done
- fi
- done
-done
-
-./make-size.sh 0
-
-for i in 1 2 4 8 16 32 48 64 80 96 112 128 160 192 224 256 320 384 448 512; do
- for j in 1; do
- for k in SERIAL PUSH BROADCAST; do
- # 8 B ... 64 MB
- for l in 1 4 16 64 256 1024 4096 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 6291456 838868; do
- make -B NR_DPUS=$i NR_TASKLETS=$j BL=10 TRANSFER=$k
- bin/host_code -w 0 -e 50 -x 1 -i $l || true
- done
- done
- done
-done