diff options
author | Birte Kristina Friesel <birte.friesel@uos.de> | 2023-12-08 12:03:27 +0100 |
---|---|---|
committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2023-12-08 12:03:27 +0100 |
commit | 101150ace6c587ad8a9c17b4bc22ca2fbba37495 (patch) | |
tree | 5f6a50cb31675d334cc14866aee0f7e009613e3a | |
parent | 8834dec80ce31b44bcc280453f761c5be3fd7116 (diff) |
CPU-DPU alloc and transfer microbenchmarks: -search space +cpu load
-rw-r--r-- | Microbenchmarks/CPU-DPU/host/app.c | 4 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/make-size.sh | 37 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run-alloc.sh | 59 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run-load.sh | 65 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run-transfer.sh | 95 | ||||
-rwxr-xr-x | Microbenchmarks/CPU-DPU/run.sh | 29 |
6 files changed, 140 insertions, 149 deletions
diff --git a/Microbenchmarks/CPU-DPU/host/app.c b/Microbenchmarks/CPU-DPU/host/app.c index 853cb6a..07431ee 100644 --- a/Microbenchmarks/CPU-DPU/host/app.c +++ b/Microbenchmarks/CPU-DPU/host/app.c @@ -103,9 +103,9 @@ int main(int argc, char **argv) { read_input(A, B, input_size); //printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL); - printf("[::] NMC reconfiguration | n_dpus=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" + printf("[::] NMC reconfiguration | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" " | latency_dpu_alloc_us=%f latency_dpu_load_us=%f latency_dpu_get_us=%f\n", - nr_of_dpus, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, + nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, timer.time[4], timer.time[5], timer.time[6]); // Loop over main kernel diff --git a/Microbenchmarks/CPU-DPU/make-size.sh b/Microbenchmarks/CPU-DPU/make-size.sh index 203d1e4..6166d55 100755 --- a/Microbenchmarks/CPU-DPU/make-size.sh +++ b/Microbenchmarks/CPU-DPU/make-size.sh @@ -1,44 +1,9 @@ #!/bin/sh -: > dpu/size.inc : > dpu/nop.inc for i in $(seq 1 $1); do - for i in $(seq 1 128); do + for i in $(seq 1 256); do echo 'asm volatile("nop");' >> dpu/nop.inc done - cat >>dpu/size.inc <<EOF -volatile unsigned int data${i}a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; -volatile unsigned int data${i}b[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; -volatile unsigned int data${i}c[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; -volatile unsigned int data${i}d[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; -EOF done diff --git a/Microbenchmarks/CPU-DPU/run-alloc.sh b/Microbenchmarks/CPU-DPU/run-alloc.sh index d3e509e..9c553b1 100755 --- a/Microbenchmarks/CPU-DPU/run-alloc.sh +++ b/Microbenchmarks/CPU-DPU/run-alloc.sh @@ -1,39 +1,64 @@ #!/bin/bash -trap 'pkill -f "stress -c 32"' INT +NCORES=$(grep -c '^processor' /proc/cpuinfo) + +trap "pkill -f 'stress -c ${NCORES}'" INT set -e -: > tinos-idle.txt -: > tinos-stress-c32.txt -: > tinos-nice-stress-c32.txt +mkdir -p $(hostname)-alloc +rm -f $(hostname)-alloc/* + +for f in $(hostname)-alloc/idle.txt $(hostname)-alloc/stress-c${NCORES}.txt $(hostname)-alloc/nice-stress-c${NCORES}.txt; do + echo "prim-benchmarks CPU-DPU alloc 
(dfatool edition)" >> $f + echo "Started at $(date)" >> $f + echo "Revision $(git describe --always)" >> $f +done -for i in 1 2 4 8 16 32 48 64 80 96 112 $(seq 128 32 2543) 2543; do - for j in $(seq 0 32); do - echo $i/2543 $j/32 +# runtime exclusive of host_code execution time: 25 seconds per inner loop +# *16 -> about 7 minutes per outer loop +# *163 -> about 18 hours total +for i in 1 2 4 8 16 32 48 64 80 96 112 $(seq 128 16 2542) 2542; do + for j in $(seq 0 16); do + echo $i/2542 $j/16 ./make-size.sh $j - n_nops=$((j * 128)) + n_nops=$((j * 256)) if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\'; then + uptime + S_TIME_FORMAT=ISO mpstat -P ALL -N ALL -n -o JSON 1 > $(hostname)-alloc/n_dpus=${i}-n_nops=${n_nops}.json & for l in $(seq 1 40); do - bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> tinos-idle.txt || true + bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> $(hostname)-alloc/idle.txt || true done - stress -c 32 & + pkill -f mpstat + + stress -c ${NCORES} & sleep 2 uptime for l in $(seq 1 40); do - bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> tinos-stress-c32.txt || true + bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> $(hostname)-alloc/stress-c${NCORES}.txt || true done - pkill -f 'stress -c 32' - sleep 30 - nice stress -c 32 & + pkill -f "stress -c ${NCORES}" + + sleep 10 + + nice stress -c ${NCORES} & sleep 2 uptime for l in $(seq 1 40); do - bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> tinos-nice-stress-c32.txt || true + bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops >> $(hostname)-alloc/nice-stress-c${NCORES}.txt || true done - pkill -f 'stress -c 32' - sleep 30 + pkill -f "stress -c ${NCORES}" + + sleep 10 fi done done + +for f in $(hostname)-alloc/idle.txt $(hostname)-alloc/stress-c${NCORES}.txt $(hostname)-alloc/nice-stress-c${NCORES}.txt; do + echo "Completed at $(date)" >> $f +done + +for f in $(hostname)-alloc/idle.txt $(hostname)-alloc/stress-c${NCORES}.txt 
$(hostname)-alloc/nice-stress-c${NCORES}.txt; do + xz -v -9 -M 800M $f +done diff --git a/Microbenchmarks/CPU-DPU/run-load.sh b/Microbenchmarks/CPU-DPU/run-load.sh deleted file mode 100755 index e4a3460..0000000 --- a/Microbenchmarks/CPU-DPU/run-load.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -trap 'pkill -f "stress -c 32"' INT - -set -e - -: > tinos-transfer-idle.txt -: > tinos-transfer-stress-c32.txt -: > tinos-transfer-nice-stress-c32.txt - -./make-size.sh 0 - -for i in 1 2 4 8 16 32 48 64 80 96 112 $(seq 128 32 512); do - for j in 1; do - for k in SERIAL PUSH BROADCAST; do - # 8 B ... 64 MB - for l in 1 4 16 64 256 1024 4096 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 6291456 838868; do - echo $i $j $k $l - make -B NR_DPUS=$i NR_TASKLETS=$j BL=10 TRANSFER=$k - bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-idle.txt || true - - stress -c 32 & - sleep 2 - uptime - bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-stress-c32.txt || true - pkill -f 'stress -c 32' - sleep 30 - - nice stress -c 32 & - sleep 2 - uptime - bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-nice-stress-c32.txt || true - pkill -f 'stress -c 32' - sleep 30 - done - done - done -done - -for i in $(seq 512 32 2543) 2543; do - for j in 1; do - for k in SERIAL PUSH BROADCAST; do - # 1 MB ... 
1024 MB - for l in 1048576 2097152 4194304 6291456 838868 1677736 3355472 6710944 13421888; do - echo $i $j $k $l - make -B NR_DPUS=$i NR_TASKLETS=$j BL=10 TRANSFER=$k - bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-idle.txt || true - - stress -c 32 & - sleep 2 - uptime - bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-stress-c32.txt || true - pkill -f 'stress -c 32' - sleep 30 - - nice stress -c 32 & - sleep 2 - uptime - bin/host_code -w 0 -e 50 -x 1 -i $l >> tinos-transfer-nice-stress-c32.txt || true - pkill -f 'stress -c 32' - sleep 30 - done - done - done -done diff --git a/Microbenchmarks/CPU-DPU/run-transfer.sh b/Microbenchmarks/CPU-DPU/run-transfer.sh new file mode 100755 index 0000000..f835f0c --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-transfer.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +NCORES=$(grep -c '^processor' /proc/cpuinfo) + +trap "pkill -f 'stress -c ${NCORES}'" INT + +set -e + +mkdir -p $(hostname)-transfer +rm -f $(hostname)-transfer/* + +for f in $(hostname)-transfer/idle.txt $(hostname)-transfer/stress-c${NCORES}.txt $(hostname)-transfer/nice-stress-c${NCORES}.txt; do + echo "prim-benchmarks CPU-DPU transfer (dfatool edition)" >> $f + echo "Started at $(date)" >> $f + echo "Revision $(git describe --always)" >> $f +done + +./make-size.sh 0 + +# runtime exclusive of host_code execution time: 25 seconds per l loop +# *18 -> about 8 minutes per k loop +# *3 -> about 23 minutes per i loop +# *24 -> about 9 hours total +for i in 1 2 4 8 16 32 48 64 80 96 112 $(seq 128 32 512); do + for k in SERIAL PUSH BROADCAST; do + # 8 B ... 
64 MB + for l in 1 16 256 4096 16384 65536 262144 1048576 2097152 4194304 6291456 8388608; do + echo $i/512 $k $l/8388608 + make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k + + uptime + S_TIME_FORMAT=ISO mpstat -P ALL -N ALL -n -o JSON 1 > $(hostname)-transfer/n_dpus=${i}-e_mode=$k-n_elements=${l}.json & + bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/idle.txt || true + pkill -f mpstat + + stress -c ${NCORES} & + sleep 2 + uptime + bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/stress-c${NCORES}.txt || true + pkill -f "stress -c ${NCORES}" + + sleep 10 + + stress -c ${NCORES} & + sleep 2 + uptime + bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/nice-stress-c${NCORES}.txt || true + pkill -f "stress -c ${NCORES}" + + sleep 10 + done + done +done + +# runtime exclusive of host_code execution time: 25 seconds per l loop +# *9 -> about 4 minutes per k loop +# *3 -> about 12 minutes per i loop +# *65 -> about 13 hours total +for i in $(seq 512 32 2542) 2542; do + for k in SERIAL PUSH BROADCAST; do + # 1 MB ... 
1024 MB + for l in 1048576 2097152 4194304 6291456 838868 1677736 3355472 6710944 13421888; do + echo $i/2542 $k $l/13421888 + make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k + + uptime + S_TIME_FORMAT=ISO mpstat -P ALL -N ALL -n -o JSON 1 > $(hostname)-transfer/n_dpus=${i}-e_mode=$k-n_elements=${l}.json & + bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/idle.txt || true + pkill -f mpstat + + stress -c ${NCORES} & + sleep 2 + uptime + bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/stress-c${NCORES}.txt || true + pkill -f "stress -c ${NCORES}" + + sleep 10 + + stress -c ${NCORES} & + sleep 2 + uptime + bin/host_code -w 0 -e 100 -x 1 -i $l >> $(hostname)-transfer/nice-stress-c${NCORES}.txt || true + pkill -f "stress -c ${NCORES}" + + sleep 10 + done + done +done +for f in $(hostname)-transfer/idle.txt $(hostname)-transfer/stress-c${NCORES}.txt $(hostname)-transfer/nice-stress-c${NCORES}.txt; do + echo "Completed at $(date)" >> $f +done + +for f in $(hostname)-transfer/idle.txt $(hostname)-transfer/stress-c${NCORES}.txt $(hostname)-transfer/nice-stress-c${NCORES}.txt; do + xz -v -9 -M 800M $f +done diff --git a/Microbenchmarks/CPU-DPU/run.sh b/Microbenchmarks/CPU-DPU/run.sh deleted file mode 100755 index ced1e76..0000000 --- a/Microbenchmarks/CPU-DPU/run.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -e - -for i in 1 2 4 8 16 32 48 64 80 96 112 128 160 192 224 256 320 384 448 512; do - for j in $(seq 0 32); do - ./make-size.sh $j - n_nops=$((j * 128)) - if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\'; then - for l in $(seq 1 40); do - bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops || true - done - fi - done -done - -./make-size.sh 0 - -for i in 1 2 4 8 16 32 48 64 80 96 112 128 160 192 224 256 320 384 448 512; do - for j in 1; do - for k in SERIAL PUSH BROADCAST; do - # 8 B ... 
64 MB - for l in 1 4 16 64 256 1024 4096 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 6291456 838868; do - make -B NR_DPUS=$i NR_TASKLETS=$j BL=10 TRANSFER=$k - bin/host_code -w 0 -e 50 -x 1 -i $l || true - done - done - done -done |