summaryrefslogtreecommitdiff
path: root/Microbenchmarks/STREAM/run-rank.sh
blob: 49253ea9261b78e11c4500671b1d0e632e4cb62a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/bin/bash

set -e

# Parameter sweep for the STREAM microbenchmark.
# For every combination of input size, DPU count, tasklet count, data type,
# operation, and block size, the benchmark is rebuilt from scratch (make -B)
# and then executed once. All output is shown on stdout and also written to
# log-<hostname>-rank-idle.txt in the current directory.
#
# Build-time parameters (passed to make):
# OP: STREAM operation to build (triad, scale, add, copy, copyw)
# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
# T: data type
# UNROLL: tried with 1 first; if that build fails, the build is retried with 0
#
# Run-time parameters (passed to bin/host_code):
# -w: number of un-timed warmup iterations
# -e: number of timed iterations
# -i: input size (number of elements, not number of bytes!)
#
# Each DPU uses three buffers, each of which holds $i * sizeof($dt) bytes.
# With a total MRAM capacity of 64M, this gives us ~21M per buffer, or 16M when rounding down to the next power of two.
# With a maximum data type width of 8B (uint64_t, double), this limits the number of elements per DPU to 2097152.

(

echo "prim-benchmarks STREAM microbenchmark (dfatool edition)"
echo "Started at $(date)"
echo "Revision $(git describe --always)"

for i in 2097152 1048576 131072 16384 4096; do
	for nr_dpus in 1 4 8 16 32 48 64; do
		for nr_tasklets in 1 8 12 16; do
			for dt in uint64_t uint8_t uint16_t uint32_t float double; do
				# The four standard STREAM kernels (triad, scale, add, copy) plus copyw.
				for op in triad scale add copy copyw; do
					for bl in 3 4 5 6 8 10; do
						echo
						# Arguments shared by both build attempts; only UNROLL differs.
						make_args=(
							-B
							"OP=${op}"
							"NR_DPUS=${nr_dpus}"
							"NR_TASKLETS=${nr_tasklets}"
							"BL=${bl}"
							"T=${dt}"
							WITH_ALLOC_OVERHEAD=1
							WITH_LOAD_OVERHEAD=1
							WITH_FREE_OVERHEAD=1
						)
						# Prefer the UNROLL=1 build and fall back to UNROLL=0 if it fails.
						# If both builds fail, this configuration is skipped.
						if make "${make_args[@]}" UNROLL=1 \
							|| make "${make_args[@]}" UNROLL=0; then
							# Send TERM after 30 minutes and KILL one minute later if the
							# benchmark is still running. "|| true" is intentional: a failed
							# or timed-out run must not abort the whole sweep under "set -e".
							timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i "$i" -x 0 || true
						fi
					done
				done
			done
		done
	done
done
echo "Completed at $(date)"
) | tee "log-$(hostname)-rank-idle.txt"