diff options
Diffstat (limited to 'TS')
| -rw-r--r-- | TS/Makefile | 2 | ||||
| -rwxr-xr-x | TS/benchmark-scripts/ccmcc25-sim.sh | 25 | ||||
| -rwxr-xr-x | TS/benchmark-scripts/ccmcc25.sh | 23 | ||||
| -rw-r--r-- | TS/host/app.c | 30 | ||||
| -rwxr-xr-x | TS/run-paper-strong-full.sh | 29 | ||||
| -rwxr-xr-x | TS/run-paper-strong-rank.sh | 28 | ||||
| -rwxr-xr-x | TS/run-paper-weak.sh | 29 |
7 files changed, 53 insertions, 113 deletions
diff --git a/TS/Makefile b/TS/Makefile index 8eedaf9..2fce611 100644 --- a/TS/Makefile +++ b/TS/Makefile @@ -12,6 +12,8 @@ aspectc ?= 0 aspectc_timing ?= 0 dfatool_timing ?= 1 +HOST_CC := ${CC} + COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DBL=${BL} HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc} -lm DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} diff --git a/TS/benchmark-scripts/ccmcc25-sim.sh b/TS/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..0df03d9 --- /dev/null +++ b/TS/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/ts_host -w 0 -e 5 -n ${ts_size} 2>&1 +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks TS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 ts_size={ts_size} \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: ts_size 2048 4096 8192 16384 32768 \ +>> ${fn}.txt diff --git a/TS/benchmark-scripts/ccmcc25.sh b/TS/benchmark-scripts/ccmcc25.sh index c7f93c3..74c8371 100755 --- a/TS/benchmark-scripts/ccmcc25.sh +++ b/TS/benchmark-scripts/ccmcc25.sh @@ -1,9 +1,6 @@ #!/bin/bash mkdir -p log/$(hostname) -fn=log/$(hostname)/ccmcc25 - -source /opt/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh run_benchmark_nmc() { local "$@" @@ -16,10 +13,18 @@ run_benchmark_nmc() { export -f run_benchmark_nmc -echo "prim-benchmarks TS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks TS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 numa_rank=any ts_size={ts_size} \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: ts_size 8388608 16777216 33554432 67108864 \ + >> ${fn}.txt -parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ - run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 numa_rank=any ts_size={ts_size} \ - ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ - ::: ts_size 8388608 16777216 33554432 67108864 \ ->> ${fn}.txt +done diff --git a/TS/host/app.c b/TS/host/app.c index a75cfe9..bfa14df 100644 --- a/TS/host/app.c +++ b/TS/host/app.c @@ -224,6 +224,15 @@ int main(int argc, char **argv) for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + if (rep >= p.n_warmup) { + start(&timer, 6, 0); + } + streamp(tSeries, AMean, ASigma, ts_size - query_length - 1, + query, query_length, query_mean, query_std); + if (rep >= p.n_warmup) { + stop(&timer, 6); + } + #if WITH_ALLOC_OVERHEAD if (rep >= p.n_warmup) { start(&timer, 0, 0); @@ -250,16 +259,10 @@ int main(int argc, char **argv) start(&timer, 2, 0); } uint32_t i = 0; - - DPU_FOREACH(dpu_set, dpu) { - input_arguments.exclusion_zone = 0; - - DPU_ASSERT(dpu_copy_to - (dpu, "DPU_INPUT_ARGUMENTS", 0, - (const void *)&input_arguments, - sizeof(input_arguments))); - i++; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments)); } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT)); i = 0; mem_offset = 0; @@ -402,15 +405,6 @@ int main(int argc, char **argv) #endif #endif - if (rep >= p.n_warmup) { - start(&timer, 6, 0); - } - streamp(tSeries, AMean, ASigma, ts_size - query_length - 1, - query, query_length, query_mean, query_std); - if (rep >= p.n_warmup) { - stop(&timer, 6); - } - int status = (minHost == result.minValue); if (status) { printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET diff --git a/TS/run-paper-strong-full.sh b/TS/run-paper-strong-full.sh deleted file mode 100755 index 5b7656d..0000000 --- a/TS/run-paper-strong-full.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks TS strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >2048 is not part of upstream -# 12 tasklets are not part of upstream (code does not work with 16…) -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 12 16; do - echo - # upstream code did not respect $BL in the makefile and used 256B (BL=8) instead. - # This appears to be faster than BL=10. - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8; then - timeout --foreground -k 1m 30m bin/ts_host -w 0 -e 100 -n 33554432 || true - fi - done -done -) | tee log-paper-strong-full.txt diff --git a/TS/run-paper-strong-rank.sh b/TS/run-paper-strong-rank.sh deleted file mode 100755 index 58ad641..0000000 --- a/TS/run-paper-strong-rank.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks TS strong-rank (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >64 are not part of upstream config space -for nr_dpus in 128 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - # upstream code did not respect $BL in the makefile and used 256B (BL=8) instead. - # BL=10 appears to be slightly faster. - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then - timeout --foreground -k 1m 60m bin/ts_host -w 0 -e 50 -n 524288 || true - fi - done -done -) | tee log-paper-strong-rank.txt diff --git a/TS/run-paper-weak.sh b/TS/run-paper-weak.sh deleted file mode 100755 index 64892f4..0000000 --- a/TS/run-paper-weak.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks TS weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# 256 and 512 are not part of upstream -for nr_dpus in 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - # upstream code did not respect $BL in the makefile and used 256B (BL=8) instead. - # BL=10 appears to be slightly faster. - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8; then - i=$(( nr_dpus * 524288 )) - timeout --foreground -k 1m 30m bin/ts_host -w 0 -e 100 -n $i || true - fi - done -done -) | tee log-paper-weak.txt |
