summaryrefslogtreecommitdiff
path: root/SEL
diff options
context:
space:
mode:
authorDaniel Friesel <daniel.friesel@uos.de>2023-06-01 08:03:59 +0200
committerDaniel Friesel <daniel.friesel@uos.de>2023-06-01 08:03:59 +0200
commit7720bd5223c96c5f46efc9033ec023fc4038da46 (patch)
treedd3a199013dfc5ea583cf93f657dd9976c63ebe0 /SEL
parent0540fa9e26e2c8e7e11fcf3e5444e4981f811e1c (diff)
port SEL NMC to dfatool
Diffstat (limited to 'SEL')
-rw-r--r--SEL/baselines/cpu/Makefile6
-rw-r--r--SEL/baselines/cpu/app_baseline.c10
-rw-r--r--SEL/host/app.c51
-rwxr-xr-xSEL/run-paper-strong-full.sh25
-rwxr-xr-xSEL/run-paper-strong-rank.sh26
-rwxr-xr-xSEL/run-paper-weak.sh23
6 files changed, 114 insertions, 27 deletions
diff --git a/SEL/baselines/cpu/Makefile b/SEL/baselines/cpu/Makefile
index 02d930c..81f6d17 100644
--- a/SEL/baselines/cpu/Makefile
+++ b/SEL/baselines/cpu/Makefile
@@ -16,13 +16,15 @@ sel_O2: app_baseline.c
run: sel
./sel -i 1258291200 -t 4
+# upstream code does not include -e 20 and does 3 iterations instead
+
.PHONY: run_O0
run_O0: sel_O0
- ./sel_O0 -i 1258291200 -t 4
+ ./sel_O0 -i 1258291200 -t 4 -e 20
.PHONY: run_O2
run_O2: sel_O2
- ./sel_O2 -i 1258291200 -t 4
+ ./sel_O2 -i 1258291200 -t 4 -e 20
.PHONY: clean
clean:
diff --git a/SEL/baselines/cpu/app_baseline.c b/SEL/baselines/cpu/app_baseline.c
index 04a569f..6ee1cae 100644
--- a/SEL/baselines/cpu/app_baseline.c
+++ b/SEL/baselines/cpu/app_baseline.c
@@ -147,16 +147,12 @@ int main(int argc, char **argv) {
nr_threads++;
if (rep >= p.n_warmup) {
- printf("[::] n_threads=%d e_type=%s n_elements=%d "
- "| throughput_cpu_MBps=%f\n",
+ printf("[::] SEL CPU | n_threads=%d e_type=%s n_elements=%d "
+ "| throughput_MBps=%f",
nr_threads, XSTR(T), file_size,
file_size * 2 * sizeof(T) / timer.time[0]);
- printf("[::] n_threads=%d e_type=%s n_elements=%d "
- "| throughput_cpu_MOpps=%f\n",
- nr_threads, XSTR(T), file_size,
+ printf(" throughput_MOpps=%f",
file_size / timer.time[0]);
- printf("[::] n_threads=%d e_type=%s n_elements=%d |",
- nr_threads, XSTR(T), file_size);
printall(&timer, 0);
}
}
diff --git a/SEL/host/app.c b/SEL/host/app.c
index ef07cf9..2194c81 100644
--- a/SEL/host/app.c
+++ b/SEL/host/app.c
@@ -22,6 +22,9 @@
#define DPU_BINARY "./bin/dpu_code"
#endif
+#define XSTR(x) STR(x)
+#define STR(x) #x
+
#if ENERGY
#include <dpu_probe.h>
#endif
@@ -208,6 +211,35 @@ int main(int argc, char **argv) {
// Free memory
free(results_scan);
+
+ // Check output
+ bool status = true;
+ if(accum != total_count) status = false;
+ for (i = 0; i < accum; i++) {
+ if(C[i] != bufferC[i]){
+ status = false;
+#if PRINT
+ printf("%d: %lu -- %lu\n", i, C[i], bufferC[i]);
+#endif
+ }
+ }
+ if (status) {
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+ if (rep >= p.n_warmup) {
+ printf("[::] SEL NMC | n_dpus=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d "
+ "| throughput_cpu_MBps=%f throughput_pim_MBps=%f throughput_MBps=%f",
+ nr_of_dpus, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size,
+ input_size * sizeof(T) / timer.time[0],
+ input_size * sizeof(T) / timer.time[2],
+ input_size * sizeof(T) / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
+ printf(" throughput_cpu_MOpps=%f throughput_pim_MOpps=%f throughput_MOpps=%f",
+ input_size / timer.time[0],
+ input_size / timer.time[2],
+ input_size / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
+ printall(&timer, 4);
+ } else {
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
+ }
}
// Print timing results
@@ -228,28 +260,11 @@ int main(int argc, char **argv) {
printf("DPU Energy (J): %f\t", energy);
#endif
- // Check output
- bool status = true;
- if(accum != total_count) status = false;
- for (i = 0; i < accum; i++) {
- if(C[i] != bufferC[i]){
- status = false;
-#if PRINT
- printf("%d: %lu -- %lu\n", i, C[i], bufferC[i]);
-#endif
- }
- }
- if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
- } else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
- }
-
// Deallocation
free(A);
free(C);
free(C2);
DPU_ASSERT(dpu_free(dpu_set));
- return status ? 0 : -1;
+ return 0;
}
diff --git a/SEL/run-paper-strong-full.sh b/SEL/run-paper-strong-full.sh
new file mode 100755
index 0000000..cc1a99d
--- /dev/null
+++ b/SEL/run-paper-strong-full.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+set -e
+
+# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
+# T: data type
+# -w: number of un-timed warmup iterations
+# -e: number of timed iterations
+# -i: ignored, always uses 262144 elements
+
+(
+
+echo "prim-benchmarks UNI strong-full (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
+
+for nr_dpus in 256 512 1024 2048; do
+ for nr_tasklets in 1 2 4 8 16; do
+ echo
+ if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
+ timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i 251658240 -x 1 || true
+ fi
+ done
+done
+) | tee log-paper-strong-full.txt
diff --git a/SEL/run-paper-strong-rank.sh b/SEL/run-paper-strong-rank.sh
new file mode 100755
index 0000000..6cffd65
--- /dev/null
+++ b/SEL/run-paper-strong-rank.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -e
+
+# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
+# T: data type
+# -w: number of un-timed warmup iterations
+# -e: number of timed iterations
+# -i: ignored, always uses 262144 elements
+
+(
+
+echo "prim-benchmarks UNI strong-rank (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
+
+# 256 and 512 are not part of upstream config space
+for nr_dpus in 512 256 1 4 16 64; do
+ for nr_tasklets in 1 2 4 8 16; do
+ echo
+ if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
+ timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i 3932160 -x 1 || true
+ fi
+ done
+done
+) | tee log-paper-strong-rank.txt
diff --git a/SEL/run-paper-weak.sh b/SEL/run-paper-weak.sh
new file mode 100755
index 0000000..5e83c5e
--- /dev/null
+++ b/SEL/run-paper-weak.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -e
+
+# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module
+# T: data type
+# -w: number of un-timed warmup iterations
+# -e: number of timed iterations
+# -i: ignored, always uses 262144 elements
+
+echo "prim-benchmarks UNI weak (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
+
+# 256 and 512 are not part of upstream config space
+for nr_dpus in 512 256 1 4 16 64; do
+ for nr_tasklets in 1 2 4 8 16; do
+ echo
+ if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then
+ timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i 3932160 -x 0 || true
+ fi
+ done
+done | tee log-paper-weak.txt