summary refs log tree commit diff
path: root/Microbenchmarks/CPU-DPU
diff options
context:
space:
mode:
author Birte Kristina Friesel <birte.friesel@uos.de> 2024-03-12 10:32:32 +0100
committer Birte Kristina Friesel <birte.friesel@uos.de> 2024-03-12 10:32:32 +0100
commit 4db503a4fc166087ba67e276bd33d5c0cd32da91 (patch)
tree 4dffab236c5f949485c148bfc0525646c617e18f /Microbenchmarks/CPU-DPU
parent e28258f2a2da33a53faeaefa8e555c053d636628 (diff)
CPU-DPU: support large data transfers on >1000 DPUs (uint32 is a bit small)
Diffstat (limited to 'Microbenchmarks/CPU-DPU')
-rw-r--r-- Microbenchmarks/CPU-DPU/dpu/task.c 2
-rw-r--r-- Microbenchmarks/CPU-DPU/host/app.c 12
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-transfer-idle.sh 6
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh 6
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-transfer.sh 18
-rwxr-xr-x Microbenchmarks/CPU-DPU/support/common.h 4
6 files changed, 27 insertions, 21 deletions
diff --git a/Microbenchmarks/CPU-DPU/dpu/task.c b/Microbenchmarks/CPU-DPU/dpu/task.c
index cb68b4c..6ebdeba 100644
--- a/Microbenchmarks/CPU-DPU/dpu/task.c
+++ b/Microbenchmarks/CPU-DPU/dpu/task.c
@@ -15,7 +15,7 @@
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// Barrier
-BARRIER_INIT(my_barrier, NR_TASKLETS);
+BARRIER_INIT(my_barrier, NR_TASKLETS)
extern int main_kernel1(void);
diff --git a/Microbenchmarks/CPU-DPU/host/app.c b/Microbenchmarks/CPU-DPU/host/app.c
index 7ec4884..7ef5f21 100644
--- a/Microbenchmarks/CPU-DPU/host/app.c
+++ b/Microbenchmarks/CPU-DPU/host/app.c
@@ -80,15 +80,15 @@ int main(int argc, char **argv) {
//printf("Allocated %d DPU(s)\n", nr_of_dpus);
unsigned int i = 0;
- unsigned int input_size = p.exp == 0 ? p.input_size * nr_of_dpus : p.input_size;
+ uint64_t input_size = p.exp == 0 ? p.input_size * nr_of_dpus : p.input_size;
//printf("Load input data\n");
// Input arguments
- const unsigned int input_size_dpu = input_size / nr_of_dpus;
+ const uint64_t input_size_dpu = input_size / nr_of_dpus;
#ifdef BROADCAST
- const unsigned int transfer_size = input_size;
+ const uint64_t transfer_size = input_size;
#else
- const unsigned int transfer_size = input_size;
+ const uint64_t transfer_size = input_size;
#endif
// Input/output allocation
@@ -103,7 +103,7 @@ int main(int argc, char **argv) {
read_input(A, B, input_size);
//printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);
- printf("[::] NMC reconfiguration | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d n_instr=%d e_type=%s n_elements=%u e_mode=%s"
+ printf("[::] NMC reconfiguration | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d n_instr=%d e_type=%s n_elements=%lu e_mode=%s"
" | latency_dpu_alloc_ns=%lu latency_dpu_load_ns=%lu latency_dpu_get_ns=%lu\n",
nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, p.n_instr, XSTR(T), transfer_size, transfer_mode,
timer.nanoseconds[4], timer.nanoseconds[5], timer.nanoseconds[6]);
@@ -170,7 +170,7 @@ int main(int argc, char **argv) {
stop(&timer, 3);
if (rep >= p.n_warmup) {
- printf("[::] transfer UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d n_instr=%d e_type=%s n_elements=%u n_elements_per_dpu=%u e_mode=%s"
+ printf("[::] transfer UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d n_instr=%d e_type=%s n_elements=%lu n_elements_per_dpu=%lu e_mode=%s"
" | latency_dram_mram_ns=%lu latency_mram_dram_ns=%lu throughput_dram_mram_Bps=%f throughput_mram_dram_Bps=%f",
#ifdef BROADCAST
nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, p.n_instr, XSTR(T), transfer_size, transfer_size, transfer_mode,
diff --git a/Microbenchmarks/CPU-DPU/run-transfer-idle.sh b/Microbenchmarks/CPU-DPU/run-transfer-idle.sh
index 58751c7..dcbde5d 100755
--- a/Microbenchmarks/CPU-DPU/run-transfer-idle.sh
+++ b/Microbenchmarks/CPU-DPU/run-transfer-idle.sh
@@ -1,9 +1,9 @@
#!/bin/sh
-ts="$(date +%Y%m%d)"
-
mkdir -p "$(hostname)-transfer"
-./run-transfer-rank.sh | tee "$(hostname)-transfer/${ts}-idle.txt"
+ts="$(date +%Y%m%d)"
+
+./run-transfer.sh | tee "$(hostname)-transfer/${ts}-idle.txt"
xz -f -v -9 -M 800M "$(hostname)-transfer/${ts}-idle.txt"
diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh
index b69aa8e..56633d3 100755
--- a/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh
+++ b/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh
@@ -2,6 +2,8 @@
mkdir -p "$(hostname)-transfer"
-./run-transfer-rank.sh | tee "$(hostname)-transfer/rank-idle.txt"
+ts="$(date +%Y%m%d)"
-xz -f -v -9 -M 800M "$(hostname)-transfer/rank-idle.txt"
+./run-transfer-rank.sh | tee "$(hostname)-transfer/${ts}-rank-idle.txt"
+
+xz -f -v -9 -M 800M "$(hostname)-transfer/${ts}-rank-idle.txt"
diff --git a/Microbenchmarks/CPU-DPU/run-transfer.sh b/Microbenchmarks/CPU-DPU/run-transfer.sh
index d029c8c..f01ef86 100755
--- a/Microbenchmarks/CPU-DPU/run-transfer.sh
+++ b/Microbenchmarks/CPU-DPU/run-transfer.sh
@@ -10,22 +10,26 @@ echo "Revision $(git describe --always)"
completion=0
for i in 1 4 8 $(seq 16 16 2543 | shuf); do
+ nrep=$(perl -E 'my $r = int(7700/$ARGV[0]); say $r > 60 ? 60 : $r' "$i")
for k in BROADCAST; do
completion=$((completion+1))
echo "Running ${completion}/161 at $(date)"
# BROADCAST sends the same data to all DPUs, so data size must not exceed the amount of MRAM available on a single DPU (i.e., 64 MB)
for l in 4194304 6291456 8388608; do
- make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k
- bin/host_code -w 0 -e 100 -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $l
+ if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k; then
+ bin/host_code -w 0 -e $nrep -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $l
+ fi
done
done
for k in SERIAL PUSH; do
echo "Running at $(date)"
- make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k
- # utilize 50% to 100% of per-DPU MRAM capacity
- bin/host_code -w 0 -e 100 -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $(( 4194304 * i ))
- bin/host_code -w 0 -e 100 -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $(( 6291456 * i ))
- bin/host_code -w 0 -e 100 -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $(( 8388608 * i ))
+ if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k; then
+ bin/host_code -w 0 -e 60 -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i 8388608
+ # utilize 50% to 100% of per-DPU MRAM capacity
+ bin/host_code -w 0 -e $nrep -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $(( 4194304 * i ))
+ #bin/host_code -w 0 -e $nrep -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $(( 6291456 * i ))
+ #bin/host_code -w 0 -e $nrep -x 1 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i $(( 8388608 * i ))
+ fi
done
done
diff --git a/Microbenchmarks/CPU-DPU/support/common.h b/Microbenchmarks/CPU-DPU/support/common.h
index 3c3c6ae..df8c03c 100755
--- a/Microbenchmarks/CPU-DPU/support/common.h
+++ b/Microbenchmarks/CPU-DPU/support/common.h
@@ -1,9 +1,9 @@
#ifndef _COMMON_H_
#define _COMMON_H_
-// Structures used by both the host and the dpu to communicate information
+// Structures used by both the host and the dpu to communicate information
typedef struct {
- uint32_t size;
+ uint64_t size;
enum kernels {
kernel1 = 0,
nr_kernels = 1,