summary | refs | log | tree | commit | diff
path: root/Microbenchmarks/CPU-DPU
diff options
context:
space:
mode:
Diffstat (limited to 'Microbenchmarks/CPU-DPU')
-rw-r--r-- Microbenchmarks/CPU-DPU/Makefile | 2
-rw-r--r-- Microbenchmarks/CPU-DPU/host/app.c | 35
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh | 7
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh | 18
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-alloc-rank.sh | 25
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh | 7
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh | 18
-rwxr-xr-x Microbenchmarks/CPU-DPU/run-transfer-rank.sh | 27
-rwxr-xr-x Microbenchmarks/CPU-DPU/support/timer.h | 60
9 files changed, 126 insertions, 73 deletions
diff --git a/Microbenchmarks/CPU-DPU/Makefile b/Microbenchmarks/CPU-DPU/Makefile
index 697dfcd..f354e67 100644
--- a/Microbenchmarks/CPU-DPU/Makefile
+++ b/Microbenchmarks/CPU-DPU/Makefile
@@ -8,7 +8,7 @@ COMMON_INCLUDES := support
HOST_SOURCES := $(wildcard host/*.c)
COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} -DDPU_BINARY=${DPU_BINARY}
+HOST_FLAGS := ${COMMON_FLAGS} -D_POSIX_C_SOURCE=200809L -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} -DDPU_BINARY=${DPU_BINARY}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TRANSFER}
QUIET = @
diff --git a/Microbenchmarks/CPU-DPU/host/app.c b/Microbenchmarks/CPU-DPU/host/app.c
index 07431ee..7c402bc 100644
--- a/Microbenchmarks/CPU-DPU/host/app.c
+++ b/Microbenchmarks/CPU-DPU/host/app.c
@@ -104,9 +104,9 @@ int main(int argc, char **argv) {
//printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);
printf("[::] NMC reconfiguration | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s"
- " | latency_dpu_alloc_us=%f latency_dpu_load_us=%f latency_dpu_get_us=%f\n",
+ " | latency_dpu_alloc_ns=%lu latency_dpu_load_ns=%lu latency_dpu_get_ns=%lu\n",
nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode,
- timer.time[4], timer.time[5], timer.time[6]);
+ timer.nanoseconds[4], timer.nanoseconds[5], timer.nanoseconds[6]);
// Loop over main kernel
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
@@ -171,32 +171,19 @@ int main(int argc, char **argv) {
if (rep >= p.n_warmup) {
printf("[::] transfer UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s"
- " | throughput_dram_mram_MBps=%f throughput_mram_dram_MBps=%f",
+ " | latency_dram_mram_ns=%lu latency_mram_dram_ns=%lu throughput_dram_mram_Bps=%f throughput_mram_dram_Bps=%f",
nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode,
- transfer_size * sizeof(T) / timer.time[1],
- transfer_size * sizeof(T) / timer.time[3]);
- printf(" throughput_dram_mram_MOpps=%f throughput_mram_dram_MOpps=%f",
- transfer_size / timer.time[1],
- transfer_size / timer.time[3]);
- printf(" latency_dpu_launch_us=%f\n",
- timer.time[2]);
+ timer.nanoseconds[1], timer.nanoseconds[3],
+ transfer_size * sizeof(T) * 1e9 / timer.nanoseconds[1],
+ transfer_size * sizeof(T) * 1e9 / timer.nanoseconds[3]);
+ printf(" throughput_dram_mram_Opps=%f throughput_mram_dram_Opps=%f",
+ transfer_size * 1e9 / timer.nanoseconds[1],
+ transfer_size * 1e9 / timer.nanoseconds[3]);
+ printf(" latency_dpu_launch_ns=%lu\n",
+ timer.nanoseconds[2]);
}
}
- // Print timing results
- /*
- printf("CPU-DPU ");
- print(&timer, 1, p.n_reps);
- double time_load = timer.time[1] / (1000 * 1);
- printf("CPU-DPU Bandwidth (GB/s): %f\n", (input_size * 8)/(time_load*1e6));
- printf("DPU Kernel ");
- print(&timer, 2, p.n_reps);
- printf("\n");
- printf("DPU-CPU ");
- print(&timer, 3, p.n_reps);
- double time_retrieve = timer.time[3] / (1000 * 1);
- printf("DPU-CPU Bandwidth (GB/s): %f\n", (input_size * 8)/(time_retrieve*1e6));
- */
// Check output
bool status = true;
#ifdef BROADCASTX
diff --git a/Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh b/Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh
new file mode 100755
index 0000000..321f14c
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+mkdir -p "$(hostname)-alloc"
+
+./run-alloc-rank.sh | tee "$(hostname)-alloc/rank-idle.txt"
+
+xz -v -9 -M 800M "$(hostname)-alloc/rank-idle.txt"
diff --git a/Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh b/Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh
new file mode 100755
index 0000000..cf90174
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+mkdir -p "$(hostname)-alloc"
+
+NCORES=$(grep -c '^processor' /proc/cpuinfo)
+cleanexit() {
+ pkill -f "stress -c ${NCORES}"
+}
+
+trap cleanexit TERM INT
+
+stress -c ${NCORES} &
+
+./run-alloc-rank.sh | tee "$(hostname)-alloc/rank-stress-c${NCORES}.txt"
+
+cleanexit
+
+xz -v -9 -M 800M "$(hostname)-alloc/rank-stress-c${NCORES}.txt"
diff --git a/Microbenchmarks/CPU-DPU/run-alloc-rank.sh b/Microbenchmarks/CPU-DPU/run-alloc-rank.sh
new file mode 100755
index 0000000..a6907fe
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/run-alloc-rank.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+set -e
+
+echo "prim-benchmarks CPU-DPU alloc (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
+
+# runtime exclusive of host_code execution time: 25 seconds per inner loop
+# *16 -> about 7 minutes per outer loop
+# *163 -> about 18 hours total
+for i in 1 4 8 16 32 48 64; do
+ for j in $(seq 0 16); do
+ echo $i/64 $j/16
+ ./make-size.sh $j
+ n_nops=$((j * 256))
+ if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\'; then
+ for l in $(seq 1 100); do
+ bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops || true
+ done
+ fi
+ done
+done
+
+echo "Completed at $(date)"
diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh
new file mode 100755
index 0000000..89dda03
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+mkdir -p "$(hostname)-transfer"
+
+./run-transfer-rank.sh | tee "$(hostname)-transfer/rank-idle.txt"
+
+xz -v -9 -M 800M "$(hostname)-transfer/rank-idle.txt"
diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh
new file mode 100755
index 0000000..e2f4020
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+mkdir -p "$(hostname)-transfer"
+
+NCORES=$(grep -c '^processor' /proc/cpuinfo)
+cleanexit() {
+ pkill -f "stress -c ${NCORES}"
+}
+
+trap cleanexit TERM INT
+
+stress -c ${NCORES} &
+
+./run-transfer-rank.sh | tee "$(hostname)-transfer/rank-stress-c${NCORES}.txt"
+
+cleanexit
+
+xz -v -9 -M 800M "$(hostname)-transfer/rank-stress-c${NCORES}.txt"
diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank.sh
new file mode 100755
index 0000000..61e9dba
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/run-transfer-rank.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+set -e
+
+echo "prim-benchmarks CPU-DPU transfer (dfatool edition)"
+echo "Started at $(date)"
+echo "Revision $(git describe --always)"
+
+./make-size.sh 0
+
+for i in 1 4 8 16 32 48 64; do
+ for k in SERIAL PUSH BROADCAST; do
+ # BROADCAST sends the same data to all DPUs, so data size must not exceed the amount of MRAM available on a single DPU (i.e., 64 MB)
+ # 8 B ... 64 MB
+ for l in 1 16 256 4096 65536 262144 1048576 4194304 6291456 8388608; do
+ make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k
+ bin/host_code -w 0 -e 100 -x 1 -i $l
+ done
+ done
+ # maximum amount of data
+ for k in SERIAL PUSH; do
+ make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k
+ bin/host_code -w 0 -e 100 -x 1 -i $(( 4194304 * i ))
+ bin/host_code -w 0 -e 100 -x 1 -i $(( 6291456 * i ))
+ bin/host_code -w 0 -e 100 -x 1 -i $(( 8388608 * i ))
+ done
+done
diff --git a/Microbenchmarks/CPU-DPU/support/timer.h b/Microbenchmarks/CPU-DPU/support/timer.h
index 7c24f3b..0a4d6a1 100755
--- a/Microbenchmarks/CPU-DPU/support/timer.h
+++ b/Microbenchmarks/CPU-DPU/support/timer.h
@@ -1,59 +1,23 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
+#pragma once
+#include <time.h>
-#include <sys/time.h>
+typedef struct Timer {
-typedef struct Timer{
+ struct timespec startTime[7];
+ struct timespec stopTime[7];
+ uint64_t nanoseconds[7];
- struct timeval startTime[10];
- struct timeval stopTime[10];
- double time[10];
-
-}Timer;
+} Timer;
void start(Timer *timer, int i, int rep) {
if(rep == 0) {
- timer->time[i] = 0.0;
+ timer->nanoseconds[i] = 0;
}
- gettimeofday(&timer->startTime[i], NULL);
+ clock_gettime(CLOCK_MONOTONIC, &timer->startTime[i]);
}
void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+ clock_gettime(CLOCK_MONOTONIC, &timer->stopTime[i]);
+ timer->nanoseconds[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000000 +
+ (timer->stopTime[i].tv_nsec - timer->startTime[i].tv_nsec);
}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }