From d7b9186a2a0285356b9aa38b84c8a7a151e48593 Mon Sep 17 00:00:00 2001 From: Birte Kristina Friesel Date: Thu, 22 Feb 2024 12:24:04 +0100 Subject: CPU-DPU microbenchmark: switch to nanoseconds --- Microbenchmarks/CPU-DPU/Makefile | 2 +- Microbenchmarks/CPU-DPU/host/app.c | 35 ++++--------- Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh | 7 +++ Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh | 18 +++++++ Microbenchmarks/CPU-DPU/run-alloc-rank.sh | 25 +++++++++ Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh | 7 +++ .../CPU-DPU/run-transfer-rank-stress.sh | 18 +++++++ Microbenchmarks/CPU-DPU/run-transfer-rank.sh | 27 ++++++++++ Microbenchmarks/CPU-DPU/support/timer.h | 60 +++++----------------- 9 files changed, 126 insertions(+), 73 deletions(-) create mode 100755 Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh create mode 100755 Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh create mode 100755 Microbenchmarks/CPU-DPU/run-alloc-rank.sh create mode 100755 Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh create mode 100755 Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh create mode 100755 Microbenchmarks/CPU-DPU/run-transfer-rank.sh (limited to 'Microbenchmarks') diff --git a/Microbenchmarks/CPU-DPU/Makefile b/Microbenchmarks/CPU-DPU/Makefile index 697dfcd..f354e67 100644 --- a/Microbenchmarks/CPU-DPU/Makefile +++ b/Microbenchmarks/CPU-DPU/Makefile @@ -8,7 +8,7 @@ COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} -DDPU_BINARY=${DPU_BINARY} +HOST_FLAGS := ${COMMON_FLAGS} -D_POSIX_C_SOURCE=200809L -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TRANSFER} -DDPU_BINARY=${DPU_BINARY} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TRANSFER} QUIET = @ 
diff --git a/Microbenchmarks/CPU-DPU/host/app.c b/Microbenchmarks/CPU-DPU/host/app.c index 07431ee..7c402bc 100644 --- a/Microbenchmarks/CPU-DPU/host/app.c +++ b/Microbenchmarks/CPU-DPU/host/app.c @@ -104,9 +104,9 @@ int main(int argc, char **argv) { //printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL); printf("[::] NMC reconfiguration | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" - " | latency_dpu_alloc_us=%f latency_dpu_load_us=%f latency_dpu_get_us=%f\n", + " | latency_dpu_alloc_ns=%lu latency_dpu_load_ns=%lu latency_dpu_get_ns=%lu\n", nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, - timer.time[4], timer.time[5], timer.time[6]); + timer.nanoseconds[4], timer.nanoseconds[5], timer.nanoseconds[6]); // Loop over main kernel for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { @@ -171,32 +171,19 @@ int main(int argc, char **argv) { if (rep >= p.n_warmup) { printf("[::] transfer UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d n_nops=%d e_type=%s n_elements=%u e_mode=%s" - " | throughput_dram_mram_MBps=%f throughput_mram_dram_MBps=%f", + " | latency_dram_mram_ns=%lu latency_mram_dram_ns=%lu throughput_dram_mram_Bps=%f throughput_mram_dram_Bps=%f", nr_of_dpus, nr_of_ranks, NR_TASKLETS, p.n_nops, XSTR(T), transfer_size, transfer_mode, - transfer_size * sizeof(T) / timer.time[1], - transfer_size * sizeof(T) / timer.time[3]); - printf(" throughput_dram_mram_MOpps=%f throughput_mram_dram_MOpps=%f", - transfer_size / timer.time[1], - transfer_size / timer.time[3]); - printf(" latency_dpu_launch_us=%f\n", - timer.time[2]); + timer.nanoseconds[1], timer.nanoseconds[3], + transfer_size * sizeof(T) * 1e9 / timer.nanoseconds[1], + transfer_size * sizeof(T) * 1e9 / timer.nanoseconds[3]); + printf(" throughput_dram_mram_Opps=%f throughput_mram_dram_Opps=%f", + transfer_size * 1e9 / timer.nanoseconds[1], + transfer_size * 1e9 / timer.nanoseconds[3]); + printf(" latency_dpu_launch_ns=%lu\n", + 
timer.nanoseconds[2]); } } - // Print timing results - /* - printf("CPU-DPU "); - print(&timer, 1, p.n_reps); - double time_load = timer.time[1] / (1000 * 1); - printf("CPU-DPU Bandwidth (GB/s): %f\n", (input_size * 8)/(time_load*1e6)); - printf("DPU Kernel "); - print(&timer, 2, p.n_reps); - printf("\n"); - printf("DPU-CPU "); - print(&timer, 3, p.n_reps); - double time_retrieve = timer.time[3] / (1000 * 1); - printf("DPU-CPU Bandwidth (GB/s): %f\n", (input_size * 8)/(time_retrieve*1e6)); - */ // Check output bool status = true; #ifdef BROADCASTX diff --git a/Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh b/Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh new file mode 100755 index 0000000..321f14c --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-alloc-rank-idle.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +mkdir -p "$(hostname)-alloc" + +./run-alloc-rank.sh | tee "$(hostname)-alloc/rank-idle.txt" + +xz -v -9 -M 800M "$(hostname)-alloc/rank-idle.txt" diff --git a/Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh b/Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh new file mode 100755 index 0000000..cf90174 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-alloc-rank-stress.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +mkdir -p "$(hostname)-alloc" + +NCORES=$(grep -c '^processor' /proc/cpuinfo) +cleanexit() { + pkill -f "stress -c ${NCORES}" +} + +trap cleanexit TERM INT + +stress -c ${NCORES} & + +./run-alloc-rank.sh | tee "$(hostname)-alloc/rank-stress-c${NCORES}.txt" + +cleanexit + +xz -v -9 -M 800M "$(hostname)-alloc/rank-stress-c${NCORES}.txt" diff --git a/Microbenchmarks/CPU-DPU/run-alloc-rank.sh b/Microbenchmarks/CPU-DPU/run-alloc-rank.sh new file mode 100755 index 0000000..a6907fe --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-alloc-rank.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +set -e + +echo "prim-benchmarks CPU-DPU alloc (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +# runtime exclusive of host_code execution time: 25 seconds per inner loop +# *16 
-> about 7 minutes per outer loop +# *163 -> about 18 hours total +for i in 1 4 8 16 32 48 64; do + for j in $(seq 0 16); do + echo $i/64 $j/16 + ./make-size.sh $j + n_nops=$((j * 256)) + if make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\'; then + for l in $(seq 1 100); do + bin/host_code -w 1 -e 0 -x 1 -i 65536 -N $n_nops || true + done + fi + done +done + +echo "Completed at $(date)" diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh new file mode 100755 index 0000000..89dda03 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-transfer-rank-idle.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +mkdir -p "$(hostname)-transfer" + +./run-transfer-rank.sh | tee "$(hostname)-transfer/rank-idle.txt" + +xz -v -9 -M 800M "$(hostname)-transfer/rank-idle.txt" diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh new file mode 100755 index 0000000..e2f4020 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-transfer-rank-stress.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +mkdir -p "$(hostname)-transfer" + +NCORES=$(grep -c '^processor' /proc/cpuinfo) +cleanexit() { + pkill -f "stress -c ${NCORES}" +} + +trap cleanexit TERM INT + +stress -c ${NCORES} & + +./run-transfer-rank.sh | tee "$(hostname)-transfer/rank-stress-c${NCORES}.txt" + +cleanexit + +xz -v -9 -M 800M "$(hostname)-transfer/rank-stress-c${NCORES}.txt" diff --git a/Microbenchmarks/CPU-DPU/run-transfer-rank.sh b/Microbenchmarks/CPU-DPU/run-transfer-rank.sh new file mode 100755 index 0000000..61e9dba --- /dev/null +++ b/Microbenchmarks/CPU-DPU/run-transfer-rank.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +set -e + +echo "prim-benchmarks CPU-DPU transfer (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +./make-size.sh 0 + +for i in 1 4 8 16 32 48 64; do + for k in SERIAL PUSH BROADCAST; do + # BROADCAST sends the same data to all DPUs, so data size must not exceed the 
amount of MRAM available on a single DPU (i.e., 64 MB) + # 8 B ... 64 MB + for l in 1 16 256 4096 65536 262144 1048576 4194304 6291456 8388608; do + make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k + bin/host_code -w 0 -e 100 -x 1 -i $l + done + done + # maximum amount of data + for k in SERIAL PUSH; do + make -B NR_DPUS=$i NR_TASKLETS=1 BL=10 TRANSFER=$k + bin/host_code -w 0 -e 100 -x 1 -i $(( 4194304 * i )) + bin/host_code -w 0 -e 100 -x 1 -i $(( 6291456 * i )) + bin/host_code -w 0 -e 100 -x 1 -i $(( 8388608 * i )) + done +done diff --git a/Microbenchmarks/CPU-DPU/support/timer.h b/Microbenchmarks/CPU-DPU/support/timer.h index 7c24f3b..0a4d6a1 100755 --- a/Microbenchmarks/CPU-DPU/support/timer.h +++ b/Microbenchmarks/CPU-DPU/support/timer.h @@ -1,59 +1,23 @@ -/* - * Copyright (c) 2016 University of Cordoba and University of Illinois - * All rights reserved. - * - * Developed by: IMPACT Research Group - * University of Cordoba and University of Illinois - * http://impact.crhc.illinois.edu/ - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * > Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * > Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. 
- * > Neither the names of IMPACT Research Group, University of Cordoba, - * University of Illinois nor the names of its contributors may be used - * to endorse or promote products derived from this Software without - * specific prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - */ +#pragma once +#include <time.h> -#include <sys/time.h> +typedef struct Timer { -typedef struct Timer{ + struct timespec startTime[7]; + struct timespec stopTime[7]; + uint64_t nanoseconds[7]; - struct timeval startTime[10]; - struct timeval stopTime[10]; - double time[10]; - -}Timer; +} Timer; void start(Timer *timer, int i, int rep) { if(rep == 0) { - timer->time[i] = 0.0; + timer->nanoseconds[i] = 0; } - gettimeofday(&timer->startTime[i], NULL); + clock_gettime(CLOCK_MONOTONIC, &timer->startTime[i]); } void stop(Timer *timer, int i) { - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); + clock_gettime(CLOCK_MONOTONIC, &timer->stopTime[i]); + timer->nanoseconds[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000000 + + (timer->stopTime[i].tv_nsec - timer->startTime[i].tv_nsec); } - -void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); } -- cgit v1.2.3