summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBirte Kristina Friesel <birte.friesel@uos.de>2024-08-19 15:48:50 +0200
committerBirte Kristina Friesel <birte.friesel@uos.de>2024-08-19 15:48:50 +0200
commit43e4649d0902f1bf5d96f8055ddcf155dfbf29b7 (patch)
tree9a99a91e95ec547f00fbe318fc232d83e67fa13a
parenta1f6d4ca0216bcd983ab8d10f035d5a9a0b35def (diff)
HST-S: Add memcpy variant and update HBM eval script
-rw-r--r--HST-S/baselines/cpu/Makefile3
-rw-r--r--HST-S/baselines/cpu/app_baseline.c120
-rwxr-xr-xHST-S/baselines/cpu/run.sh25
-rwxr-xr-xHST-S/dimes-hetsim-hbm.sh60
4 files changed, 139 insertions, 69 deletions
diff --git a/HST-S/baselines/cpu/Makefile b/HST-S/baselines/cpu/Makefile
index cfab81b..e373492 100644
--- a/HST-S/baselines/cpu/Makefile
+++ b/HST-S/baselines/cpu/Makefile
@@ -1,4 +1,5 @@
NUMA ?= 0
+NUMA_MEMCPY ?= 0
FLAGS =
ifeq (${NUMA}, 1)
@@ -8,7 +9,7 @@ endif
all: hist
hist: app_baseline.c
- gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -o hist -fopenmp app_baseline.c ${FLAGS}
+ gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} -o hist -fopenmp app_baseline.c ${FLAGS}
hist_O0: app_baseline.c
gcc -o hist_O0 -fopenmp app_baseline.c
diff --git a/HST-S/baselines/cpu/app_baseline.c b/HST-S/baselines/cpu/app_baseline.c
index 0e16944..745e384 100644
--- a/HST-S/baselines/cpu/app_baseline.c
+++ b/HST-S/baselines/cpu/app_baseline.c
@@ -35,6 +35,12 @@ int numa_node_out = -1;
int numa_node_cpu = -1;
#endif
+#if NUMA_MEMCPY
+int numa_node_cpu_memcpy = -1;
+int numa_node_local = -1;
+int numa_node_in_is_local = 0;
+#endif
+
#include "../../support/common.h"
#include "../../support/timer.h"
@@ -44,6 +50,7 @@ int numa_node_cpu = -1;
// Pointer declaration
static T* A;
+static T *A_local;
static unsigned int* histo_host;
typedef struct Params {
@@ -59,6 +66,10 @@ typedef struct Params {
struct bitmask* bitmask_out;
int numa_node_cpu;
#endif
+#if NUMA_MEMCPY
+ int numa_node_cpu_memcpy;
+ struct bitmask* bitmask_cpu;
+#endif
}Params;
/**
@@ -150,9 +161,13 @@ struct Params input_params(int argc, char **argv) {
p.bitmask_out = NULL;
p.numa_node_cpu = -1;
#endif
+#if NUMA_MEMCPY
+ p.numa_node_cpu_memcpy = -1;
+ p.bitmask_cpu = NULL;
+#endif
int opt;
- while((opt = getopt(argc, argv, "hi:b:w:e:f:x:t:A:B:C:")) >= 0) {
+ while((opt = getopt(argc, argv, "hi:b:w:e:f:x:t:A:B:C:D:M:")) >= 0) {
switch(opt) {
case 'h':
usage();
@@ -169,7 +184,11 @@ struct Params input_params(int argc, char **argv) {
case 'A': p.bitmask_in = numa_parse_nodestring(optarg); break;
case 'B': p.bitmask_out = numa_parse_nodestring(optarg); break;
case 'C': p.numa_node_cpu = atoi(optarg); break;
-#endif
+#if NUMA_MEMCPY
+ case 'D': p.bitmask_cpu = numa_parse_nodestring(optarg); break;
+ case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break;
+#endif // NUMA_MEMCPY
+#endif // NUMA
default:
fprintf(stderr, "\nUnrecognized option!\n");
usage();
@@ -207,6 +226,9 @@ int main(int argc, char **argv) {
A = malloc(input_size * sizeof(T));
#endif
+ // Create an input file with arbitrary data.
+ read_input(A, p);
+
#if NUMA
if (p.bitmask_out) {
numa_set_membind(p.bitmask_out);
@@ -230,14 +252,18 @@ int main(int argc, char **argv) {
}
#if NUMA
+#if NUMA_MEMCPY
+ if (p.bitmask_cpu) {
+ numa_set_membind(p.bitmask_cpu);
+ numa_free_nodemask(p.bitmask_cpu);
+ }
+#else
struct bitmask *bitmask_all = numa_allocate_nodemask();
numa_bitmask_setall(bitmask_all);
numa_set_membind(bitmask_all);
numa_free_nodemask(bitmask_all);
-#endif
-
- // Create an input file with arbitrary data.
- read_input(A, p);
+#endif // NUMA_MEMCPY
+#endif // NUMA
#if NUMA
mp_pages[0] = A;
@@ -271,23 +297,91 @@ int main(int argc, char **argv) {
}
#endif
+#if NUMA_MEMCPY
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+#endif
+
Timer timer;
+
+#if NUMA_MEMCPY
+ numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
+ start(&timer, 1, 0);
+ if (!numa_node_in_is_local) {
+ A_local = (T*) numa_alloc(input_size * sizeof(T));
+ }
+ stop(&timer, 1);
+ if (!numa_node_in_is_local) {
+ if (p.numa_node_cpu_memcpy != -1) {
+ if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
+ start(&timer, 2, 0);
+ if (!numa_node_in_is_local) {
+ memcpy(A_local, A, input_size * sizeof(T));
+ } else {
+ A_local = A;
+ }
+ stop(&timer, 2);
+ if (p.numa_node_cpu != -1) {
+ if (numa_run_on_node(p.numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+ mp_pages[0] = A_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A_local)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ }
+ else {
+ numa_node_local = mp_status[0];
+ }
+#else
+ A_local = A;
+#endif
+
start(&timer, 0, 0);
- if(!p.exp)
- memset(histo_host, 0, nr_of_dpus * p.bins * sizeof(unsigned int));
+ if(!p.exp)
+ memset(histo_host, 0, nr_of_dpus * p.bins * sizeof(unsigned int));
else
- memset(histo_host, 0, p.bins * sizeof(unsigned int));
+ memset(histo_host, 0, p.bins * sizeof(unsigned int));
- histogram_host(histo_host, A, p.bins, input_size, p.exp, nr_of_dpus, p.n_threads);
+ histogram_host(histo_host, A_local, p.bins, input_size, p.exp, nr_of_dpus, p.n_threads);
stop(&timer, 0);
+#if NUMA_MEMCPY
+ start(&timer, 3, 0);
+ if (!numa_node_in_is_local) {
+ numa_free(A_local, input_size * sizeof(T));
+ }
+ stop(&timer, 3);
+#endif
+
unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
nr_threads++;
+#if NUMA_MEMCPY
+ printf("[::] HST-S-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
+ " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " | throughput_MBps=%f",
+ nr_threads, XSTR(T), input_size, p.exp ? p.bins : p.bins * nr_of_dpus,
+ numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ input_size * sizeof(T) / timer.time[0]);
+ printf(" throughput_MOpps=%f",
+ input_size / timer.time[0]);
+ printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+#else
printf("[::] HST-S-CPU | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
#if NUMA
" numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
@@ -298,9 +392,9 @@ int main(int argc, char **argv) {
numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
#endif
input_size * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f",
- input_size / timer.time[0]);
- printall(&timer, 0);
+ printf(" throughput_MOpps=%f latency_us=%f\n",
+ input_size / timer.time[0], timer.time[0]);
+#endif // NUMA_MEMCPY
#if NUMA
numa_free(A, input_size * sizeof(T));
diff --git a/HST-S/baselines/cpu/run.sh b/HST-S/baselines/cpu/run.sh
deleted file mode 100755
index ffb2cc6..0000000
--- a/HST-S/baselines/cpu/run.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/sh
-
-set -e
-
-HOST="$(hostname)"
-
-echo $HOST
-
-(
-echo "prim-benchmarks HST-S CPU (dfatool edition)"
-echo "Started at $(date)"
-echo "Revision $(git describe --always)"
-
-# baseline ./hist supports -x, however -x 0 references uninitialized variables
-# and likely never has been used or tested. So we'll leave that out here.
-
-make -B verbose=1
-
-for nr_threads in 88 64 44 32 24 20 1 2 4 6 8 12 16; do
- for i in `seq 1 20`; do
- timeout --foreground -k 1m 30m ./hist -t ${nr_threads} -i 1006632960 || true
- sleep 10
- done
-done
-) | tee "${HOST}-explore.txt"
diff --git a/HST-S/dimes-hetsim-hbm.sh b/HST-S/dimes-hetsim-hbm.sh
index 08842b7..44c191d 100755
--- a/HST-S/dimes-hetsim-hbm.sh
+++ b/HST-S/dimes-hetsim-hbm.sh
@@ -1,51 +1,51 @@
-#!/bin/bash
+#!/bin/sh
cd baselines/cpu
-make -B NUMA=1
mkdir -p log/$(hostname)
-fn=log/$(hostname)/$(date +%Y%m%d)
-
-# Upstream DPU version uses 256 bins and 1536 * 1024 * 64 uint32 elements == 384 MiB total (-x 2 with implicit -z 64)
-input_size_upstream=$((1536 * 1024 * 64))
+fn=log/$(hostname)/dimes-hetsim-hbm
-# Here: 2 GiB
-input_size_dpu=$((2**29))
+# Input: (2^29 == 536870912) * int32 == 2 GiB
(
-echo "single-node execution, upstream ref (1/4)" >&2
+make -B NUMA=1 NUMA_MEMCPY=1
+
+echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/3)" >&2
parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
- ./hist -i ${input_size_upstream} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 5 -x 1 \
+ ./hist -i {input_size} -b 256 -A {ram_in} -B {ram_out} -C {cpu} -D {ram_local} -M {cpu-memcpy} -t {nr_threads} \
+ ::: i $(seq 1 5) \
::: nr_threads 1 2 4 8 12 16 \
- ::: cpu $(seq 0 7) \
- ::: ram $(seq 0 15)
+ ::: ram_in $(seq 0 15) \
+ :::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \
+ ::: ram_local $(seq 0 15) \
+ :::+ cpu $(seq 0 7) $(seq 0 7) \
+ :::+ ram_out $(seq 0 15) \
+ ::: input_size 536870912
+
+make -B NUMA=1
-echo "single-node execution, DPU ref (2/4)" >&2
+echo "single-node execution, cpu/out on same node (2/3)" >&2
parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
- ./hist -i ${input_size_dpu} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 5 -x 1 \
+ ./hist -i {input_size} -b 256 -A {ram_in} -B {ram_out} -C {cpu} -t {nr_threads} \
+ ::: i $(seq 1 5) \
::: nr_threads 1 2 4 8 12 16 \
- ::: cpu $(seq 0 7) \
- ::: ram $(seq 0 15)
+ ::: ram_in $(seq 0 15) \
+ ::: cpu $(seq 0 7) $(seq 0 7) \
+ :::+ ram_out $(seq 0 15) \
+ ::: input_size 536870912
-echo "multi-node execution, upstream ref (3/4)" >&2
+echo "multi-node execution (3/3)" >&2
parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \
- ./hist -i ${input_size_upstream} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 40 -x 1 \
+ ./hist -i {input_size} -b 256 -A {ram_in} -B {ram_out} -C {cpu} -t {nr_threads} \
+ ::: i $(seq 1 20) \
::: nr_threads 32 48 64 96 128 \
::: cpu -1 \
- ::: ram $(seq 0 15)
-
-echo "multi-node execution, DPU ref (4/4)" >&2
-
-parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \
- ./hist -i ${input_size_dpu} -A {ram} -B {ram} -C {cpu} -t {nr_threads} -w 0 -e 40 -x 1 \
- ::: nr_threads 32 48 64 96 128 \
- ::: cpu -1 \
- ::: ram $(seq 0 15)
-
-) > ${fn}.txt
+ ::: ram_in $(seq 0 15) \
+ ::: ram_out $(seq 0 15) \
+ ::: input_size 536870912
-xz -f -v -9 -M 800M ${fn}.txt
+) >> ${fn}.txt