summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-22 16:20:00 +0200
committer Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-22 16:20:00 +0200
commit 32f8ed0eabb3e21d8d5315b11c3c8e7e06e668af (patch)
tree 4ba212e6eb19011021e919199aeae69b4c3fae0c
parent 27765c0a341ccdb8772481645f5e9b4fcf9c6abf (diff)
VA baseline: configurable memcpy NUMA binding
-rw-r--r-- VA/baselines/cpu/app_baseline.c 29
-rwxr-xr-x VA/dimes-hetsim-hbm.sh 15
-rwxr-xr-x VA/dimes-hetsim-nmc.sh 24
3 files changed, 50 insertions, 18 deletions
diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c
index 0451079..4c8610a 100644
--- a/VA/baselines/cpu/app_baseline.c
+++ b/VA/baselines/cpu/app_baseline.c
@@ -39,6 +39,7 @@ static T *B;
static T *C;
#if NUMA_MEMCPY
+int numa_node_cpu_memcpy = -1;
int numa_node_local = -1;
int numa_node_in_is_local = 0;
static T *A_local;
@@ -73,6 +74,7 @@ typedef struct Params {
int numa_node_cpu;
#endif
#if NUMA_MEMCPY
+ int numa_node_cpu_memcpy;
struct bitmask* bitmask_cpu;
#endif
}Params;
@@ -106,11 +108,12 @@ struct Params input_params(int argc, char **argv) {
p.numa_node_cpu = -1;
#endif
#if NUMA_MEMCPY
+ p.numa_node_cpu_memcpy = -1;
p.bitmask_cpu = NULL;
#endif
int opt;
- while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:")) >= 0) {
+ while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) {
switch(opt) {
case 'h':
usage();
@@ -127,6 +130,7 @@ struct Params input_params(int argc, char **argv) {
case 'c': p.numa_node_cpu = atoi(optarg); break;
#if NUMA_MEMCPY
case 'C': p.bitmask_cpu = numa_parse_nodestring(optarg); break;
+ case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break;
#endif // NUMA_MEMCPY
#endif // NUMA
default:
@@ -222,8 +226,8 @@ int main(int argc, char **argv) {
}
numa_node_cpu = p.numa_node_cpu;
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
+ if (p.numa_node_cpu != -1) {
+ if (numa_run_on_node(p.numa_node_cpu) == -1) {
perror("numa_run_on_node");
numa_node_cpu = -1;
}
@@ -239,12 +243,21 @@ int main(int argc, char **argv) {
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
#if NUMA_MEMCPY
+ numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
start(&timer, 1, 0);
if (!numa_node_in_is_local) {
A_local = (T*) numa_alloc(input_size * sizeof(T));
B_local = (T*) numa_alloc(input_size * sizeof(T));
}
stop(&timer, 1);
+ if (!numa_node_in_is_local) {
+ if (p.numa_node_cpu_memcpy != -1) {
+ if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
start(&timer, 2, 0);
if (!numa_node_in_is_local) {
memcpy(A_local, A, input_size * sizeof(T));
@@ -254,6 +267,12 @@ int main(int argc, char **argv) {
B_local = B;
}
stop(&timer, 2);
+ if (p.numa_node_cpu != -1) {
+ if (numa_run_on_node(p.numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
mp_pages[0] = A_local;
if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
perror("move_pages(A_local)");
@@ -287,10 +306,10 @@ int main(int argc, char **argv) {
if (rep >= p.n_warmup) {
#if NUMA_MEMCPY
printf("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d"
- " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
" | throughput_MBps=%f",
nr_threads, XSTR(T), input_size,
- numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
input_size * 3 * sizeof(T) / timer.time[0]);
printf(" throughput_MOpps=%f",
input_size / timer.time[0]);
diff --git a/VA/dimes-hetsim-hbm.sh b/VA/dimes-hetsim-hbm.sh
index 7c0df73..d260c1e 100755
--- a/VA/dimes-hetsim-hbm.sh
+++ b/VA/dimes-hetsim-hbm.sh
@@ -11,15 +11,16 @@ fn=log/$(hostname)/dimes-hetsim-hbm
make -B NUMA=1 NUMA_MEMCPY=1
-echo "CPU single-node operation with setup cost, cpu/out on same node (1/4)" >&2
+echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/4)" >&2
parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
- ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {ram_local} -t {nr_threads} -w 0 -e 40 \
+ ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {ram_local} -M {cpu_memcpy} -t {nr_threads} -w 0 -e 20 \
::: nr_threads 1 2 4 8 12 16 \
- ::: ram_in $(seq 0 15) \
- ::: cpu $(seq 0 7) $(seq 0 7) \
- :::+ ram_local $(seq 0 15) \
- :::+ ram_out $(seq 0 15) \
+ ::: ram_in $(seq 0 15) \
+ :::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \
+ ::: ram_local $(seq 0 15) \
+ :::+ cpu $(seq 0 7) $(seq 0 7) \
+ :::+ ram_out $(seq 0 15) \
::: input_size 167772160
make -B NUMA=1
@@ -47,7 +48,7 @@ parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \
echo "multi-node execution (4/4)" >&2
parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \
- ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 40 \
+ ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 20 \
::: nr_threads 32 48 64 96 128 \
::: cpu -1 \
::: ram_in $(seq 0 15) \
diff --git a/VA/dimes-hetsim-nmc.sh b/VA/dimes-hetsim-nmc.sh
index cabae57..a176027 100755
--- a/VA/dimes-hetsim-nmc.sh
+++ b/VA/dimes-hetsim-nmc.sh
@@ -40,20 +40,20 @@ make -B NUMA=1
(
-echo "CPU single-node operation (1/3)" >&2
+echo "CPU single-node operation (1/4)" >&2
parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
- ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 40 \
+ ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 20 \
::: ram_in 0 1 \
::: ram_out 0 1 \
::: cpu 0 1 \
::: nr_threads 1 2 4 8 12 16 \
::: input_size 167772160
-echo "CPU multi-node operation (2/3)" >&2
+echo "CPU multi-node operation (2/4)" >&2
parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
- ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 40 \
+ ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 20 \
::: ram_in 0 1 \
::: ram_out 0 1 \
::: cpu -1 \
@@ -66,11 +66,23 @@ make -B NUMA=1 NUMA_MEMCPY=1
(
-echo "CPU single-node operation with setup cost, cpu/out on same node (3/3)" >&2
+echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (3/4)" >&2
parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \
- ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {cpu} -t {nr_threads} -w 0 -e 40 \
+ ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {cpu} -M {cpu_memcpy} -t {nr_threads} -w 0 -e 20 \
+ ::: ram_in 0 1 \
+ :::+ cpu_memcpy 0 1 \
+ ::: cpu 0 1 \
+ :::+ ram_out 0 1 \
+ ::: nr_threads 1 2 4 8 12 16 \
+ ::: input_size 167772160
+
+echo "CPU single-node operation with setup cost, memcpy node == 0, cpu node == output node (4/4)" >&2
+
+parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \
+ ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {cpu} -M {cpu_memcpy} -t {nr_threads} -w 0 -e 20 \
::: ram_in 0 1 \
+ ::: cpu_memcpy 0 \
::: cpu 0 1 \
:::+ ram_out 0 1 \
::: nr_threads 1 2 4 8 12 16 \