diff options
author | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-07-22 16:20:00 +0200 |
---|---|---|
committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-07-22 16:20:00 +0200 |
commit | 32f8ed0eabb3e21d8d5315b11c3c8e7e06e668af (patch) | |
tree | 4ba212e6eb19011021e919199aeae69b4c3fae0c /VA | |
parent | 27765c0a341ccdb8772481645f5e9b4fcf9c6abf (diff) |
VA baseline: configurable memcpy NUMA binding
Diffstat (limited to 'VA')
-rw-r--r-- | VA/baselines/cpu/app_baseline.c | 29 |
-rwxr-xr-x | VA/dimes-hetsim-hbm.sh | 15 |
-rwxr-xr-x | VA/dimes-hetsim-nmc.sh | 24 |
3 files changed, 50 insertions, 18 deletions
```diff
diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c
index 0451079..4c8610a 100644
--- a/VA/baselines/cpu/app_baseline.c
+++ b/VA/baselines/cpu/app_baseline.c
@@ -39,6 +39,7 @@ static T *B;
 static T *C;
 #if NUMA_MEMCPY
+int numa_node_cpu_memcpy = -1;
 int numa_node_local = -1;
 int numa_node_in_is_local = 0;
 static T *A_local;
@@ -73,6 +74,7 @@ typedef struct Params {
     int numa_node_cpu;
 #endif
 #if NUMA_MEMCPY
+    int numa_node_cpu_memcpy;
     struct bitmask* bitmask_cpu;
 #endif
 }Params;
@@ -106,11 +108,12 @@ struct Params input_params(int argc, char **argv) {
     p.numa_node_cpu = -1;
 #endif
 #if NUMA_MEMCPY
+    p.numa_node_cpu_memcpy = -1;
     p.bitmask_cpu = NULL;
 #endif
     int opt;
-    while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:")) >= 0) {
+    while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) {
         switch(opt) {
             case 'h':
                 usage();
@@ -127,6 +130,7 @@ struct Params input_params(int argc, char **argv) {
             case 'c': p.numa_node_cpu = atoi(optarg); break;
 #if NUMA_MEMCPY
             case 'C': p.bitmask_cpu = numa_parse_nodestring(optarg); break;
+            case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break;
 #endif // NUMA_MEMCPY
 #endif // NUMA
             default:
@@ -222,8 +226,8 @@ int main(int argc, char **argv) {
     }
     numa_node_cpu = p.numa_node_cpu;
-    if (numa_node_cpu != -1) {
-        if (numa_run_on_node(numa_node_cpu) == -1) {
+    if (p.numa_node_cpu != -1) {
+        if (numa_run_on_node(p.numa_node_cpu) == -1) {
             perror("numa_run_on_node");
             numa_node_cpu = -1;
         }
@@ -239,12 +243,21 @@ int main(int argc, char **argv) {
     for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 #if NUMA_MEMCPY
+        numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
         start(&timer, 1, 0);
         if (!numa_node_in_is_local) {
             A_local = (T*) numa_alloc(input_size * sizeof(T));
             B_local = (T*) numa_alloc(input_size * sizeof(T));
         }
         stop(&timer, 1);
+        if (!numa_node_in_is_local) {
+            if (p.numa_node_cpu_memcpy != -1) {
+                if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
+                    perror("numa_run_on_node");
+                    numa_node_cpu_memcpy = -1;
+                }
+            }
+        }
         start(&timer, 2, 0);
         if (!numa_node_in_is_local) {
             memcpy(A_local, A, input_size * sizeof(T));
@@ -254,6 +267,12 @@ int main(int argc, char **argv) {
             B_local = B;
         }
         stop(&timer, 2);
+        if (p.numa_node_cpu != -1) {
+            if (numa_run_on_node(p.numa_node_cpu) == -1) {
+                perror("numa_run_on_node");
+                numa_node_cpu = -1;
+            }
+        }
         mp_pages[0] = A_local;
         if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
             perror("move_pages(A_local)");
@@ -287,10 +306,10 @@ int main(int argc, char **argv) {
         if (rep >= p.n_warmup) {
 #if NUMA_MEMCPY
             printf("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d"
-                " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+                " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
                 " | throughput_MBps=%f",
                 nr_threads, XSTR(T), input_size,
-                numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+                numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
                 input_size * 3 * sizeof(T) / timer.time[0]);
             printf(" throughput_MOpps=%f",
                 input_size / timer.time[0]);
diff --git a/VA/dimes-hetsim-hbm.sh b/VA/dimes-hetsim-hbm.sh
index 7c0df73..d260c1e 100755
--- a/VA/dimes-hetsim-hbm.sh
+++ b/VA/dimes-hetsim-hbm.sh
@@ -11,15 +11,16 @@ fn=log/$(hostname)/dimes-hetsim-hbm
 make -B NUMA=1 NUMA_MEMCPY=1
-echo "CPU single-node operation with setup cost, cpu/out on same node (1/4)" >&2
+echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/4)" >&2
 parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
-    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {ram_local} -t {nr_threads} -w 0 -e 40 \
+    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {ram_local} -M {cpu_memcpy} -t {nr_threads} -w 0 -e 20 \
     ::: nr_threads 1 2 4 8 12 16 \
-    ::: ram_in $(seq 0 15) \
-    ::: cpu $(seq 0 7) $(seq 0 7) \
-    :::+ ram_local $(seq 0 15) \
-    :::+ ram_out $(seq 0 15) \
+    ::: ram_in $(seq 0 15) \
+    :::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \
+    ::: ram_local $(seq 0 15) \
+    :::+ cpu $(seq 0 7) $(seq 0 7) \
+    :::+ ram_out $(seq 0 15) \
     ::: input_size 167772160
 make -B NUMA=1
@@ -47,7 +48,7 @@ parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \
 echo "multi-node execution (4/4)" >&2
 parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \
-    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 40 \
+    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 20 \
     ::: nr_threads 32 48 64 96 128 \
     ::: cpu -1 \
     ::: ram_in $(seq 0 15) \
diff --git a/VA/dimes-hetsim-nmc.sh b/VA/dimes-hetsim-nmc.sh
index cabae57..a176027 100755
--- a/VA/dimes-hetsim-nmc.sh
+++ b/VA/dimes-hetsim-nmc.sh
@@ -40,20 +40,20 @@ make -B NUMA=1
 (
-echo "CPU single-node operation (1/3)" >&2
+echo "CPU single-node operation (1/4)" >&2
 parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
-    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 40 \
+    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 20 \
     ::: ram_in 0 1 \
     ::: ram_out 0 1 \
     ::: cpu 0 1 \
     ::: nr_threads 1 2 4 8 12 16 \
     ::: input_size 167772160
-echo "CPU multi-node operation (2/3)" >&2
+echo "CPU multi-node operation (2/4)" >&2
 parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
-    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 40 \
+    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -t {nr_threads} -w 0 -e 20 \
     ::: ram_in 0 1 \
     ::: ram_out 0 1 \
     ::: cpu -1 \
@@ -66,11 +66,23 @@ make -B NUMA=1 NUMA_MEMCPY=1
 (
-echo "CPU single-node operation with setup cost, cpu/out on same node (3/3)" >&2
+echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (3/4)" >&2
 parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \
-    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {cpu} -t {nr_threads} -w 0 -e 40 \
+    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {cpu} -M {cpu_memcpy} -t {nr_threads} -w 0 -e 20 \
+    ::: ram_in 0 1 \
+    :::+ cpu_memcpy 0 1 \
+    ::: cpu 0 1 \
+    :::+ ram_out 0 1 \
+    ::: nr_threads 1 2 4 8 12 16 \
+    ::: input_size 167772160
+
+echo "CPU single-node operation with setup cost, memcpy node == 0, cpu node == output node (4/4)" >&2
+
+parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \
+    ./va -i {input_size} -a {ram_in} -b {ram_out} -c {cpu} -C {cpu} -M {cpu_memcpy} -t {nr_threads} -w 0 -e 20 \
     ::: ram_in 0 1 \
+    ::: cpu_memcpy 0 \
     ::: cpu 0 1 \
     :::+ ram_out 0 1 \
     ::: nr_threads 1 2 4 8 12 16 \
```