summary refs log tree commit diff
diff options
context:
space:
mode:
author Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-25 15:39:31 +0200
committer Birte Kristina Friesel <birte.friesel@uos.de> 2024-07-25 15:39:31 +0200
commit b3d5b55cc00cb7977adfbde6c03599a345357767 (patch)
tree bd3bc11162e7656631d83170bbf12c4727ba61ed
parent 3120c6f5a7064da4da650d1d5029110897206613 (diff)
TRNS: specify memcpy node
-rw-r--r-- TRNS/baselines/cpu/main.cpp 27
-rwxr-xr-x TRNS/dimes-hetsim-hbm.sh 15
2 files changed, 32 insertions, 10 deletions
diff --git a/TRNS/baselines/cpu/main.cpp b/TRNS/baselines/cpu/main.cpp
index 837c75d..c8cccaf 100644
--- a/TRNS/baselines/cpu/main.cpp
+++ b/TRNS/baselines/cpu/main.cpp
@@ -59,6 +59,7 @@ int numa_node_cpu = -1;
#endif
#if NUMA_MEMCPY
+int numa_node_cpu_memcpy = -1;
int numa_node_local = -1;
int numa_node_in_is_local = 0;
#endif
@@ -79,6 +80,7 @@ struct Params {
int numa_node_cpu;
#endif
#if NUMA_MEMCPY
+ int numa_node_cpu_memcpy;
struct bitmask* bitmask_cpu;
#endif
@@ -95,10 +97,11 @@ struct Params {
numa_node_cpu = -1;
#endif
#if NUMA_MEMCPY
+ numa_node_cpu_memcpy = -1;
bitmask_cpu = NULL;
#endif
int opt;
- while((opt = getopt(argc, argv, "ht:w:r:m:n:o:p:a:c:C:")) >= 0) {
+ while((opt = getopt(argc, argv, "ht:w:r:m:n:o:p:a:c:C:M:")) >= 0) {
switch(opt) {
case 'h':
usage();
@@ -116,6 +119,7 @@ struct Params {
case 'c': numa_node_cpu = atoi(optarg); break;
#if NUMA_MEMCPY
case 'C': bitmask_cpu = numa_parse_nodestring(optarg); break;
+ case 'M': numa_node_cpu_memcpy = atoi(optarg); break;
#endif // NUMA_MEMCPY
#endif // NUMA
default:
@@ -256,6 +260,7 @@ int main(int argc, char **argv) {
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
#if NUMA_MEMCPY
+ numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
if(rep >= p.n_warmup)
timer.start("local alloc");
if (!numa_node_in_is_local) {
@@ -264,6 +269,15 @@ int main(int argc, char **argv) {
if(rep >= p.n_warmup)
timer.stop("local alloc");
+ if (!numa_node_in_is_local) {
+ if (p.numa_node_cpu_memcpy != -1) {
+ if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
+
if(rep >= p.n_warmup)
timer.start("memcpy");
if (!numa_node_in_is_local) {
@@ -272,6 +286,13 @@ int main(int argc, char **argv) {
if(rep >= p.n_warmup)
timer.stop("memcpy");
+ if (p.numa_node_cpu != -1) {
+ if (numa_run_on_node(p.numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+
mp_pages[0] = h_local;
if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
perror("move_pages(A_local)");
@@ -344,10 +365,10 @@ int main(int argc, char **argv) {
if (rep >= p.n_warmup) {
#if NUMA_MEMCPY
printf("[::] TRNS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d"
- " numa_node_inout=%d numa_node_cpu=%d numa_distance_inout_cpu=%d"
+ " numa_node_inout=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_inout_cpu=%d"
" | throughput_MBps=%f",
p.n_threads, XSTR(T), in_size,
- numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
+ numa_node_in, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu),
in_size * sizeof(T) / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3")));
printf(" throughput_MOpps=%f",
in_size / (timer.get("Step 1") + timer.get("Step 2") + timer.get("Step 3")));
diff --git a/TRNS/dimes-hetsim-hbm.sh b/TRNS/dimes-hetsim-hbm.sh
index dfb914f..6793f8b 100755
--- a/TRNS/dimes-hetsim-hbm.sh
+++ b/TRNS/dimes-hetsim-hbm.sh
@@ -34,22 +34,23 @@ fn=log/$(hostname)/dimes-hetsim-hbm
make -B NUMA=1 NUMA_MEMCPY=1
-echo "CPU single-node operation with setup cost, cpu/out on same node (1/3)" >&2
+echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/3)" >&2
parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
- ./trns -w 0 -r 5 -p {p} -o 2048 -m 16 -n 8 -t {nr_threads} -a {ram} -c {cpu} -C {ram_local} \
+ ./trns -w 0 -r 5 -p {p} -o 2048 -m 16 -n 8 -t {nr_threads} -a {ram_in} -c {cpu} -C {ram_local} -M {cpu_memcpy} \
::: p 64 128 256 512 768 1024 1536 2048 2304 \
::: nr_threads 1 2 4 8 12 16 \
- ::: ram $(seq 0 15) \
- ::: cpu $(seq 0 7) $(seq 0 7) \
- :::+ ram_local $(seq 0 15) \
+ ::: ram_in $(seq 0 15) \
+ :::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \
+ ::: ram_local $(seq 0 15) \
+ :::+ cpu $(seq 0 7) $(seq 0 7) \
::: input_size 167772160
make -B NUMA=1
echo "CPU single-node operation (2/3)" >&2
-parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
./trns -w 0 -r 5 -p {p} -o 2048 -m 16 -n 8 -t {nr_threads} -a {ram} -c {cpu} \
::: p 64 128 256 512 768 1024 1536 2048 2304 \
::: ram $(seq 0 15) \
@@ -58,7 +59,7 @@ parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
echo "CPU multi-node operation (3/3)" >&2
-parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
+parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \
./trns -w 0 -r 40 -p {p} -o 2048 -m 16 -n 8 -t {nr_threads} -a {ram} -c {cpu} \
::: p 64 128 256 512 768 1024 1536 2048 2304 \
::: ram $(seq 0 15) \