summaryrefslogtreecommitdiff
path: root/SCAN-RSS
diff options
context:
space:
mode:
Diffstat (limited to 'SCAN-RSS')
-rw-r--r--SCAN-RSS/baselines/cpu/Makefile9
-rw-r--r--SCAN-RSS/baselines/cpu/app_baseline.cpp108
-rwxr-xr-xSCAN-RSS/dimes-hetsim-hbm.sh33
-rwxr-xr-xSCAN-RSS/dimes-hetsim-nmc.sh66
-rw-r--r--SCAN-RSS/host/app.c31
5 files changed, 231 insertions, 16 deletions
diff --git a/SCAN-RSS/baselines/cpu/Makefile b/SCAN-RSS/baselines/cpu/Makefile
index 0bdedf6..88ba57d 100644
--- a/SCAN-RSS/baselines/cpu/Makefile
+++ b/SCAN-RSS/baselines/cpu/Makefile
@@ -1,9 +1,16 @@
+NUMA ?= 0
+FLAGS =
+
+ifeq (${NUMA}, 1)
+ FLAGS += -lnuma
+endif
+
TYPE ?= UINT64
all: scan
scan: app_baseline.cpp
- g++ -O2 app_baseline.cpp -fopenmp -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP -lgomp -I/usr/local/cuda-8.0/include -lm -o scan -D${TYPE}
+ g++ -Wall -Wextra -pedantic -march=native -O2 app_baseline.cpp -fopenmp -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP -lgomp -I/usr/local/cuda-8.0/include -lm -o scan -DNUMA=${NUMA} -D${TYPE} ${FLAGS}
run: scan
./scan -i 1258291200 -t 4
diff --git a/SCAN-RSS/baselines/cpu/app_baseline.cpp b/SCAN-RSS/baselines/cpu/app_baseline.cpp
index f54f123..b0ab600 100644
--- a/SCAN-RSS/baselines/cpu/app_baseline.cpp
+++ b/SCAN-RSS/baselines/cpu/app_baseline.cpp
@@ -34,6 +34,18 @@
#include "../../support/common.h"
#include "../../support/timer.h"
+#if NUMA
+#include <numaif.h>
+#include <numa.h>
+
+void* mp_pages[1];
+int mp_status[1];
+int mp_nodes[1];
+int numa_node_in = -1;
+int numa_node_out = -1;
+int numa_node_cpu = -1;
+#endif
+
#define XSTR(x) STR(x)
#define STR(x) #x
@@ -44,7 +56,6 @@
// Pointer declaration
static T* A;
static T* C;
-static T* C2;
/**
* @brief creates input arrays
@@ -76,6 +87,11 @@ typedef struct Params {
int n_reps;
int n_threads;
int exp;
+#if NUMA
+ struct bitmask* bitmask_in;
+ struct bitmask* bitmask_out;
+ int numa_node_cpu;
+#endif
}Params;
void usage() {
@@ -101,9 +117,14 @@ struct Params input_params(int argc, char **argv) {
p.n_reps = 3;
p.exp = 0;
p.n_threads = 8;
+#if NUMA
+ p.bitmask_in = NULL;
+ p.bitmask_out = NULL;
+ p.numa_node_cpu = -1;
+#endif
int opt;
- while((opt = getopt(argc, argv, "hi:w:e:x:t:")) >= 0) {
+ while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:")) >= 0) {
switch(opt) {
case 'h':
usage();
@@ -114,6 +135,11 @@ struct Params input_params(int argc, char **argv) {
case 'e': p.n_reps = atoi(optarg); break;
case 'x': p.exp = atoi(optarg); break;
case 't': p.n_threads = atoi(optarg); break;
+#if NUMA
+ case 'a': p.bitmask_in = numa_parse_nodestring(optarg); break;
+ case 'b': p.bitmask_out = numa_parse_nodestring(optarg); break;
+ case 'c': p.numa_node_cpu = atoi(optarg); break;
+#endif
default:
fprintf(stderr, "\nUnrecognized option!\n");
usage();
@@ -132,23 +158,76 @@ int main(int argc, char **argv) {
struct Params p = input_params(argc, argv);
- unsigned int nr_of_dpus = 1;
-
unsigned int i = 0;
const unsigned int input_size = p.exp == 0 ? p.input_size * p.n_threads : p.input_size;
assert(input_size % (p.n_threads) == 0 && "Input size!");
// Input/output allocation
+
+#if NUMA
+ if (p.bitmask_in) {
+ numa_set_membind(p.bitmask_in);
+ numa_free_nodemask(p.bitmask_in);
+ }
+ A = (T*) numa_alloc(input_size * sizeof(T));
+#else
A = (T*)malloc(input_size * sizeof(T));
- C = (T*)malloc(input_size * sizeof(T));
- T *bufferA = A;
+#endif
+
+#if NUMA
+ if (p.bitmask_out) {
+ numa_set_membind(p.bitmask_out);
+ numa_free_nodemask(p.bitmask_out);
+ }
+ C = (T*) numa_alloc(input_size * sizeof(T));
+#else
+ C = (T*) malloc(input_size * sizeof(T));
+#endif
// Create an input file with arbitrary data.
read_input(A, input_size);
+#if NUMA
+ struct bitmask *bitmask_all = numa_allocate_nodemask();
+ numa_bitmask_setall(bitmask_all);
+ numa_set_membind(bitmask_all);
+ numa_free_nodemask(bitmask_all);
+#endif
+
+#if NUMA
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ }
+ else {
+ numa_node_in = mp_status[0];
+ }
+
+ mp_pages[0] = C;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(C)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ }
+ else {
+ numa_node_out = mp_status[0];
+ }
+
+ numa_node_cpu = p.numa_node_cpu;
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+#endif
+
// Timer declaration
Timer timer;
- float time_gpu = 0;
thrust::omp::vector<T> h_output(input_size);
@@ -189,9 +268,15 @@ int main(int argc, char **argv) {
printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
if(rep >= p.n_warmup) {
- printf("[::] SCAN-RSS CPU | n_threads=%d e_type=%s n_elements=%d "
- "| throughput_cpu_ref_MBps=%f throughput_MBps=%f",
+ printf("[::] SCAN-RSS-CPU | n_threads=%d e_type=%s n_elements=%d"
+#if NUMA
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+#endif
+ " | throughput_cpu_ref_MBps=%f throughput_MBps=%f",
nr_threads, XSTR(T), input_size,
+#if NUMA
+ numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+#endif
input_size * sizeof(T) / timer.time[0],
input_size * sizeof(T) / timer.time[1]);
printf(" throughput_cpu_ref_MOpps=%f throughput_MOpps=%f",
@@ -213,8 +298,13 @@ int main(int argc, char **argv) {
// Deallocation
+#if NUMA
+ numa_free(A, input_size * sizeof(T));
+ numa_free(C, input_size * sizeof(T));
+#else
free(A);
free(C);
+#endif
return 0;
}
diff --git a/SCAN-RSS/dimes-hetsim-hbm.sh b/SCAN-RSS/dimes-hetsim-hbm.sh
new file mode 100755
index 0000000..0a198f5
--- /dev/null
+++ b/SCAN-RSS/dimes-hetsim-hbm.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+cd baselines/cpu
+make -B NUMA=1
+
+mkdir -p log/$(hostname)
+fn=log/$(hostname)/$(date +%Y%m%d)
+
+# upstream uses 251658240 * INT64 == 1.875 GiB
+
+(
+
+echo "single-node execution (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ ./scan -i {input_size} -a {ram} -b {ram} -c {cpu} -t {nr_threads} -w 0 -e 5 -x 1 \
+ ::: nr_threads 1 2 4 8 12 16 \
+ ::: cpu $(seq 0 7) \
+ ::: ram $(seq 0 15) \
+ ::: input_size 251658240
+
+echo "multi-node execution (2/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
+ ./scan -i {input_size} -a {ram} -b {ram} -c {cpu} -t {nr_threads} -w 0 -e 40 -x 1 \
+ ::: nr_threads 32 48 64 96 128 \
+ ::: cpu -1 \
+ ::: ram $(seq 0 15) \
+ ::: input_size 251658240
+
+) > ${fn}.txt
+
+xz -f -v -9 -M 800M ${fn}.txt
diff --git a/SCAN-RSS/dimes-hetsim-nmc.sh b/SCAN-RSS/dimes-hetsim-nmc.sh
new file mode 100755
index 0000000..820ca4d
--- /dev/null
+++ b/SCAN-RSS/dimes-hetsim-nmc.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname) baselines/cpu/log/$(hostname)
+fn=log/$(hostname)/$(date +%Y%m%d)
+
+# upstream uses 251658240 * INT64 == 1.875 GiB
+
+run_benchmark_nmc() {
+ local "$@"
+ sudo limit_ranks_to_numa_node ${numa_rank}
+ if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1; then
+ bin/host_code -w 0 -e 100 -i ${input_size} -x 1
+ fi
+ return $?
+}
+
+export -f run_benchmark_nmc
+
+(
+
+echo "NMC single-node operation (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} numa_rank={numa_rank} \
+ ::: numa_rank 0 1 \
+ ::: nr_dpus 64 128 256 512 768 1024 \
+ ::: input_size 251658240
+
+echo "NMC multi-node operation (2/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} numa_rank={numa_rank} \
+ ::: numa_rank any \
+ ::: nr_dpus 1536 2048 2304 \
+ ::: input_size 251658240
+
+) > ${fn}.txt
+
+xz -f -v -9 -M 800M ${fn}.txt
+
+cd baselines/cpu
+make -B NUMA=1
+
+(
+
+echo "CPU single-node operation (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ ./scan -i {input_size} -a {ram} -b {ram} -c {cpu} -t {nr_threads} -w 0 -e 40 -x 1 \
+ ::: ram 0 1 \
+ ::: cpu 0 1 \
+ ::: nr_threads 1 2 4 8 12 16 32 \
+ ::: input_size 251658240
+
+echo "CPU multi-node operation (1/2)" >&2
+
+parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
+ ./scan -i {input_size} -a {ram} -b {ram} -c {cpu} -t {nr_threads} -w 0 -e 40 -x 1 \
+ ::: ram 0-1 \
+ ::: cpu -1 \
+ ::: nr_threads 48 64 \
+ ::: input_size 251658240
+
+) > ${fn}.txt
+
+xz -f -v -9 -M 800M ${fn}.txt
diff --git a/SCAN-RSS/host/app.c b/SCAN-RSS/host/app.c
index 93c2c47..6771207 100644
--- a/SCAN-RSS/host/app.c
+++ b/SCAN-RSS/host/app.c
@@ -29,10 +29,12 @@
#include <dpu_probe.h>
#endif
+#include <dpu_management.h>
+#include <dpu_target_macros.h>
+
// Pointer declaration
static T* A;
static T* C;
-static T* C2;
// Create input arrays
static void read_input(T* A, unsigned int nr_elements, unsigned int nr_elements_round) {
@@ -71,6 +73,8 @@ int main(int argc, char **argv) {
// Timer declaration
Timer timer;
+ int numa_node_rank = -2;
+
// Allocate DPUs and load binary
#if !WITH_ALLOC_OVERHEAD
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
@@ -98,9 +102,8 @@ int main(int argc, char **argv) {
// Input/output allocation
A = malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
C = malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
- C2 = malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
T *bufferA = A;
- T *bufferC = C2;
+ T *bufferC = C;
// Create an input file with arbitrary data
read_input(A, input_size, input_size_dpu_round * NR_DPUS);
@@ -132,6 +135,23 @@ int main(int argc, char **argv) {
assert(nr_of_dpus == NR_DPUS);
#endif
+ // int prev_rank_id = -1;
+ int rank_id = -1;
+ DPU_FOREACH (dpu_set, dpu) {
+ rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
+ if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
+ numa_node_rank = -1;
+ } else {
+ numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
+ }
+ /*
+ if (rank_id != prev_rank_id) {
+ printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+ prev_rank_id = rank_id;
+ }
+ */
+ }
+
// Compute output on CPU (performance comparison and verification purposes)
if(rep >= p.n_warmup) {
start(&timer, 2, 0);
@@ -314,8 +334,8 @@ int main(int argc, char **argv) {
printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
printf("[::] SCAN-RSS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d b_unroll=%d n_elements=%d",
NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, UNROLL, input_size);
- printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d ",
- WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD);
+ printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_free_us=%f",
timer.time[0],
timer.time[1],
@@ -359,7 +379,6 @@ int main(int argc, char **argv) {
// Deallocation
free(A);
free(C);
- free(C2);
#if !WITH_ALLOC_OVERHEAD
DPU_ASSERT(dpu_free(dpu_set));