diff options
-rw-r--r-- | VA/baselines/cpu/Makefile | 9 |
-rw-r--r-- | VA/baselines/cpu/app_baseline.c | 137 |
-rwxr-xr-x | VA/dimes-hetsim.sh | 23 |
3 files changed, 145 insertions, 24 deletions
diff --git a/VA/baselines/cpu/Makefile b/VA/baselines/cpu/Makefile index fb7c383..e7c60c0 100644 --- a/VA/baselines/cpu/Makefile +++ b/VA/baselines/cpu/Makefile @@ -1,10 +1,17 @@ +NUMA ?= 0 +FLAGS = + +ifeq (${NUMA}, 1) + FLAGS += -lnuma +endif + .PHONY: all all: va TYPE ?= int32_t va: app_baseline.c - gcc -O2 -o va -fopenmp -DT=${TYPE} app_baseline.c + gcc -O2 -Wall -Wextra -pedantic -march=native -o va -fopenmp -DNUMA=${NUMA} -DT=${TYPE} app_baseline.c ${FLAGS} va_O0: app_baseline.c gcc -o va_O0 -fopenmp app_baseline.c diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c index 8d95479..458cf41 100644 --- a/VA/baselines/cpu/app_baseline.c +++ b/VA/baselines/cpu/app_baseline.c @@ -15,6 +15,18 @@ #include <omp.h> #include "../../support/timer.h" +#if NUMA +#include <numaif.h> +#include <numa.h> + +void* mp_pages[1]; +int mp_status[1]; +int mp_nodes[1]; +int numa_node_in = -1; +int numa_node_out = -1; +int numa_node_cpu = -1; +#endif + #define XSTR(x) STR(x) #define STR(x) #x @@ -25,25 +37,6 @@ static T *A; static T *B; static T *C; -static T *C2; - -/** -* @brief creates a "test file" by filling a buffer of 64MB with pseudo-random values -* @param nr_elements how many 32-bit elements we want the file to be -* @return the buffer address -*/ -void *create_test_file(unsigned int nr_elements) { - srand(0); - A = (T*) malloc(nr_elements * sizeof(T)); - B = (T*) malloc(nr_elements * sizeof(T)); - C = (T*) malloc(nr_elements * sizeof(T)); - - for (int i = 0; i < nr_elements; i++) { - A[i] = (T) (rand()); - B[i] = (T) (rand()); - } - -} /** * @brief compute output in the host @@ -63,6 +56,11 @@ typedef struct Params { int n_reps; int exp; int n_threads; +#if NUMA + struct bitmask* bitmask_in; + struct bitmask* bitmask_out; + int numa_node_cpu; +#endif }Params; void usage() { @@ -88,9 +86,14 @@ struct Params input_params(int argc, char **argv) { p.n_reps = 3; p.exp = 0; p.n_threads = 5; +#if NUMA + p.bitmask_in = NULL; + p.bitmask_out = NULL; + 
p.numa_node_cpu = -1; +#endif int opt; - while((opt = getopt(argc, argv, "hi:w:e:x:t:")) >= 0) { + while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:")) >= 0) { switch(opt) { case 'h': usage(); @@ -101,6 +104,11 @@ struct Params input_params(int argc, char **argv) { case 'e': p.n_reps = atoi(optarg); break; case 'x': p.exp = atoi(optarg); break; case 't': p.n_threads = atoi(optarg); break; +#if NUMA + case 'a': p.bitmask_in = numa_parse_nodestring(optarg); break; + case 'b': p.bitmask_out = numa_parse_nodestring(optarg); break; + case 'c': p.numa_node_cpu = atoi(optarg); break; +#endif default: fprintf(stderr, "\nUnrecognized option!\n"); usage(); @@ -122,7 +130,78 @@ int main(int argc, char **argv) { const unsigned int input_size = p.exp == 0 ? p.input_size * p.n_threads : p.input_size; // Create an input file with arbitrary data. - create_test_file(input_size); + /** + * @brief creates a "test file" by filling a buffer of 64MB with pseudo-random values + * @param nr_elements how many 32-bit elements we want the file to be + * @return the buffer address + */ + srand(0); + +#if NUMA + if (p.bitmask_in) { + numa_set_membind(p.bitmask_in); + numa_free_nodemask(p.bitmask_in); + } + A = (T*) numa_alloc(input_size * sizeof(T)); + B = (T*) numa_alloc(input_size * sizeof(T)); +#else + A = (T*) malloc(input_size * sizeof(T)); + B = (T*) malloc(input_size * sizeof(T)); +#endif + +#if NUMA + if (p.bitmask_out) { + numa_set_membind(p.bitmask_out); + numa_free_nodemask(p.bitmask_out); + } + C = (T*) numa_alloc(input_size * sizeof(T)); +#else + C = (T*) malloc(input_size * sizeof(T)); +#endif + + for (unsigned int i = 0; i < input_size; i++) { + A[i] = (T) (rand()); + B[i] = (T) (rand()); + } + +#if NUMA + struct bitmask *bitmask_all = numa_allocate_nodemask(); + numa_bitmask_setall(bitmask_all); + numa_set_membind(bitmask_all); + numa_free_nodemask(bitmask_all); +#endif + +#if NUMA + mp_pages[0] = A; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + 
perror("move_pages(A)"); + } + else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } + else { + numa_node_in = mp_status[0]; + } + + mp_pages[0] = C; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(C)"); + } + else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } + else { + numa_node_out = mp_status[0]; + } + + numa_node_cpu = p.numa_node_cpu; + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } +#endif Timer timer; @@ -137,9 +216,15 @@ int main(int argc, char **argv) { nr_threads++; if (rep >= p.n_warmup) { - printf("[::] VA CPU | n_threads=%d e_type=%s n_elements=%d " - "| throughput_MBps=%f", + printf("[::] VA CPU | n_threads=%d e_type=%s n_elements=%d" +#if NUMA + " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" +#endif + " | throughput_MBps=%f", nr_threads, XSTR(T), input_size, +#if NUMA + numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), +#endif input_size * 3 * sizeof(T) / timer.time[0]); printf(" throughput_MOpps=%f", input_size / timer.time[0]); @@ -147,9 +232,15 @@ int main(int argc, char **argv) { } } +#if NUMA + numa_free(A, input_size * sizeof(T)); + numa_free(B, input_size * sizeof(T)); + numa_free(C, input_size * sizeof(T)); +#else free(A); free(B); free(C); +#endif return 0; } diff --git a/VA/dimes-hetsim.sh b/VA/dimes-hetsim.sh new file mode 100755 index 0000000..e4c8ee2 --- /dev/null +++ b/VA/dimes-hetsim.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +cd baselines/cpu +make -B NUMA=1 + +# upstream uses 16777216 * int32 == 64 MiB by default +# 2^29 elements * int32 == 2 GiB + +for nr_threads in 1 2 4 8 12 16; do + for cpu in 0 1 2 3 4 5 6 7; do + for ram in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do + ./va -a $ram -b $ram -c $cpu -t $nr_threads -w 0 -e 50 + 
./va -i $(perl -E 'say 2 ** 29') -a $ram -b $ram -c $cpu -t $nr_threads -w 0 -e 10 + done + done +done + +for nr_threads in 32 48 64 96 128; do + for ram in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do + ./va -a $ram -b $ram -c -1 -t $nr_threads -w 0 -e 50 + ./va -i $(perl -E 'say 2 ** 29') -a $ram -b $ram -c -1 -t $nr_threads -w 0 -e 50 + done +done |