diff options
Diffstat (limited to 'MLP/baselines/cpu')
-rw-r--r-- | MLP/baselines/cpu/Makefile | 23 | ||||
-rw-r--r-- | MLP/baselines/cpu/mlp_openmp.c | 170 |
2 files changed, 144 insertions, 49 deletions
diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile index 3404638..7eb5f00 100644 --- a/MLP/baselines/cpu/Makefile +++ b/MLP/baselines/cpu/Makefile @@ -1,7 +1,28 @@ +benchmark ?= 1 +debug ?= 0 +native ?= 1 +nop_sync ?= 0 +numa ?= 0 + +CFLAGS = +LDFLAGS = + +ifeq (${debug}, 1) + CFLAGS += -g +endif + +ifeq (${native}, 1) + CFLAGS += -march=native +endif + +ifeq (${numa}, 1) + LDFLAGS += -lnuma +endif + all: mlp_openmp mlp_openmp: mlp_openmp.c - gcc -Wall -Wextra -pedantic -march=native -O2 mlp_openmp.c -o mlp_openmp -fopenmp -std=c99 + gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} mlp_openmp.c -o mlp_openmp -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -fopenmp -std=c99 ${LDFLAGS} mlp_openmp_O0: mlp_openmp.c gcc mlp_openmp.c -o mlp_openmp_O0 -fopenmp -std=c99 diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c index 8f95e7c..2257e63 100644 --- a/MLP/baselines/cpu/mlp_openmp.c +++ b/MLP/baselines/cpu/mlp_openmp.c @@ -11,19 +11,41 @@ #include <getopt.h> #include <assert.h> #include <stdint.h> -#include "../../support/timer.h" #include "../../support/common.h" +#if WITH_BENCHMARK +#include "../../support/timer.h" +#else +#define start(...) +#define stop(...) +#endif + +#if NUMA +#include <numaif.h> +#include <numa.h> + +void* mp_pages[1]; +int mp_status[1]; +int mp_nodes[1]; +int numa_node_data = -1; +int numa_node_cpu = -1; +#endif + #define XSTR(x) STR(x) #define STR(x) #x +// weights T** A; + +// input/output T* B; + +// intermediate T* C; // Create input arrays -static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){ - for (unsigned int l = 0; l < NUM_LAYERS; l++) +static void init_data(T** A, unsigned int m_size, unsigned int n_size){ + for (unsigned int l = 0; l < NUM_LAYERS; l++) { for (unsigned int i = 0; i < m_size * n_size; i++){ if(i % 100 < 98){ A[l][i] = 0; @@ -31,6 +53,10 @@ static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){ A[l][i] = (l+i) % 2; } } + } +} + +static void init_B(T* B, unsigned int n_size){ for (unsigned int i = 0; i < n_size; i++){ if(i % 50 < 48){ B[i] = 0; @@ -60,7 +86,7 @@ static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size } } -static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) { +static uint64_t mlp_host_sum(uint64_t n_size) { uint64_t sum = 0; for (uint64_t m = 0; m < n_size; m++){ sum += B[m]; @@ -70,55 +96,51 @@ static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) { // Params --------------------------------------------------------------------- typedef struct Params { - char* dpu_type; - int nr_of_ranks; int input_size_n; int input_size_m; - int n_warmup; int n_reps; +#if NUMA + struct bitmask* bitmask; + int numa_node_cpu; +#endif }Params; void usage() { fprintf(stderr, "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -d <D> DPU type (default=fsim)" - "\n -r <R> # of ranks (default=2)" - "\n" - "\nBenchmark-specific options:" - "\n -i <I> input size (default=8M elements)" "\n"); - } +} struct Params input_params(int argc, char **argv) { struct Params p; - p.dpu_type = "fsim"; - p.nr_of_ranks = 1; - p.input_size_n = 1 << 9; - p.input_size_m = 1 << 9; - p.n_warmup = 2; - p.n_reps = 3; + p.input_size_n = 8192; + p.input_size_m = 20480; + p.n_reps = 100; +#if NUMA + p.bitmask = NULL; + p.numa_node_cpu = -1; +#endif int opt; - while((opt = getopt(argc, argv, "hd:r:i:")) >= 0) { + while((opt = getopt(argc, argv, "e:n:m:A:C:")) >= 0) { switch(opt) { case 'h': usage(); exit(0); break; - case 'd': p.dpu_type = optarg; break; - case 'r': p.nr_of_ranks = atoi(optarg); break; + case 'e': p.n_reps = atoi(optarg); break; case 'n': p.input_size_n = atoi(optarg); break; case 'm': p.input_size_m = atoi(optarg); break; +#if NUMA + case 'A': p.bitmask = numa_parse_nodestring(optarg); break; + case 'C': p.numa_node_cpu = atoi(optarg); break; +#endif default: fprintf(stderr, "\nUnrecognized option!\n"); usage(); exit(0); } } - assert(p.nr_of_ranks > 0 && "Invalid # of ranks!"); return p; } @@ -129,55 +151,107 @@ void usage() { int main(int argc, char **argv) { struct Params p = input_params(argc, argv); - uint64_t n_size = 8192; - uint64_t m_size = 20480; + uint64_t n_size = p.input_size_n; + uint64_t m_size = p.input_size_m; +#if WITH_BENCHMARK Timer timer; +#endif + +#if NUMA + if (p.bitmask) { + numa_set_membind(p.bitmask); + numa_free_nodemask(p.bitmask); + } + A = numa_alloc(NUM_LAYERS * sizeof(T*)); + for(int l = 0; l < NUM_LAYERS; l++) { + A[l] = numa_alloc(n_size*m_size*sizeof(unsigned int)); + } + B = numa_alloc(m_size*sizeof(unsigned int)); + C = numa_alloc(m_size*sizeof(unsigned int)); + + mp_pages[0] = A; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A)"); + } + else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } + else { + numa_node_data = mp_status[0]; + } + + numa_node_cpu = p.numa_node_cpu; + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } +#else A = malloc(NUM_LAYERS * sizeof(T*)); - for(int l = 0; l < NUM_LAYERS; l++) + for(int l = 0; l < NUM_LAYERS; l++) { A[l] = malloc(n_size*m_size*sizeof(unsigned int)); + } B = malloc(m_size*sizeof(unsigned int)); C = malloc(m_size*sizeof(unsigned int)); +#endif - for (int i = 0; i < 100; i++) { - // Create an input file with arbitrary data. - init_data(A, B, m_size, n_size); + // Create an input file with arbitrary data. + init_data(A, m_size, n_size); + + for (int i = 0; i < p.n_reps; i++) { + init_B(B, n_size); start(&timer, 0, 0); mlp_host(C, A, B, n_size, m_size); stop(&timer, 0); +#if WITH_BENCHMARK unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic nr_threads++; - printf("[::] n_threads=%d e_type=%s n_elements=%lu " - "| throughput_cpu_omp_MBps=%f\n", - nr_threads, XSTR(T), n_size * m_size, - n_size * m_size * sizeof(T) / timer.time[0]); - printf("[::] n_threads=%d e_type=%s n_elements=%lu " - "| throughput_cpu_omp_MOpps=%f\n", - nr_threads, XSTR(T), n_size * m_size, - n_size * m_size / timer.time[0]); - printf("[::] n_threads=%d e_type=%s n_elements=%lu |", + printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu", nr_threads, XSTR(T), n_size * m_size); - printall(&timer, 0); +#if NUMA + printf(" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d", + numa_node_data, numa_node_cpu, numa_distance(numa_node_data, numa_node_cpu)); +#endif + printf(" | throughput_MBps=%f throughput_MOpps=%f", + n_size * m_size * sizeof(T) / timer.time[0], + n_size * m_size / timer.time[0]); + printf(" latency_us=%f\n", + timer.time[0]); +#endif // WITH_BENCHMARK } - uint32_t sum = mlp_host_sum(n_size, m_size); - - printf("Kernel "); - print(&timer, 0, 1); - printf("\n"); +#if NOP_SYNC + for(int rep = 0; rep < 200000; rep++) { + asm volatile("nop" ::); + } +#endif + + uint32_t sum = mlp_host_sum(n_size); printf("SUM = %d \n", sum); - for(int l = 0; l < NUM_LAYERS; l++) +#if NUMA + for(int l = 0; l < NUM_LAYERS; l++) { + numa_free(A[l], n_size*m_size*sizeof(unsigned int)); + } + numa_free(A, NUM_LAYERS * sizeof(T*)); + numa_free(B, m_size*sizeof(unsigned int)); + numa_free(C, m_size*sizeof(unsigned int)); +#else + for(int l = 0; l < NUM_LAYERS; l++) { free(A[l]); + } free(A); free(B); free(C); +#endif return 0; } |