2 files changed, 144 insertions, 49 deletions
diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile
index 3404638..7eb5f00 100644
--- a/MLP/baselines/cpu/Makefile
+++ b/MLP/baselines/cpu/Makefile
@@ -1,7 +1,28 @@
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+	CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+	CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+	LDFLAGS += -lnuma
+endif
+
 all: mlp_openmp
 
 mlp_openmp: mlp_openmp.c
-	gcc -Wall -Wextra -pedantic -march=native -O2 mlp_openmp.c -o mlp_openmp -fopenmp -std=c99
+	gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} mlp_openmp.c -o mlp_openmp -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -fopenmp -std=c99 ${LDFLAGS}
 
 mlp_openmp_O0: mlp_openmp.c
 	gcc mlp_openmp.c -o mlp_openmp_O0 -fopenmp -std=c99
diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c
index 8f95e7c..2257e63 100644
--- a/MLP/baselines/cpu/mlp_openmp.c
+++ b/MLP/baselines/cpu/mlp_openmp.c
@@ -11,19 +11,41 @@
 #include <getopt.h>
 #include <assert.h>
 #include <stdint.h>
-#include "../../support/timer.h"
 #include "../../support/common.h"
 
+#if WITH_BENCHMARK // set from the Makefile via -DWITH_BENCHMARK=${benchmark}
+#include "../../support/timer.h"
+#else
+#define start(...) // expands to nothing, so start() calls compile without timer.h
+#define stop(...) // expands to nothing, so stop() calls compile without timer.h
+#endif
+
+#if NUMA // set from the Makefile via -DNUMA=${numa}
+#include <numaif.h> // move_pages()
+#include <numa.h> // numa_alloc(), numa_free(), numa_set_membind(), numa_run_on_node(), ...
+
+void* mp_pages[1]; // page address handed to move_pages() in main
+int mp_status[1]; // per-page result written by move_pages()
+int mp_nodes[1]; // NOTE(review): not referenced in the code visible here
+int numa_node_data = -1; // node move_pages() reports for A; stays -1 if the query fails
+int numa_node_cpu = -1; // node requested with -C; -1 if unset or numa_run_on_node() failed
+#endif
+
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
+// weights
 T** A;
+
+// input/output
 T* B;
+
+// intermediate
 T* C;
 
 // Create input arrays
-static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){
-    for (unsigned int l = 0; l < NUM_LAYERS; l++)
+static void init_data(T** A, unsigned int m_size, unsigned int n_size){
+    for (unsigned int l = 0; l < NUM_LAYERS; l++) {
 		for (unsigned int i = 0; i < m_size * n_size; i++){
 			if(i % 100 < 98){
 				A[l][i] = 0;
@@ -31,6 +53,10 @@ static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){
 				A[l][i] = (l+i) % 2;
 			}
 		}
+	}
+}
+
+static void init_B(T* B, unsigned int n_size){
 	for (unsigned int i = 0; i < n_size; i++){
 		if(i % 50 < 48){
 			B[i] = 0;
@@ -60,7 +86,7 @@ static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size
 	}
 }
 
-static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) {
+static uint64_t mlp_host_sum(uint64_t n_size) {
   uint64_t sum = 0;
   for (uint64_t m = 0; m < n_size; m++){
     sum += B[m];
@@ -70,55 +96,51 @@ static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) {
 
 // Params ---------------------------------------------------------------------
 typedef struct Params {
-  char* dpu_type;
-  int   nr_of_ranks;
   int   input_size_n;
   int   input_size_m;
-  int   n_warmup;
   int   n_reps;
+#if NUMA
+  struct bitmask* bitmask; // node mask parsed from -A; NULL when -A was not given
+  int numa_node_cpu; // node number from -C; -1 when -C was not given
+#endif
 }Params;
 
 void usage() {
   fprintf(stderr,
     "\nUsage:  ./program [options]"
-    "\n"
-    "\nGeneral options:"
-    "\n    -h        help"
-    "\n    -d <D>    DPU type (default=fsim)"
-    "\n    -r <R>    # of ranks (default=2)"
-    "\n"
-    "\nBenchmark-specific options:"
-    "\n    -i <I>    input size (default=8M elements)"
     "\n");
-  }
+}
 
   struct Params input_params(int argc, char **argv) {
     struct Params p;
-    p.dpu_type      = "fsim";
-    p.nr_of_ranks   = 1;
-    p.input_size_n  = 1 << 9;
-    p.input_size_m  = 1 << 9;
-    p.n_warmup      = 2;
-    p.n_reps        = 3;
+    p.input_size_n  = 8192;
+    p.input_size_m  = 20480;
+    p.n_reps        = 100;
+#if NUMA
+    p.bitmask = NULL;
+    p.numa_node_cpu = -1;
+#endif
 
     int opt;
-    while((opt = getopt(argc, argv, "hd:r:i:")) >= 0) {
+    while((opt = getopt(argc, argv, "e:n:m:A:C:")) >= 0) {
       switch(opt) {
         case 'h':
         usage();
         exit(0);
         break;
-        case 'd': p.dpu_type        = optarg; break;
-        case 'r': p.nr_of_ranks     = atoi(optarg); break;
+        case 'e': p.n_reps = atoi(optarg); break;
         case 'n': p.input_size_n    = atoi(optarg); break;
         case 'm': p.input_size_m    = atoi(optarg); break;
+#if NUMA
+        case 'A': p.bitmask         = numa_parse_nodestring(optarg); break;
+        case 'C': p.numa_node_cpu  = atoi(optarg); break;
+#endif
         default:
         fprintf(stderr, "\nUnrecognized option!\n");
         usage();
         exit(0);
       }
     }
-    assert(p.nr_of_ranks > 0 && "Invalid # of ranks!");
 
     return p;
   }
@@ -129,55 +151,107 @@ void usage() {
   int main(int argc, char **argv) {
 
     struct Params p = input_params(argc, argv);
-    uint64_t n_size = 8192;
-    uint64_t m_size = 20480;
+    uint64_t n_size = p.input_size_n;
+    uint64_t m_size = p.input_size_m;
 
+#if WITH_BENCHMARK
     Timer timer;
+#endif
+
+#if NUMA
+    if (p.bitmask) {
+        numa_set_membind(p.bitmask); // bind the allocations below to the nodes given with -A
+        numa_free_nodemask(p.bitmask);
+    }
+    A = numa_alloc(NUM_LAYERS * sizeof(T*));
+    for(int l = 0; l < NUM_LAYERS; l++) {
+        A[l] = numa_alloc(n_size*m_size*sizeof(unsigned int));
+    }
+    B = numa_alloc(m_size*sizeof(unsigned int));
+    C = numa_alloc(m_size*sizeof(unsigned int));
+
+    mp_pages[0] = A;
+    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { // nodes == NULL: query only, nothing is moved
+        perror("move_pages(A)");
+    }
+    else if (mp_status[0] < 0) {
+        printf("move_pages error: %d\n", mp_status[0]); // newline keeps this off the next "[::]" result line
+    }
+    else {
+        numa_node_data = mp_status[0];
+    }
+
+    numa_node_cpu = p.numa_node_cpu;
+    if (numa_node_cpu != -1) {
+        if (numa_run_on_node(numa_node_cpu) == -1) {
+            perror("numa_run_on_node");
+            numa_node_cpu = -1; // report "unpinned" in the output below
+        }
+    }
+#else
     A = malloc(NUM_LAYERS * sizeof(T*));
-    for(int l = 0; l < NUM_LAYERS; l++)
+    for(int l = 0; l < NUM_LAYERS; l++) {
         A[l] = malloc(n_size*m_size*sizeof(unsigned int));
+    }
     B = malloc(m_size*sizeof(unsigned int));
     C = malloc(m_size*sizeof(unsigned int));
+#endif
 
-    for (int i = 0; i < 100; i++) {
-        // Create an input file with arbitrary data.
-        init_data(A, B, m_size, n_size);
+    // Create an input file with arbitrary data.
+    init_data(A, m_size, n_size);
+
+    for (int i = 0; i < p.n_reps; i++) {
+        init_B(B, n_size); // B is re-initialised before every repetition
 
         start(&timer, 0, 0);
         mlp_host(C, A, B, n_size, m_size);
         stop(&timer, 0);
 
+#if WITH_BENCHMARK
         unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
         nr_threads++;
 
-        printf("[::] n_threads=%d e_type=%s n_elements=%lu "
-            "| throughput_cpu_omp_MBps=%f\n",
-            nr_threads, XSTR(T), n_size * m_size,
-            n_size * m_size * sizeof(T) / timer.time[0]);
-        printf("[::] n_threads=%d e_type=%s n_elements=%lu "
-            "| throughput_cpu_omp_MOpps=%f\n",
-            nr_threads, XSTR(T), n_size * m_size,
-            n_size * m_size / timer.time[0]);
-        printf("[::] n_threads=%d e_type=%s n_elements=%lu |",
+        printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu",
             nr_threads, XSTR(T), n_size * m_size);
-        printall(&timer, 0);
+#if NUMA
+        printf(" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d",
+            numa_node_data, numa_node_cpu, numa_distance(numa_node_data, numa_node_cpu));
+#endif
+        printf(" | throughput_MBps=%f throughput_MOpps=%f",
+            n_size * m_size * sizeof(T) / timer.time[0],
+            n_size * m_size / timer.time[0]);
+        printf(" latency_us=%f\n",
+            timer.time[0]);
+#endif // WITH_BENCHMARK
     }
 
-    uint32_t sum = mlp_host_sum(n_size, m_size);
-   
-    printf("Kernel ");
-    print(&timer, 0, 1);
-    printf("\n");
+#if NOP_SYNC
+    for(int rep = 0; rep < 200000; rep++) {
+        __asm__ __volatile__("nop" ::); // plain `asm` is not a keyword under -std=c99
+    }
+#endif
+
+    uint32_t sum = mlp_host_sum(n_size);
 
     printf("SUM = %d \n", sum);
 
-    for(int l = 0; l < NUM_LAYERS; l++)
+#if NUMA
+    for(int l = 0; l < NUM_LAYERS; l++) {
+        numa_free(A[l], n_size*m_size*sizeof(unsigned int));
+    }
+    numa_free(A, NUM_LAYERS * sizeof(T*));
+    numa_free(B, m_size*sizeof(unsigned int));
+    numa_free(C, m_size*sizeof(unsigned int));
+#else
+    for(int l = 0; l < NUM_LAYERS; l++) {
         free(A[l]);
+    }
     free(A);
     free(B);
     free(C);
+#endif
 
     return 0;
 }