1 files changed, 157 insertions, 143 deletions
diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c
index 2257e63..b473d7a 100644
--- a/MLP/baselines/cpu/mlp_openmp.c
+++ b/MLP/baselines/cpu/mlp_openmp.c
@@ -24,7 +24,7 @@
 #include <numaif.h>
 #include <numa.h>
 
-void* mp_pages[1];
+void *mp_pages[1];	// single-entry page list handed to move_pages() in main()
 int mp_status[1];
 int mp_nodes[1];
 int numa_node_data = -1;
@@ -35,223 +35,237 @@ int numa_node_cpu = -1;
 #define STR(x) #x
 
 // weights
-T** A;
+T **A;	// A[l] is layer l's matrix, indexed flat as A[l][m * n_size + n] in mlp_host()
 
 // input/output
-T* B;
+T *B;	// read as the layer input, then overwritten from C after each layer
 
 // intermediate
-T* C;
+T *C;	// per-layer accumulator, passed through max(0, ...) before the copy into B
 
 // Create input arrays
-static void init_data(T** A, unsigned int m_size, unsigned int n_size){
-    for (unsigned int l = 0; l < NUM_LAYERS; l++) {
-		for (unsigned int i = 0; i < m_size * n_size; i++){
-			if(i % 100 < 98){
+static void init_data(T **A, unsigned int m_size, unsigned int n_size)
+{	// Fill all NUM_LAYERS matrices of A (m_size * n_size entries each) with a mostly-zero 0/1 pattern.
+	for (unsigned int l = 0; l < NUM_LAYERS; l++) {
+		for (unsigned int i = 0; i < m_size * n_size; i++) {	// flat index over one layer
+			if (i % 100 < 98) {	// 98 of every 100 consecutive entries stay zero
 				A[l][i] = 0;
-			}else{
-				A[l][i] = (l+i) % 2;
+			} else {
+				A[l][i] = (l + i) % 2;	// the other 2 take the parity of (layer + index)
 			}
 		}
 	}
 }
 
-static void init_B(T* B, unsigned int n_size){
-	for (unsigned int i = 0; i < n_size; i++){
-		if(i % 50 < 48){
+static void init_B(T *B, unsigned int n_size)
+{	// Fill the first n_size entries of B with a mostly-zero 0/1 pattern.
+	for (unsigned int i = 0; i < n_size; i++) {
+		if (i % 50 < 48) {	// 48 of every 50 consecutive entries are zero
 			B[i] = 0;
-		}
-		else{
+		} else {	// the remaining 2 take the parity of the index
 			B[i] = i % 2;
 		}
 	}
 }
 
 // Compute output in the host
-static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
-	for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
-		for (unsigned int m = 0; m < m_size; m++){
+static void mlp_host(T *C, T **A, T *B, unsigned int m_size,
+		     unsigned int n_size)
+{	// For each layer nl: C = max(0, A[nl] * B), then B is overwritten from C.
+	for (unsigned int nl = 0; nl < NUM_LAYERS; nl++) {
+		for (unsigned int m = 0; m < m_size; m++) {	// reset the accumulator for this layer
 			C[m] = 0;
 		}
-		#pragma omp parallel for
-		for (unsigned int m = 0; m < m_size; m++){
-			for (unsigned int n = 0; n < n_size; n++){
+#pragma omp parallel for
+		for (unsigned int m = 0; m < m_size; m++) {	// iteration m writes only C[m]
+			for (unsigned int n = 0; n < n_size; n++) {	// dot product of row m of A[nl] with B
 				C[m] += A[nl][m * n_size + n] * B[n];
 			}
 			C[m] = max(0, C[m]);
 		}
-		for (unsigned int n = 0; n < n_size; n++){
+		for (unsigned int n = 0; n < n_size; n++) {	// NOTE(review): copies n_size entries of C, but only m_size were computed above - confirm intended
 			B[n] = C[n];
 		}
 	}
 }
 
-static uint64_t mlp_host_sum(uint64_t n_size) {
-  uint64_t sum = 0;
-  for (uint64_t m = 0; m < n_size; m++){
-    sum += B[m];
-  }
-  return sum;
+static uint64_t mlp_host_sum(uint64_t n_size)
+{	// Return the sum of the first n_size entries of the global B.
+	uint64_t sum = 0;
+	for (uint64_t m = 0; m < n_size; m++) {
+		sum += B[m];	// each element is accumulated into the unsigned 64-bit total
+	}
+	return sum;
 }
 
 // Params ---------------------------------------------------------------------
 typedef struct Params {
-  int   input_size_n;
-  int   input_size_m;
-  int   n_reps;
+	int input_size_n;	// -n option; input_params() defaults it to 8192
+	int input_size_m;	// -m option; input_params() defaults it to 20480
+	int n_reps;	// -e option; repetitions of the timed loop in main(), default 100
 #if NUMA
-  struct bitmask* bitmask;
-  int numa_node_cpu;
+	struct bitmask *bitmask;	// -A option, parsed by numa_parse_nodestring(); NULL if not given
+	int numa_node_cpu;	// -C option; -1 if not given
 #endif
-}Params;
+} Params;
 
-void usage() {
-  fprintf(stderr,
-    "\nUsage:  ./program [options]"
-    "\n");
+void usage()
+{	// Print the usage banner to stderr.
+	fprintf(stderr, "\nUsage:  ./program [options]" "\n");
 }
 
-  struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.input_size_n  = 8192;
-    p.input_size_m  = 20480;
-    p.n_reps        = 100;
+struct Params input_params(int argc, char **argv)
+{	// Parse the command line into a Params value, starting from built-in defaults.
+	struct Params p;
+	p.input_size_n = 8192;
+	p.input_size_m = 20480;
+	p.n_reps = 100;
 #if NUMA
-    p.bitmask = NULL;
-    p.numa_node_cpu = -1;
+	p.bitmask = NULL;
+	p.numa_node_cpu = -1;
 #endif
 
-    int opt;
-    while((opt = getopt(argc, argv, "e:n:m:A:C:")) >= 0) {
-      switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'e': p.n_reps = atoi(optarg); break;
-        case 'n': p.input_size_n    = atoi(optarg); break;
-        case 'm': p.input_size_m    = atoi(optarg); break;
+	int opt;
+	while ((opt = getopt(argc, argv, "e:n:m:A:C:")) >= 0) {	// every listed option takes an argument
+		switch (opt) {
+		case 'h':	// NOTE(review): 'h' is not in the optstring, so getopt() never returns it; -h ends up in default
+			usage();
+			exit(0);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);	// atoi() reports no error on malformed input
+			break;
+		case 'n':
+			p.input_size_n = atoi(optarg);
+			break;
+		case 'm':
+			p.input_size_m = atoi(optarg);
+			break;
 #if NUMA
-        case 'A': p.bitmask         = numa_parse_nodestring(optarg); break;
-        case 'C': p.numa_node_cpu  = atoi(optarg); break;
+		case 'A':
+			p.bitmask = numa_parse_nodestring(optarg);	// released in main() via numa_free_nodemask()
+			break;
+		case 'C':
+			p.numa_node_cpu = atoi(optarg);
+			break;
 #endif
-        default:
-        fprintf(stderr, "\nUnrecognized option!\n");
-        usage();
-        exit(0);
-      }
-    }
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);	// NOTE(review): exits with status 0 even though the option was rejected
+		}
+	}
 
-    return p;
-  }
+	return p;
+}
 
   /**
   * @brief Main of the Host Application.
   */
-  int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{	// Allocate A/B/C, run n_reps timed mlp_host() passes, print a checksum of B, then free everything.
 
-    struct Params p = input_params(argc, argv);
-    uint64_t n_size = p.input_size_n;
-    uint64_t m_size = p.input_size_m;
+	struct Params p = input_params(argc, argv);
+	uint64_t n_size = p.input_size_n;
+	uint64_t m_size = p.input_size_m;
 
 #if WITH_BENCHMARK
-    Timer timer;
+	Timer timer;	// NOTE(review): declared only under WITH_BENCHMARK, but start()/stop() below use it unconditionally
 #endif
 
 #if NUMA
-    if (p.bitmask) {
-        numa_set_membind(p.bitmask);
-        numa_free_nodemask(p.bitmask);
-    }
-    A = numa_alloc(NUM_LAYERS * sizeof(T*));
-    for(int l = 0; l < NUM_LAYERS; l++) {
-        A[l] = numa_alloc(n_size*m_size*sizeof(unsigned int));
-    }
-    B = numa_alloc(m_size*sizeof(unsigned int));
-    C = numa_alloc(m_size*sizeof(unsigned int));
-
-    mp_pages[0] = A;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(A)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_data = mp_status[0];
-    }
-
-    numa_node_cpu = p.numa_node_cpu;
-    if (numa_node_cpu != -1) {
-        if (numa_run_on_node(numa_node_cpu) == -1) {
-            perror("numa_run_on_node");
-            numa_node_cpu = -1;
-        }
-    }
+	if (p.bitmask) {
+		numa_set_membind(p.bitmask);	// applied before the allocations below
+		numa_free_nodemask(p.bitmask);
+	}
+	A = numa_alloc(NUM_LAYERS * sizeof(T *));
+	for (int l = 0; l < NUM_LAYERS; l++) {
+		A[l] = numa_alloc(n_size * m_size * sizeof(unsigned int));	// NOTE(review): sized with sizeof(unsigned int), not sizeof(T) - confirm T is never wider
+	}
+	B = numa_alloc(m_size * sizeof(unsigned int));
+	C = numa_alloc(m_size * sizeof(unsigned int));
+
+	mp_pages[0] = A;	// this is the pointer table A, not the weight data A[l]
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {	// nodes == NULL: presumably query-only, status lands in mp_status - see move_pages(2)
+		perror("move_pages(A)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_data = mp_status[0];
+	}
+
+	numa_node_cpu = p.numa_node_cpu;
+	if (numa_node_cpu != -1) {	// only pin when -C was given
+		if (numa_run_on_node(numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;	// report the CPU node as unset when pinning failed
+		}
+	}
 #else
-    A = malloc(NUM_LAYERS * sizeof(T*));
-    for(int l = 0; l < NUM_LAYERS; l++) {
-        A[l] = malloc(n_size*m_size*sizeof(unsigned int));
-    }
-    B = malloc(m_size*sizeof(unsigned int));
-    C = malloc(m_size*sizeof(unsigned int));
+	A = malloc(NUM_LAYERS * sizeof(T *));	// NOTE(review): none of the malloc() results below are checked before use
+	for (int l = 0; l < NUM_LAYERS; l++) {
+		A[l] = malloc(n_size * m_size * sizeof(unsigned int));	// NOTE(review): sized with sizeof(unsigned int), not sizeof(T)
+	}
+	B = malloc(m_size * sizeof(unsigned int));
+	C = malloc(m_size * sizeof(unsigned int));
 #endif
 
-    // Create an input file with arbitrary data.
-    init_data(A, m_size, n_size);
+	// Create an input file with arbitrary data.
+	init_data(A, m_size, n_size);
 
-    for (int i = 0; i < p.n_reps; i++) {
-        init_B(B, n_size);
+	for (int i = 0; i < p.n_reps; i++) {
+		init_B(B, n_size);	// re-seeded every repetition because mlp_host() overwrites B
 
-        start(&timer, 0, 0);
-        mlp_host(C, A, B, n_size, m_size);
-        stop(&timer, 0);
+		start(&timer, 0, 0);
+		mlp_host(C, A, B, n_size, m_size);	// NOTE(review): passes (n_size, m_size) where mlp_host() declares (m_size, n_size) - confirm the swap is intended
+		stop(&timer, 0);
 
 #if WITH_BENCHMARK
-        unsigned int nr_threads = 0;
+		unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-        nr_threads++;
+		nr_threads++;	// one atomic increment per thread of the parallel region, i.e. the thread count
 
-        printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu",
-            nr_threads, XSTR(T), n_size * m_size);
+		printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu",
+		       nr_threads, XSTR(T), n_size * m_size);
 #if NUMA
-        printf(" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d",
-            numa_node_data, numa_node_cpu, numa_distance(numa_node_data, numa_node_cpu));
+		printf
+		    (" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d",
+		     numa_node_data, numa_node_cpu,
+		     numa_distance(numa_node_data, numa_node_cpu));
 #endif
-        printf(" | throughput_MBps=%f throughput_MOpps=%f",
-            n_size * m_size * sizeof(T) / timer.time[0],
-            n_size * m_size / timer.time[0]);
-        printf(" latency_us=%f\n",
-            timer.time[0]);
-#endif // WITH_BENCHMARK
-    }
+		printf(" | throughput_MBps=%f throughput_MOpps=%f",
+		       n_size * m_size * sizeof(T) / timer.time[0],
+		       n_size * m_size / timer.time[0]);
+		printf(" latency_us=%f\n", timer.time[0]);
+#endif				// WITH_BENCHMARK
+	}
 
 #if NOP_SYNC
-    for(int rep = 0; rep < 200000; rep++) {
-        asm volatile("nop" ::);
-    }
+	for (int rep = 0; rep < 200000; rep++) {	// fixed run of 200000 nop instructions
+		asm volatile ("nop"::);
+	}
 #endif
 
-    uint32_t sum = mlp_host_sum(n_size);
+	uint32_t sum = mlp_host_sum(n_size);	// NOTE(review): mlp_host_sum() returns uint64_t; the value is truncated to 32 bits here
 
-    printf("SUM = %d \n", sum);
+	printf("SUM = %d \n", sum);
 
 #if NUMA
-    for(int l = 0; l < NUM_LAYERS; l++) {
-        numa_free(A[l], n_size*m_size*sizeof(unsigned int));
-    }
-    numa_free(A, NUM_LAYERS * sizeof(T*));
-    numa_free(B, m_size*sizeof(unsigned int));
-    numa_free(C, m_size*sizeof(unsigned int));
+	for (int l = 0; l < NUM_LAYERS; l++) {
+		numa_free(A[l], n_size * m_size * sizeof(unsigned int));	// same size expression as the matching numa_alloc()
+	}
+	numa_free(A, NUM_LAYERS * sizeof(T *));
+	numa_free(B, m_size * sizeof(unsigned int));
+	numa_free(C, m_size * sizeof(unsigned int));
 #else
-    for(int l = 0; l < NUM_LAYERS; l++) {
-        free(A[l]);
-    }
-    free(A);
-    free(B);
-    free(C);
+	for (int l = 0; l < NUM_LAYERS; l++) {	// release each layer before the pointer table itself
+		free(A[l]);
+	}
+	free(A);
+	free(B);
+	free(C);
 #endif
 
-    return 0;
+	return 0;
 }