summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- MLP/baselines/cpu/Makefile 23
-rw-r--r-- MLP/baselines/cpu/mlp_openmp.c 170
2 files changed, 144 insertions, 49 deletions
diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile
index 3404638..7eb5f00 100644
--- a/MLP/baselines/cpu/Makefile
+++ b/MLP/baselines/cpu/Makefile
@@ -1,7 +1,28 @@
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+ CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+ LDFLAGS += -lnuma
+endif
+
all: mlp_openmp
mlp_openmp: mlp_openmp.c
- gcc -Wall -Wextra -pedantic -march=native -O2 mlp_openmp.c -o mlp_openmp -fopenmp -std=c99
+ gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} mlp_openmp.c -o mlp_openmp -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -fopenmp -std=c99 ${LDFLAGS}
mlp_openmp_O0: mlp_openmp.c
gcc mlp_openmp.c -o mlp_openmp_O0 -fopenmp -std=c99
diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c
index 8f95e7c..2257e63 100644
--- a/MLP/baselines/cpu/mlp_openmp.c
+++ b/MLP/baselines/cpu/mlp_openmp.c
@@ -11,19 +11,41 @@
#include <getopt.h>
#include <assert.h>
#include <stdint.h>
-#include "../../support/timer.h"
#include "../../support/common.h"
+#if WITH_BENCHMARK
+#include "../../support/timer.h"
+#else
+#define start(...)
+#define stop(...)
+#endif
+
+#if NUMA
+#include <numaif.h>
+#include <numa.h>
+
+void* mp_pages[1];
+int mp_status[1];
+int mp_nodes[1];
+int numa_node_data = -1;
+int numa_node_cpu = -1;
+#endif
+
#define XSTR(x) STR(x)
#define STR(x) #x
+// weights
T** A;
+
+// input/output
T* B;
+
+// intermediate
T* C;
// Create input arrays
-static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){
- for (unsigned int l = 0; l < NUM_LAYERS; l++)
+static void init_data(T** A, unsigned int m_size, unsigned int n_size){
+ for (unsigned int l = 0; l < NUM_LAYERS; l++) {
for (unsigned int i = 0; i < m_size * n_size; i++){
if(i % 100 < 98){
A[l][i] = 0;
@@ -31,6 +53,10 @@ static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){
A[l][i] = (l+i) % 2;
}
}
+ }
+}
+
+static void init_B(T* B, unsigned int n_size){
for (unsigned int i = 0; i < n_size; i++){
if(i % 50 < 48){
B[i] = 0;
@@ -60,7 +86,7 @@ static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size
}
}
-static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) {
+static uint64_t mlp_host_sum(uint64_t n_size) {
uint64_t sum = 0;
for (uint64_t m = 0; m < n_size; m++){
sum += B[m];
@@ -70,55 +96,51 @@ static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) {
// Params ---------------------------------------------------------------------
typedef struct Params {
- char* dpu_type;
- int nr_of_ranks;
int input_size_n;
int input_size_m;
- int n_warmup;
int n_reps;
+#if NUMA
+ struct bitmask* bitmask;
+ int numa_node_cpu;
+#endif
}Params;
void usage() {
fprintf(stderr,
"\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -d <D> DPU type (default=fsim)"
- "\n -r <R> # of ranks (default=2)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -i <I> input size (default=8M elements)"
"\n");
- }
+}
struct Params input_params(int argc, char **argv) {
struct Params p;
- p.dpu_type = "fsim";
- p.nr_of_ranks = 1;
- p.input_size_n = 1 << 9;
- p.input_size_m = 1 << 9;
- p.n_warmup = 2;
- p.n_reps = 3;
+ p.input_size_n = 8192;
+ p.input_size_m = 20480;
+ p.n_reps = 100;
+#if NUMA
+ p.bitmask = NULL;
+ p.numa_node_cpu = -1;
+#endif
int opt;
- while((opt = getopt(argc, argv, "hd:r:i:")) >= 0) {
+ while((opt = getopt(argc, argv, "e:n:m:A:C:")) >= 0) {
switch(opt) {
case 'h':
usage();
exit(0);
break;
- case 'd': p.dpu_type = optarg; break;
- case 'r': p.nr_of_ranks = atoi(optarg); break;
+ case 'e': p.n_reps = atoi(optarg); break;
case 'n': p.input_size_n = atoi(optarg); break;
case 'm': p.input_size_m = atoi(optarg); break;
+#if NUMA
+ case 'A': p.bitmask = numa_parse_nodestring(optarg); break;
+ case 'C': p.numa_node_cpu = atoi(optarg); break;
+#endif
default:
fprintf(stderr, "\nUnrecognized option!\n");
usage();
exit(0);
}
}
- assert(p.nr_of_ranks > 0 && "Invalid # of ranks!");
return p;
}
@@ -129,55 +151,107 @@ void usage() {
int main(int argc, char **argv) {
struct Params p = input_params(argc, argv);
- uint64_t n_size = 8192;
- uint64_t m_size = 20480;
+ uint64_t n_size = p.input_size_n;
+ uint64_t m_size = p.input_size_m;
+#if WITH_BENCHMARK
Timer timer;
+#endif
+
+#if NUMA
+ if (p.bitmask) {
+ numa_set_membind(p.bitmask);
+ numa_free_nodemask(p.bitmask);
+ }
+ A = numa_alloc(NUM_LAYERS * sizeof(T*));
+ for(int l = 0; l < NUM_LAYERS; l++) {
+ A[l] = numa_alloc(n_size*m_size*sizeof(unsigned int));
+ }
+ B = numa_alloc(m_size*sizeof(unsigned int));
+ C = numa_alloc(m_size*sizeof(unsigned int));
+
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ }
+ else {
+ numa_node_data = mp_status[0];
+ }
+
+ numa_node_cpu = p.numa_node_cpu;
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+#else
A = malloc(NUM_LAYERS * sizeof(T*));
- for(int l = 0; l < NUM_LAYERS; l++)
+ for(int l = 0; l < NUM_LAYERS; l++) {
A[l] = malloc(n_size*m_size*sizeof(unsigned int));
+ }
B = malloc(m_size*sizeof(unsigned int));
C = malloc(m_size*sizeof(unsigned int));
+#endif
- for (int i = 0; i < 100; i++) {
- // Create an input file with arbitrary data.
- init_data(A, B, m_size, n_size);
+ // Create an input file with arbitrary data.
+ init_data(A, m_size, n_size);
+
+ for (int i = 0; i < p.n_reps; i++) {
+ init_B(B, n_size);
start(&timer, 0, 0);
mlp_host(C, A, B, n_size, m_size);
stop(&timer, 0);
+#if WITH_BENCHMARK
unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
nr_threads++;
- printf("[::] n_threads=%d e_type=%s n_elements=%lu "
- "| throughput_cpu_omp_MBps=%f\n",
- nr_threads, XSTR(T), n_size * m_size,
- n_size * m_size * sizeof(T) / timer.time[0]);
- printf("[::] n_threads=%d e_type=%s n_elements=%lu "
- "| throughput_cpu_omp_MOpps=%f\n",
- nr_threads, XSTR(T), n_size * m_size,
- n_size * m_size / timer.time[0]);
- printf("[::] n_threads=%d e_type=%s n_elements=%lu |",
+ printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu",
nr_threads, XSTR(T), n_size * m_size);
- printall(&timer, 0);
+#if NUMA
+ printf(" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d",
+ numa_node_data, numa_node_cpu, numa_distance(numa_node_data, numa_node_cpu));
+#endif
+ printf(" | throughput_MBps=%f throughput_MOpps=%f",
+ n_size * m_size * sizeof(T) / timer.time[0],
+ n_size * m_size / timer.time[0]);
+ printf(" latency_us=%f\n",
+ timer.time[0]);
+#endif // WITH_BENCHMARK
}
- uint32_t sum = mlp_host_sum(n_size, m_size);
-
- printf("Kernel ");
- print(&timer, 0, 1);
- printf("\n");
+#if NOP_SYNC
+ for(int rep = 0; rep < 200000; rep++) {
+ asm volatile("nop" ::);
+ }
+#endif
+
+ uint32_t sum = mlp_host_sum(n_size);
printf("SUM = %d \n", sum);
- for(int l = 0; l < NUM_LAYERS; l++)
+#if NUMA
+ for(int l = 0; l < NUM_LAYERS; l++) {
+ numa_free(A[l], n_size*m_size*sizeof(unsigned int));
+ }
+ numa_free(A, NUM_LAYERS * sizeof(T*));
+ numa_free(B, m_size*sizeof(unsigned int));
+ numa_free(C, m_size*sizeof(unsigned int));
+#else
+ for(int l = 0; l < NUM_LAYERS; l++) {
free(A[l]);
+ }
free(A);
free(B);
free(C);
+#endif
return 0;
}