summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MLP/baselines/cpu/mlp_openmp.c300
1 files changed, 157 insertions, 143 deletions
diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c
index 2257e63..b473d7a 100644
--- a/MLP/baselines/cpu/mlp_openmp.c
+++ b/MLP/baselines/cpu/mlp_openmp.c
@@ -24,7 +24,7 @@
#include <numaif.h>
#include <numa.h>
-void* mp_pages[1];
+void *mp_pages[1];
int mp_status[1];
int mp_nodes[1];
int numa_node_data = -1;
@@ -35,223 +35,237 @@ int numa_node_cpu = -1;
#define STR(x) #x
// weights
-T** A;
+T **A;
// input/output
-T* B;
+T *B;
// intermediate
-T* C;
+T *C;
// Create input arrays
-static void init_data(T** A, unsigned int m_size, unsigned int n_size){
- for (unsigned int l = 0; l < NUM_LAYERS; l++) {
- for (unsigned int i = 0; i < m_size * n_size; i++){
- if(i % 100 < 98){
+static void init_data(T **A, unsigned int m_size, unsigned int n_size)
+{
+ for (unsigned int l = 0; l < NUM_LAYERS; l++) {
+ for (unsigned int i = 0; i < m_size * n_size; i++) {
+ if (i % 100 < 98) {
A[l][i] = 0;
- }else{
- A[l][i] = (l+i) % 2;
+ } else {
+ A[l][i] = (l + i) % 2;
}
}
}
}
-static void init_B(T* B, unsigned int n_size){
- for (unsigned int i = 0; i < n_size; i++){
- if(i % 50 < 48){
+static void init_B(T *B, unsigned int n_size)
+{
+ for (unsigned int i = 0; i < n_size; i++) {
+ if (i % 50 < 48) {
B[i] = 0;
- }
- else{
+ } else {
B[i] = i % 2;
}
}
}
// Compute output in the host
-static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
- for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
- for (unsigned int m = 0; m < m_size; m++){
+static void mlp_host(T *C, T **A, T *B, unsigned int m_size,
+ unsigned int n_size)
+{
+ for (unsigned int nl = 0; nl < NUM_LAYERS; nl++) {
+ for (unsigned int m = 0; m < m_size; m++) {
C[m] = 0;
}
- #pragma omp parallel for
- for (unsigned int m = 0; m < m_size; m++){
- for (unsigned int n = 0; n < n_size; n++){
+#pragma omp parallel for
+ for (unsigned int m = 0; m < m_size; m++) {
+ for (unsigned int n = 0; n < n_size; n++) {
C[m] += A[nl][m * n_size + n] * B[n];
}
C[m] = max(0, C[m]);
}
- for (unsigned int n = 0; n < n_size; n++){
+ for (unsigned int n = 0; n < n_size; n++) {
B[n] = C[n];
}
}
}
-static uint64_t mlp_host_sum(uint64_t n_size) {
- uint64_t sum = 0;
- for (uint64_t m = 0; m < n_size; m++){
- sum += B[m];
- }
- return sum;
+static uint64_t mlp_host_sum(uint64_t n_size)
+{
+ uint64_t sum = 0;
+ for (uint64_t m = 0; m < n_size; m++) {
+ sum += B[m];
+ }
+ return sum;
}
// Params ---------------------------------------------------------------------
typedef struct Params {
- int input_size_n;
- int input_size_m;
- int n_reps;
+ int input_size_n;
+ int input_size_m;
+ int n_reps;
#if NUMA
- struct bitmask* bitmask;
- int numa_node_cpu;
+ struct bitmask *bitmask;
+ int numa_node_cpu;
#endif
-}Params;
+} Params;
-void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n");
+void usage()
+{
+ fprintf(stderr, "\nUsage: ./program [options]" "\n");
}
- struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.input_size_n = 8192;
- p.input_size_m = 20480;
- p.n_reps = 100;
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.input_size_n = 8192;
+ p.input_size_m = 20480;
+ p.n_reps = 100;
#if NUMA
- p.bitmask = NULL;
- p.numa_node_cpu = -1;
+ p.bitmask = NULL;
+ p.numa_node_cpu = -1;
#endif
- int opt;
- while((opt = getopt(argc, argv, "e:n:m:A:C:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'e': p.n_reps = atoi(optarg); break;
- case 'n': p.input_size_n = atoi(optarg); break;
- case 'm': p.input_size_m = atoi(optarg); break;
+ int opt;
+ while ((opt = getopt(argc, argv, "e:n:m:A:C:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ case 'n':
+ p.input_size_n = atoi(optarg);
+ break;
+ case 'm':
+ p.input_size_m = atoi(optarg);
+ break;
#if NUMA
- case 'A': p.bitmask = numa_parse_nodestring(optarg); break;
- case 'C': p.numa_node_cpu = atoi(optarg); break;
+ case 'A':
+ p.bitmask = numa_parse_nodestring(optarg);
+ break;
+ case 'C':
+ p.numa_node_cpu = atoi(optarg);
+ break;
#endif
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
- return p;
- }
+ return p;
+}
/**
* @brief Main of the Host Application.
*/
- int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
- struct Params p = input_params(argc, argv);
- uint64_t n_size = p.input_size_n;
- uint64_t m_size = p.input_size_m;
+ struct Params p = input_params(argc, argv);
+ uint64_t n_size = p.input_size_n;
+ uint64_t m_size = p.input_size_m;
#if WITH_BENCHMARK
- Timer timer;
+ Timer timer;
#endif
#if NUMA
- if (p.bitmask) {
- numa_set_membind(p.bitmask);
- numa_free_nodemask(p.bitmask);
- }
- A = numa_alloc(NUM_LAYERS * sizeof(T*));
- for(int l = 0; l < NUM_LAYERS; l++) {
- A[l] = numa_alloc(n_size*m_size*sizeof(unsigned int));
- }
- B = numa_alloc(m_size*sizeof(unsigned int));
- C = numa_alloc(m_size*sizeof(unsigned int));
-
- mp_pages[0] = A;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_data = mp_status[0];
- }
-
- numa_node_cpu = p.numa_node_cpu;
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
+ if (p.bitmask) {
+ numa_set_membind(p.bitmask);
+ numa_free_nodemask(p.bitmask);
+ }
+ A = numa_alloc(NUM_LAYERS * sizeof(T *));
+ for (int l = 0; l < NUM_LAYERS; l++) {
+ A[l] = numa_alloc(n_size * m_size * sizeof(unsigned int));
+ }
+ B = numa_alloc(m_size * sizeof(unsigned int));
+ C = numa_alloc(m_size * sizeof(unsigned int));
+
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_data = mp_status[0];
+ }
+
+ numa_node_cpu = p.numa_node_cpu;
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
#else
- A = malloc(NUM_LAYERS * sizeof(T*));
- for(int l = 0; l < NUM_LAYERS; l++) {
- A[l] = malloc(n_size*m_size*sizeof(unsigned int));
- }
- B = malloc(m_size*sizeof(unsigned int));
- C = malloc(m_size*sizeof(unsigned int));
+ A = malloc(NUM_LAYERS * sizeof(T *));
+ for (int l = 0; l < NUM_LAYERS; l++) {
+ A[l] = malloc(n_size * m_size * sizeof(unsigned int));
+ }
+ B = malloc(m_size * sizeof(unsigned int));
+ C = malloc(m_size * sizeof(unsigned int));
#endif
- // Create an input file with arbitrary data.
- init_data(A, m_size, n_size);
+ // Create an input file with arbitrary data.
+ init_data(A, m_size, n_size);
- for (int i = 0; i < p.n_reps; i++) {
- init_B(B, n_size);
+ for (int i = 0; i < p.n_reps; i++) {
+ init_B(B, n_size);
- start(&timer, 0, 0);
- mlp_host(C, A, B, n_size, m_size);
- stop(&timer, 0);
+ start(&timer, 0, 0);
+ mlp_host(C, A, B, n_size, m_size);
+ stop(&timer, 0);
#if WITH_BENCHMARK
- unsigned int nr_threads = 0;
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
+ nr_threads++;
- printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu",
- nr_threads, XSTR(T), n_size * m_size);
+ printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu",
+ nr_threads, XSTR(T), n_size * m_size);
#if NUMA
- printf(" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d",
- numa_node_data, numa_node_cpu, numa_distance(numa_node_data, numa_node_cpu));
+ printf
+ (" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d",
+ numa_node_data, numa_node_cpu,
+ numa_distance(numa_node_data, numa_node_cpu));
#endif
- printf(" | throughput_MBps=%f throughput_MOpps=%f",
- n_size * m_size * sizeof(T) / timer.time[0],
- n_size * m_size / timer.time[0]);
- printf(" latency_us=%f\n",
- timer.time[0]);
-#endif // WITH_BENCHMARK
- }
+ printf(" | throughput_MBps=%f throughput_MOpps=%f",
+ n_size * m_size * sizeof(T) / timer.time[0],
+ n_size * m_size / timer.time[0]);
+ printf(" latency_us=%f\n", timer.time[0]);
+#endif // WITH_BENCHMARK
+ }
#if NOP_SYNC
- for(int rep = 0; rep < 200000; rep++) {
- asm volatile("nop" ::);
- }
+ for (int rep = 0; rep < 200000; rep++) {
+ asm volatile ("nop"::);
+ }
#endif
- uint32_t sum = mlp_host_sum(n_size);
+ uint32_t sum = mlp_host_sum(n_size);
- printf("SUM = %d \n", sum);
+ printf("SUM = %d \n", sum);
#if NUMA
- for(int l = 0; l < NUM_LAYERS; l++) {
- numa_free(A[l], n_size*m_size*sizeof(unsigned int));
- }
- numa_free(A, NUM_LAYERS * sizeof(T*));
- numa_free(B, m_size*sizeof(unsigned int));
- numa_free(C, m_size*sizeof(unsigned int));
+ for (int l = 0; l < NUM_LAYERS; l++) {
+ numa_free(A[l], n_size * m_size * sizeof(unsigned int));
+ }
+ numa_free(A, NUM_LAYERS * sizeof(T *));
+ numa_free(B, m_size * sizeof(unsigned int));
+ numa_free(C, m_size * sizeof(unsigned int));
#else
- for(int l = 0; l < NUM_LAYERS; l++) {
- free(A[l]);
- }
- free(A);
- free(B);
- free(C);
+ for (int l = 0; l < NUM_LAYERS; l++) {
+ free(A[l]);
+ }
+ free(A);
+ free(B);
+ free(C);
#endif
- return 0;
+ return 0;
}