Diffstat (limited to 'MLP/baselines')
 MLP/baselines/cpu/Makefile     |   4 +
 MLP/baselines/cpu/README       |   9 +
 MLP/baselines/cpu/mlp_openmp.c | 161 +
 MLP/baselines/gpu/Makefile     |   5 +
 MLP/baselines/gpu/README       |   9 +
 MLP/baselines/gpu/mlp.cu       | 208 +
 6 files changed, 396 insertions(+), 0 deletions(-)
diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile
new file mode 100644
index 0000000..581897e
--- /dev/null
+++ b/MLP/baselines/cpu/Makefile
@@ -0,0 +1,4 @@
+all:
+	gcc mlp_openmp.c -o mlp_openmp -fopenmp -std=c99
+run:
+	./mlp_openmp
diff --git a/MLP/baselines/cpu/README b/MLP/baselines/cpu/README
new file mode 100644
index 0000000..b928195
--- /dev/null
+++ b/MLP/baselines/cpu/README
@@ -0,0 +1,9 @@
+Multilayer Perceptron (MLP)
+
+Compilation instructions
+
+	make
+
+Execution instructions
+
+	./mlp_openmp
diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c
new file mode 100644
index 0000000..ef478c1
--- /dev/null
+++ b/MLP/baselines/cpu/mlp_openmp.c
@@ -0,0 +1,161 @@
+/**
+* @file mlp_openmp.c
+* @brief Host application for the MLP CPU baseline (OpenMP).
+*
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <assert.h>
+#include <stdint.h>
+#include "../../support/timer.h"
+#include "../../support/common.h"
+
+T** A;
+T* B;
+T* C;
+
+// Create input arrays
+static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){
+	for (unsigned int l = 0; l < NUM_LAYERS; l++)
+		for (unsigned int i = 0; i < m_size * n_size; i++){
+			if(i % 100 < 98){
+				A[l][i] = 0;
+			}else{
+				A[l][i] = (l + i) % 2;
+			}
+		}
+	for (unsigned int i = 0; i < n_size; i++){
+		if(i % 50 < 48){
+			B[i] = 0;
+		}
+		else{
+			B[i] = i % 2;
+		}
+	}
+}
+
+// Compute output in the host
+static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
+	for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
+		for (unsigned int m = 0; m < m_size; m++){
+			C[m] = 0;
+		}
+		#pragma omp parallel for
+		for (unsigned int m = 0; m < m_size; m++){
+			for (unsigned int n = 0; n < n_size; n++){
+				C[m] += A[nl][m * n_size + n] * B[n];
+			}
+			C[m] = max(0, C[m]); // ReLU
+		}
+		// The output of this layer is the input of the next one
+		for (unsigned int n = 0; n < n_size; n++){
+			B[n] = C[n];
+		}
+	}
+}
+
+static uint64_t mlp_host_sum(uint64_t n_size) {
+	uint64_t sum = 0;
+	for (uint64_t m = 0; m < n_size; m++){
+		sum += B[m];
+	}
+	return sum;
+}
+
+// Params ---------------------------------------------------------------------
+typedef struct Params {
+	char* dpu_type;
+	int   nr_of_ranks;
+	int   input_size_n;
+	int   input_size_m;
+	int   n_warmup;
+	int   n_reps;
+} Params;
+
+void usage() {
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -d <D>    DPU type (default=fsim)"
+		"\n    -r <R>    # of ranks (default=1)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -n <N>    input size N (default=8192 elements)"
+		"\n    -m <M>    input size M (default=20480 elements)"
+		"\n");
+}
+
+struct Params input_params(int argc, char **argv) {
+	struct Params p;
+	p.dpu_type     = "fsim";
+	p.nr_of_ranks  = 1;
+	p.input_size_n = 8192;
+	p.input_size_m = 20480;
+	p.n_warmup     = 2;
+	p.n_reps       = 3;
+
+	int opt;
+	while((opt = getopt(argc, argv, "hd:r:n:m:")) >= 0) {
+		switch(opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'd': p.dpu_type     = optarg;       break;
+		case 'r': p.nr_of_ranks  = atoi(optarg); break;
+		case 'n': p.input_size_n = atoi(optarg); break;
+		case 'm': p.input_size_m = atoi(optarg); break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);
+		}
+	}
+	assert(p.nr_of_ranks > 0 && "Invalid # of ranks!");
+
+	return p;
+}
+
+/**
+* @brief Main of the Host Application.
+*/
+int main(int argc, char **argv) {
+	struct Params p = input_params(argc, argv);
+	uint64_t n_size = p.input_size_n;
+	uint64_t m_size = p.input_size_m;
+
+	Timer timer;
+	A = malloc(NUM_LAYERS * sizeof(T*));
+	for(int l = 0; l < NUM_LAYERS; l++)
+		A[l] = malloc(n_size * m_size * sizeof(T));
+	B = malloc(m_size * sizeof(T));
+	C = malloc(m_size * sizeof(T));
+
+	// Fill the weight matrices and the input vector with arbitrary data.
+	init_data(A, B, m_size, n_size);
+
+	start(&timer, 0, 1);
+	mlp_host(C, A, B, m_size, n_size);
+	stop(&timer, 0);
+
+	uint64_t sum = mlp_host_sum(n_size);
+	printf("Kernel ");
+	print(&timer, 0, 1);
+	printf("\n");
+
+	printf("SUM = %lu \n", (unsigned long) sum);
+
+	for(int l = 0; l < NUM_LAYERS; l++)
+		free(A[l]);
+	free(A);
+	free(B);
+	free(C);
+
+	return 0;
+}
diff --git a/MLP/baselines/gpu/Makefile b/MLP/baselines/gpu/Makefile
new file mode 100644
index 0000000..69ee49c
--- /dev/null
+++ b/MLP/baselines/gpu/Makefile
@@ -0,0 +1,5 @@
+all:
+	/usr/local/cuda/bin/nvcc mlp.cu -I/usr/local/cuda/include -lm -o mlp
+
+clean:
+	rm -f mlp
diff --git a/MLP/baselines/gpu/README b/MLP/baselines/gpu/README
new file mode 100644
index 0000000..253c8e3
--- /dev/null
+++ b/MLP/baselines/gpu/README
@@ -0,0 +1,9 @@
+Multilayer Perceptron (MLP)
+
+Compilation instructions
+
+	make
+
+Execution instructions
+
+	./mlp
diff --git a/MLP/baselines/gpu/mlp.cu b/MLP/baselines/gpu/mlp.cu
new file mode 100644
index 0000000..c912d10
--- /dev/null
+++ b/MLP/baselines/gpu/mlp.cu
@@ -0,0 +1,208 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <cuda.h>
+#include "../../support/common.h"
+
+#define THREAD 128
+
+__global__ void gemv(int m, int n, T *adim, T *b, T *d_ans);
+
+void cgemv(int m, int n, T *adim, T *b, T *d_ans);
+
+// Wall-clock helper (kept for reference; kernel time uses CUDA events)
+double gettime()
+{
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+	return tv.tv_sec + (double)tv.tv_usec*1.0e-6;
+}
+
+int main(int argc, char **argv)
+{
+	/* for CPU */
+	int i, j;
+	T **bdim;
+	T *c, *ans, *h_ans, *h_c;
+	int n = 8192;
+	int m = 20480;
+
+	bdim = (T**) malloc(NUM_LAYERS * sizeof(T*));
+	for(int l = 0; l < NUM_LAYERS; l++)
+		bdim[l] = (T*)malloc(sizeof(T)*m*n);
+	c     = (T*)malloc(sizeof(T)*n);
+	h_c   = (T*)malloc(sizeof(T)*n);
+	ans   = (T*)malloc(sizeof(T)*m);
+	h_ans = (T*)malloc(sizeof(T)*m);
+
+	/* for GPU */
+	T *d_bdim;
+	T *d_c, *d_ans, *d_tmp;
+	cudaMalloc((void **)&d_bdim, sizeof(T)*m*n);
+	cudaMalloc((void **)&d_c,    sizeof(T)*n);
+	cudaMalloc((void **)&d_ans,  sizeof(T)*m);
+	cudaMalloc((void **)&d_tmp,  sizeof(T)*m);
+
+	for(i = 0; i < n; i++)
+	{
+		if(i % 50 < 48)
+		{
+			c[i]   = 0;
+			h_c[i] = 0;
+		}
+		else
+		{
+			c[i]   = i % 2;
+			h_c[i] = i % 2;
+		}
+	}
+	for(int l = 0; l < NUM_LAYERS; l++)
+		for(i = 0; i < n; i++)
+		{
+			for(j = 0; j < m; j++){
+				if(j % 100 < 98)
+				{
+					bdim[l][i*m+j] = 0;
+				}
+				else
+				{
+					bdim[l][i*m+j] = (l + i) % 2;
+				}
+			}
+		}
+
+	for(j = 0; j < m; j++){
+		ans[j]   = 0;
+		h_ans[j] = 0;
+	}
+
+	// Computation on the host for verification. cgemv is safe for
+	// aliased input/output, so each layer's output can be fed back
+	// in as the next layer's input.
+	T* vector = c;
+	T* output = ans;
+	T* matrix;
+	int mm = m;
+	int nn = n;
+	for(int l = 0; l < NUM_LAYERS; l++){
+		matrix = bdim[l];
+		cgemv(mm, nn, matrix, vector, output);
+		vector = output;
+		// Swap the dimensions: the layers alternate between the two
+		// transposed views of the weight buffer.
+		int t = mm; mm = nn; nn = t;
+	}
+
+	// Event creation
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	float time1 = 0;
+	float time2 = 0;
+	cudaMemcpy(d_ans, h_ans, sizeof(T)*m, cudaMemcpyHostToDevice);
+	cudaMemcpy(d_c,   h_c,   sizeof(T)*n, cudaMemcpyHostToDevice);
+
+	// Ping-pong between d_ans and d_tmp so that a layer never reads
+	// and writes the same device buffer (the kernel provides no
+	// ordering between blocks).
+	T *d_in  = d_c;
+	T *d_out = d_ans;
+	mm = m;
+	nn = n;
+	for(int l = 0; l < NUM_LAYERS; l++){
+		cudaMemcpy(d_bdim, bdim[l], sizeof(T)*m*n, cudaMemcpyHostToDevice);
+		// Start timer
+		cudaEventRecord( start, 0 );
+		gemv<<<mm, THREAD>>>(mm, nn, d_bdim, d_in, d_out);
+		// End timer
+		cudaEventRecord( stop, 0 );
+		cudaEventSynchronize( stop );
+		cudaEventElapsedTime( &time2, start, stop );
+		time1 += time2;
+		d_in  = d_out;
+		d_out = (d_out == d_ans) ? d_tmp : d_ans;
+		int t = mm; mm = nn; nn = t;
+	}
+
+	cudaMemcpy(h_ans, d_in, sizeof(T)*m, cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_c,   d_c,  sizeof(T)*n, cudaMemcpyDeviceToHost);
+
+	// Compare the GPU results against the host reference
+	for(i = 0; i < m; i++)
+	{
+		if(ans[i] != h_ans[i])
+			printf("ERROR in Ans %d -> %d -- %d\n", i, ans[i], h_ans[i]);
+	}
+	for(i = 0; i < n; i++)
+	{
+		if(c[i] != h_c[i])
+			printf("ERROR in C %d -> %d -- %d\n", i, c[i], h_c[i]);
+	}
+	printf("Execution time = %f ms\n", time1);
+
+	for(int l = 0; l < NUM_LAYERS; l++)
+		free(bdim[l]);
+	free(bdim);
+	free(c);
+	free(ans);
+	free(h_c);
+	free(h_ans);
+	cudaFree(d_bdim);
+	cudaFree(d_c);
+	cudaFree(d_ans);
+	cudaFree(d_tmp);
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+
+	return 0;
+}
+
+__global__ void gemv(int m, int n, T* adim, T* b, T* d_ans)
+{
+	int i;
+	int div = n / THREAD;
+	__shared__ T tmp[THREAD];
+
+	// Each block reduces one row of adim against the input vector b.
+	tmp[threadIdx.x] = 0;
+
+	for(i = 0; i < div; i++){
+		tmp[threadIdx.x] += adim[blockIdx.x*n + i*THREAD + threadIdx.x] * b[i*THREAD + threadIdx.x];
+	}
+	// Leftover columns when n is not a multiple of THREAD
+	if(threadIdx.x < n % THREAD)
+		tmp[threadIdx.x] += adim[blockIdx.x*n + THREAD*div + threadIdx.x] * b[THREAD*div + threadIdx.x];
+
+	__syncthreads();
+
+	// Tree reduction in shared memory
+	for(i = THREAD / 2; i > 0; i >>= 1)
+	{
+		if(threadIdx.x < i)
+			tmp[threadIdx.x] += tmp[threadIdx.x + i];
+		__syncthreads();
+	}
+
+	// ReLU on the reduced dot product
+	if(threadIdx.x == 0)
+		d_ans[blockIdx.x] = max(0, tmp[0]);
+}
+
+void cgemv(int m, int n, T *adim, T *b, T *d_ans)
+{
+	int i, j;
+	// Scratch buffer so the routine stays correct when b and d_ans
+	// alias each other (the layer loop above reuses one buffer as
+	// both input and output).
+	T *tmp = (T*)malloc(sizeof(T)*m);
+
+	for(i = 0; i < m; i++){
+		tmp[i] = 0;
+		for(j = 0; j < n; j++)
+			tmp[i] += adim[i*n+j] * b[j];
+		tmp[i] = max(0, tmp[i]);
+	}
+	memcpy(d_ans, tmp, sizeof(T)*m);
+	free(tmp);
+}
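
Note: both baselines include ../../support/common.h for the element type T, the layer count NUM_LAYERS, and the max() helper used as the ReLU clamp; that header is not part of this diff. To build the files above in isolation, a minimal stand-in along the following lines should suffice. The concrete values here (32-bit integers, three layers) are assumptions inferred from how the code uses the names, not definitions taken from the suite.

/* Hypothetical stand-in for ../../support/common.h.
 * The real header ships with the benchmark suite; these definitions
 * only mirror how the baselines above use it. */
#ifndef COMMON_H_
#define COMMON_H_

#include <stdint.h>

typedef int32_t T;             /* element type used by both baselines (assumed) */
#define NUM_LAYERS 3           /* number of MLP layers (assumed) */
#define max(x, y) (((x) > (y)) ? (x) : (y))   /* ReLU clamp in mlp_host/gemv/cgemv */

#endif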
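Similarly, the Timer/start/stop/print calls in mlp_openmp.c come from ../../support/timer.h, which is also outside this diff. A sketch that matches those call sites, with deliberately simplified semantics (start() always resets its interval and ignores the rep flag; print() reports milliseconds divided by its rep argument), could look like this:

/* Hypothetical stand-in for ../../support/timer.h, inferred from the
 * call sites in mlp_openmp.c; the real header may accumulate across
 * repetitions differently. */
#ifndef TIMER_H_
#define TIMER_H_

#include <stdio.h>
#include <sys/time.h>

typedef struct Timer {
	struct timeval start_time[8];
	double         time_ms[8];
} Timer;

static inline void start(Timer *t, int i, int rep) {
	(void) rep;                     /* accumulation flag, unused in this sketch */
	t->time_ms[i] = 0.0;
	gettimeofday(&t->start_time[i], NULL);
}

static inline void stop(Timer *t, int i) {
	struct timeval now;
	gettimeofday(&now, NULL);
	t->time_ms[i] += (now.tv_sec  - t->start_time[i].tv_sec)  * 1000.0 +
	                 (now.tv_usec - t->start_time[i].tv_usec) / 1000.0;
}

static inline void print(Timer *t, int i, int rep) {
	printf("Time (ms): %f", t->time_ms[i] / rep);
}

#endif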