Diffstat (limited to 'MLP/baselines')
-rw-r--r--  MLP/baselines/cpu/Makefile        4
-rw-r--r--  MLP/baselines/cpu/README          9
-rw-r--r--  MLP/baselines/cpu/mlp_openmp.c  161
-rw-r--r--  MLP/baselines/gpu/Makefile        5
-rw-r--r--  MLP/baselines/gpu/README          9
-rw-r--r--  MLP/baselines/gpu/mlp.cu        208
6 files changed, 396 insertions, 0 deletions
diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile
new file mode 100644
index 0000000..581897e
--- /dev/null
+++ b/MLP/baselines/cpu/Makefile
@@ -0,0 +1,4 @@
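+# -fopenmp is required for the '#pragma omp' directives in mlp_openmp.c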
+all:
+	gcc mlp_openmp.c -o mlp_openmp -fopenmp -std=c99
+run:
+	./mlp_openmp
diff --git a/MLP/baselines/cpu/README b/MLP/baselines/cpu/README
new file mode 100644
index 0000000..b928195
--- /dev/null
+++ b/MLP/baselines/cpu/README
@@ -0,0 +1,9 @@
+Multilayer Perceptron (MLP)
+
+Compilation instructions
+
+ make
+
+Execution instructions
+
+ ./mlp_openmp
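+
+The number of OpenMP threads can be set with the standard environment
+variable, e.g.:
+
+    OMP_NUM_THREADS=8 ./mlp_openmp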
diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c
new file mode 100644
index 0000000..ef478c1
--- /dev/null
+++ b/MLP/baselines/cpu/mlp_openmp.c
@@ -0,0 +1,161 @@
+/**
+* @file mlp_openmp.c
+* @brief Host baseline of the Multilayer Perceptron (MLP) benchmark, parallelized with OpenMP.
+*
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <assert.h>
+#include <stdint.h>
+#include "../../support/timer.h"
+#include "../../support/common.h"
+
+T** A; // weight matrices, one per layer
+T* B;  // layer input vector (each layer's output is copied back into it)
+T* C;  // layer output vector
+
+// Create input arrays: weight matrices A (~98% zeros) and input vector B (~96% zeros)
+static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){
+    for (unsigned int l = 0; l < NUM_LAYERS; l++)
+        for (unsigned int i = 0; i < m_size * n_size; i++){
+            if(i % 100 < 98){
+                A[l][i] = 0;
+            }else{
+                A[l][i] = (l+i) % 2;
+            }
+        }
+    // Fill all m_size entries of B: the first layer reads the full vector
+    for (unsigned int i = 0; i < m_size; i++){
+        if(i % 50 < 48){
+            B[i] = 0;
+        }
+        else{
+            B[i] = i % 2;
+        }
+    }
+}
+
+// Compute the MLP forward pass on the host: each layer is a GEMV followed by
+// a ReLU, and the layer output C becomes the next layer's input B
+static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
+    for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
+        // Zero the whole output buffer (n_size >= m_size entries are copied back below)
+        for (unsigned int m = 0; m < n_size; m++){
+            C[m] = 0;
+        }
+        #pragma omp parallel for
+        for (unsigned int m = 0; m < m_size; m++){
+            for (unsigned int n = 0; n < n_size; n++){
+                C[m] += A[nl][m * n_size + n] * B[n];
+            }
+            C[m] = max(0, C[m]); // ReLU
+        }
+        for (unsigned int n = 0; n < n_size; n++){
+            B[n] = C[n];
+        }
+    }
+}
+
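+// Sum the final output vector; after the last layer it is stored back in B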
+static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) {
+    uint64_t sum = 0;
+    for (uint64_t m = 0; m < n_size; m++){
+        sum += B[m];
+    }
+    return sum;
+}
+
+// Params ---------------------------------------------------------------------
+typedef struct Params {
+    char* dpu_type;
+    int   nr_of_ranks;
+    int   input_size_n;
+    int   input_size_m;
+    int   n_warmup;
+    int   n_reps;
+} Params;
+
+void usage() {
+    fprintf(stderr,
+        "\nUsage: ./mlp_openmp [options]"
+        "\n"
+        "\nGeneral options:"
+        "\n    -h        help"
+        "\n    -d <D>    DPU type (default=fsim)"
+        "\n    -r <R>    # of ranks (default=1)"
+        "\n"
+        "\nBenchmark-specific options:"
+        "\n    -n <N>    input size n (default=512 elements)"
+        "\n    -m <M>    input size m (default=512 elements)"
+        "\n");
+}
+
+struct Params input_params(int argc, char **argv) {
+    struct Params p;
+    p.dpu_type = "fsim";
+    p.nr_of_ranks = 1;
+    p.input_size_n = 1 << 9;
+    p.input_size_m = 1 << 9;
+    p.n_warmup = 2;
+    p.n_reps = 3;
+
+    int opt;
+    while((opt = getopt(argc, argv, "hd:r:n:m:")) >= 0) {
+        switch(opt) {
+        case 'h':
+            usage();
+            exit(0);
+            break;
+        case 'd': p.dpu_type = optarg; break;
+        case 'r': p.nr_of_ranks = atoi(optarg); break;
+        case 'n': p.input_size_n = atoi(optarg); break;
+        case 'm': p.input_size_m = atoi(optarg); break;
+        default:
+            fprintf(stderr, "\nUnrecognized option!\n");
+            usage();
+            exit(0);
+        }
+    }
+    assert(p.nr_of_ranks > 0 && "Invalid # of ranks!");
+
+    return p;
+}
+
+/**
+* @brief Main of the Host Application.
+*/
+int main(int argc, char **argv) {
+
+    struct Params p = input_params(argc, argv);
+    // The sizes below are fixed for this baseline; the parsed
+    // p.input_size_n/m values are currently not used
+    uint64_t n_size = 8192;
+    uint64_t m_size = 20480;
+
+    Timer timer;
+    A = malloc(NUM_LAYERS * sizeof(T*));
+    for(int l = 0; l < NUM_LAYERS; l++)
+        A[l] = malloc(n_size * m_size * sizeof(T));
+    B = malloc(m_size * sizeof(T));
+    C = malloc(m_size * sizeof(T));
+
+    // Initialize the weights and the input vector with arbitrary data
+    init_data(A, B, m_size, n_size);
+
+    start(&timer, 0, 1);
+    mlp_host(C, A, B, n_size, m_size);
+    stop(&timer, 0);
+
+    uint64_t sum = mlp_host_sum(n_size, m_size);
+
+    printf("Kernel ");
+    print(&timer, 0, 1);
+    printf("\n");
+
+    printf("SUM = %lu \n", (unsigned long)sum);
+
+    for(int l = 0; l < NUM_LAYERS; l++)
+        free(A[l]);
+    free(A);
+    free(B);
+    free(C);
+
+    return 0;
+}
diff --git a/MLP/baselines/gpu/Makefile b/MLP/baselines/gpu/Makefile
new file mode 100644
index 0000000..69ee49c
--- /dev/null
+++ b/MLP/baselines/gpu/Makefile
@@ -0,0 +1,5 @@
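+# Assumes the CUDA toolkit is installed under /usr/local/cuda; adjust the nvcc path if needed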
+all:
+	/usr/local/cuda/bin/nvcc mlp.cu -I/usr/local/cuda/include -lm -o mlp
+
+clean:
+	rm mlp
diff --git a/MLP/baselines/gpu/README b/MLP/baselines/gpu/README
new file mode 100644
index 0000000..253c8e3
--- /dev/null
+++ b/MLP/baselines/gpu/README
@@ -0,0 +1,9 @@
+Multilayer Perceptron (MLP)
+
+Compilation instructions
+
+ make
+
+Execution instructions
+
+ ./mlp
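+
+The input sizes (n = 8192, m = 20480) and the number of layers (NUM_LAYERS,
+taken from support/common.h) are fixed at compile time in mlp.cu.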
diff --git a/MLP/baselines/gpu/mlp.cu b/MLP/baselines/gpu/mlp.cu
new file mode 100644
index 0000000..c912d10
--- /dev/null
+++ b/MLP/baselines/gpu/mlp.cu
@@ -0,0 +1,208 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <cuda.h>
+#include "../../support/common.h"
+
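+// Threads per block; gemv launches one block per output element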
+#define THREAD 128
+
+__global__ void gemv(int m, int n, T *adim, T *b, T *d_ans);
+
+void cgemv(int m, int n, T *adim, T *b, T *d_ans);
+
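+// Wall-clock timing helper (the kernel measurements below use CUDA events)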
+double gettime()
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1.0e-6;
+}
+
+int main(int argc, char **argv)
+{
+    /* for CPU */
+    int i, j;
+    T **bdim;
+    T *c, *ans, *h_ans, *h_c;
+    int n = 8192;
+    int m = 20480;
+
+    bdim = (T**) malloc(NUM_LAYERS * sizeof(T*));
+    for(int l = 0; l < NUM_LAYERS; l++)
+        bdim[l] = (T*) malloc(sizeof(T)*m*n);
+    c = (T*) malloc(sizeof(T)*n);
+    h_c = (T*) malloc(sizeof(T)*n);
+    ans = (T*) malloc(sizeof(T)*m);
+    h_ans = (T*) malloc(sizeof(T)*m);
+
+    /* for GPU */
+    // d_c (size n) and d_ans (size m) double as ping-pong layer input/output
+    T *d_bdim;
+    T *d_c, *d_ans;
+    cudaMalloc((void **)&d_bdim, sizeof(T)*m*n);
+    cudaMalloc((void **)&d_c, sizeof(T)*n);
+    cudaMalloc((void **)&d_ans, sizeof(T)*m);
+
+    // Input vector: ~96% zeros, the rest alternating 0/1
+    for(i = 0; i < n; i++)
+    {
+        if(i % 50 < 48)
+        {
+            c[i] = 0;
+            h_c[i] = 0;
+        }
+        else
+        {
+            c[i] = i % 2;
+            h_c[i] = i % 2;
+        }
+    }
+    // Weight matrices: ~98% zeros, the rest alternating 0/1
+    for(int l = 0; l < NUM_LAYERS; l++)
+        for(i = 0; i < n; i++)
+        {
+            for(j = 0; j < m; j++){
+                if(j % 100 < 98)
+                    bdim[l][i*m+j] = 0;
+                else
+                    bdim[l][i*m+j] = (l + i) % 2;
+            }
+        }
+
+    for(j = 0; j < m; j++){
+        ans[j] = 0;
+        h_ans[j] = 0;
+    }
+    // Computation on the host for verification. The input and output buffers
+    // are ping-ponged (and the matrix shape swapped) each layer so that a
+    // layer never reads and writes the same vector in place.
+    T* vector = c;
+    T* output = ans;
+    T* matrix;
+    int mm = m;
+    int nn = n;
+    for(int l = 0; l < NUM_LAYERS; l++){
+        matrix = bdim[l];
+        cgemv(mm, nn, matrix, vector, output);
+        T* tv = vector; vector = output; output = tv;
+        int ts = mm; mm = nn; nn = ts;
+    }
+    // With an odd NUM_LAYERS the final host result ends up in ans
+
+    // Event creation
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    float time1 = 0;
+    float time2 = 0;
+    // Upload the initial input vector and the zeroed output buffer
+    cudaMemcpy(d_ans, h_ans, sizeof(T)*m, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_c, h_c, sizeof(T)*n, cudaMemcpyHostToDevice);
+
+    vector = d_c;
+    output = d_ans;
+    mm = m;
+    nn = n;
+    for(int l = 0; l < NUM_LAYERS; l++){
+        cudaMemcpy(d_bdim, bdim[l], sizeof(T)*m*n, cudaMemcpyHostToDevice);
+        matrix = d_bdim;
+        // Start timer
+        cudaEventRecord(start, 0);
+        gemv<<<mm, THREAD>>>(mm, nn, matrix, vector, output);
+        // End timer
+        cudaEventRecord(stop, 0);
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&time2, start, stop);
+        time1 += time2;
+        // Ping-pong buffers and shapes, mirroring the host loop above;
+        // with an odd NUM_LAYERS the final device result ends up in d_ans
+        T* tv = vector; vector = output; output = tv;
+        int ts = mm; mm = nn; nn = ts;
+    }
+
+    cudaMemcpy(h_ans, d_ans, sizeof(T)*m, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_c, d_c, sizeof(T)*n, cudaMemcpyDeviceToHost);
+
+    // Compare the device results against the host reference
+    for(i = 0; i < m; i++)
+    {
+        if(ans[i] != h_ans[i])
+            printf("ERROR in Ans %d -> %d -- %d\n", i, ans[i], h_ans[i]);
+    }
+
+    for(i = 0; i < n; i++)
+    {
+        if(c[i] != h_c[i])
+            printf("ERROR in C %d -> %d -- %d\n", i, c[i], h_c[i]);
+    }
+    printf("Execution time = %f ms\n", time1);
+
+    for(int l = 0; l < NUM_LAYERS; l++)
+        free(bdim[l]);
+    free(bdim);
+    free(c);
+    free(ans);
+    free(h_c);
+    free(h_ans);
+    cudaFree(d_bdim);
+    cudaFree(d_c);
+    cudaFree(d_ans);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    return 0;
+}
+
+__global__ void gemv(int m, int n, T* adim, T* b, T* d_ans)
+{
+    int i;
+    int div = n/THREAD;
+    __shared__ T tmp[THREAD];
+
+    // Each thread accumulates a strided partial dot product of row blockIdx.x with b
+    tmp[threadIdx.x] = 0;
+
+    for(i = 0; i < div; i++){
+        tmp[threadIdx.x] += adim[blockIdx.x*n + i*THREAD + threadIdx.x] * b[i*THREAD + threadIdx.x];
+    }
+    // Remainder columns when n is not a multiple of THREAD
+    if(threadIdx.x < n%THREAD)
+        tmp[threadIdx.x] += adim[blockIdx.x*n + THREAD*div + threadIdx.x] * b[THREAD*div + threadIdx.x];
+
+    __syncthreads();
+
+    // Tree reduction in shared memory. The condition guards only the addition:
+    // every thread must reach each __syncthreads() barrier.
+    for(i = THREAD / 2; i > 0; i = i / 2)
+    {
+        if(threadIdx.x < i)
+            tmp[threadIdx.x] += tmp[threadIdx.x + i];
+        __syncthreads();
+    }
+
+    // Thread 0 applies the ReLU and writes this row's output element
+    if(threadIdx.x == 0)
+        d_ans[blockIdx.x] = max(0, tmp[0]);
+}
+
+// Sequential host GEMV + ReLU, used as the reference for verification
+void cgemv(int m, int n, T *adim, T *b, T *d_ans)
+{
+    int i, j;
+
+    for(i = 0; i < m; i++){
+        d_ans[i] = 0; // the output buffer may hold stale data from a previous layer
+        for(j = 0; j < n; j++)
+            d_ans[i] += adim[i*n+j] * b[j];
+        d_ans[i] = max(0, d_ans[i]);
+    }
+}