Diffstat (limited to 'MLP/baselines')
 MLP/baselines/cpu/Makefile     |   4 +
 MLP/baselines/cpu/README       |   9 +
 MLP/baselines/cpu/mlp_openmp.c | 161 +
 MLP/baselines/gpu/Makefile     |   5 +
 MLP/baselines/gpu/README       |   9 +
 MLP/baselines/gpu/mlp.cu       | 208 +
 6 files changed, 396 insertions(+), 0 deletions(-)
diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile
new file mode 100644
index 0000000..581897e
--- /dev/null
+++ b/MLP/baselines/cpu/Makefile
@@ -0,0 +1,4 @@
+all:
+	gcc mlp_openmp.c -o mlp_openmp -fopenmp -std=c99
+run:
+	./mlp_openmp
diff --git a/MLP/baselines/cpu/README b/MLP/baselines/cpu/README
new file mode 100644
index 0000000..b928195
--- /dev/null
+++ b/MLP/baselines/cpu/README
@@ -0,0 +1,9 @@
+Multilayer Perceptron (MLP)
+
+Compilation instructions
+
+	make
+
+Execution instructions
+
+	./mlp_openmp
diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c
new file mode 100644
index 0000000..ef478c1
--- /dev/null
+++ b/MLP/baselines/cpu/mlp_openmp.c
@@ -0,0 +1,161 @@
+/**
+* @file mlp_openmp.c
+* @brief Host application for the MLP CPU baseline (OpenMP).
+*
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <assert.h>
+#include <stdint.h>
+#include "../../support/timer.h"
+#include "../../support/common.h"
+
+T** A;
+T* B;
+T* C;
+
+// Create input arrays
+static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){
+	for (unsigned int l = 0; l < NUM_LAYERS; l++)
+		for (unsigned int i = 0; i < m_size * n_size; i++){
+			if(i % 100 < 98){
+				A[l][i] = 0;
+			}else{
+				A[l][i] = (l + i) % 2;
+			}
+		}
+	for (unsigned int i = 0; i < n_size; i++){
+		if(i % 50 < 48){
+			B[i] = 0;
+		}
+		else{
+			B[i] = i % 2;
+		}
+	}
+}
+
+// Compute output in the host
+static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
+	for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
+		for (unsigned int m = 0; m < m_size; m++){
+			C[m] = 0;
+		}
+		#pragma omp parallel for
+		for (unsigned int m = 0; m < m_size; m++){
+			for (unsigned int n = 0; n < n_size; n++){
+				C[m] += A[nl][m * n_size + n] * B[n];
+			}
+			C[m] = max(0, C[m]); // ReLU
+		}
+		// The output of this layer is the input of the next one
+		for (unsigned int n = 0; n < n_size; n++){
+			B[n] = C[n];
+		}
+	}
+}
+
+static uint64_t mlp_host_sum(uint64_t n_size) {
+	uint64_t sum = 0;
+	for (uint64_t m = 0; m < n_size; m++){
+		sum += B[m];
+	}
+	return sum;
+}
+
+// Params ---------------------------------------------------------------------
+typedef struct Params {
+	char* dpu_type;
+	int   nr_of_ranks;
+	int   input_size_n;
+	int   input_size_m;
+	int   n_warmup;
+	int   n_reps;
+} Params;
+
+void usage() {
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -d <D>    DPU type (default=fsim)"
+		"\n    -r <R>    # of ranks (default=1)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -n <N>    input size N (default=8192 elements)"
+		"\n    -m <M>    input size M (default=20480 elements)"
+		"\n");
+}
+
+struct Params input_params(int argc, char **argv) {
+	struct Params p;
+	p.dpu_type     = "fsim";
+	p.nr_of_ranks  = 1;
+	p.input_size_n = 8192;
+	p.input_size_m = 20480;
+	p.n_warmup     = 2;
+	p.n_reps       = 3;
+
+	int opt;
+	while((opt = getopt(argc, argv, "hd:r:n:m:")) >= 0) {
+		switch(opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'd': p.dpu_type     = optarg;       break;
+		case 'r': p.nr_of_ranks  = atoi(optarg); break;
+		case 'n': p.input_size_n = atoi(optarg); break;
+		case 'm': p.input_size_m = atoi(optarg); break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);
+		}
+	}
+	assert(p.nr_of_ranks > 0 && "Invalid # of ranks!");
+
+	return p;
+}
+
+/**
+* @brief Main of the Host Application.
+*/
+int main(int argc, char **argv) {
+	struct Params p = input_params(argc, argv);
+	uint64_t n_size = p.input_size_n;
+	uint64_t m_size = p.input_size_m;
+
+	Timer timer;
+	A = malloc(NUM_LAYERS * sizeof(T*));
+	for(int l = 0; l < NUM_LAYERS; l++)
+		A[l] = malloc(n_size * m_size * sizeof(T));
+	B = malloc(m_size * sizeof(T));
+	C = malloc(m_size * sizeof(T));
+
+	// Fill the weight matrices and the input vector with arbitrary data.
+	init_data(A, B, m_size, n_size);
+
+	start(&timer, 0, 1);
+	mlp_host(C, A, B, m_size, n_size);
+	stop(&timer, 0);
+
+	uint64_t sum = mlp_host_sum(n_size);
+	printf("Kernel ");
+	print(&timer, 0, 1);
+	printf("\n");
+
+	printf("SUM = %lu \n", (unsigned long) sum);
+
+	for(int l = 0; l < NUM_LAYERS; l++)
+		free(A[l]);
+	free(A);
+	free(B);
+	free(C);
+
+	return 0;
+}
diff --git a/MLP/baselines/gpu/Makefile b/MLP/baselines/gpu/Makefile
new file mode 100644
index 0000000..69ee49c
--- /dev/null
+++ b/MLP/baselines/gpu/Makefile
@@ -0,0 +1,5 @@
+all:
+	/usr/local/cuda/bin/nvcc mlp.cu -I/usr/local/cuda/include -lm -o mlp
+
+clean:
+	rm -f mlp
diff --git a/MLP/baselines/gpu/README b/MLP/baselines/gpu/README
new file mode 100644
index 0000000..253c8e3
--- /dev/null
+++ b/MLP/baselines/gpu/README
@@ -0,0 +1,9 @@
+Multilayer Perceptron (MLP)
+
+Compilation instructions
+
+	make
+
+Execution instructions
+
+	./mlp
diff --git a/MLP/baselines/gpu/mlp.cu b/MLP/baselines/gpu/mlp.cu
new file mode 100644
index 0000000..c912d10
--- /dev/null
+++ b/MLP/baselines/gpu/mlp.cu
@@ -0,0 +1,208 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <cuda.h>
+#include "../../support/common.h"
+
+#define THREAD 128
+
+__global__ void gemv(int m, int n, T *adim, T *b, T *d_ans);
+
+void cgemv(int m, int n, T *adim, T *b, T *d_ans);
+
+// Wall-clock helper (kept for reference; kernel time uses CUDA events)
+double gettime()
+{
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+	return tv.tv_sec + (double)tv.tv_usec*1.0e-6;
+}
+
+int main(int argc, char **argv)
+{
+	/* for CPU */
+	int i, j;
+	T **bdim;
+	T *c, *ans, *h_ans, *h_c;
+	int n = 8192;
+	int m = 20480;
+
+	bdim = (T**) malloc(NUM_LAYERS * sizeof(T*));
+	for(int l = 0; l < NUM_LAYERS; l++)
+		bdim[l] = (T*)malloc(sizeof(T)*m*n);
+	c     = (T*)malloc(sizeof(T)*n);
+	h_c   = (T*)malloc(sizeof(T)*n);
+	ans   = (T*)malloc(sizeof(T)*m);
+	h_ans = (T*)malloc(sizeof(T)*m);
+
+	/* for GPU */
+	T *d_bdim;
+	T *d_c, *d_ans, *d_tmp;
+	cudaMalloc((void **)&d_bdim, sizeof(T)*m*n);
+	cudaMalloc((void **)&d_c,    sizeof(T)*n);
+	cudaMalloc((void **)&d_ans,  sizeof(T)*m);
+	cudaMalloc((void **)&d_tmp,  sizeof(T)*m);
+
+	for(i = 0; i < n; i++)
+	{
+		if(i % 50 < 48)
+		{
+			c[i]   = 0;
+			h_c[i] = 0;
+		}
+		else
+		{
+			c[i]   = i % 2;
+			h_c[i] = i % 2;
+		}
+	}
+	for(int l = 0; l < NUM_LAYERS; l++)
+		for(i = 0; i < n; i++)
+		{
+			for(j = 0; j < m; j++){
+				if(j % 100 < 98)
+				{
+					bdim[l][i*m+j] = 0;
+				}
+				else
+				{
+					bdim[l][i*m+j] = (l + i) % 2;
+				}
+			}
+		}
+
+	for(j = 0; j < m; j++){
+		ans[j]   = 0;
+		h_ans[j] = 0;
+	}
+
+	// Computation on the host for verification. cgemv is safe for
+	// aliased input/output, so each layer's output can be fed back
+	// in as the next layer's input.
+	T* vector = c;
+	T* output = ans;
+	T* matrix;
+	int mm = m;
+	int nn = n;
+	for(int l = 0; l < NUM_LAYERS; l++){
+		matrix = bdim[l];
+		cgemv(mm, nn, matrix, vector, output);
+		vector = output;
+		// Swap the dimensions: the layers alternate between the two
+		// transposed views of the weight buffer.
+		int t = mm; mm = nn; nn = t;
+	}
+
+	// Event creation
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	float time1 = 0;
+	float time2 = 0;
+	cudaMemcpy(d_ans, h_ans, sizeof(T)*m, cudaMemcpyHostToDevice);
+	cudaMemcpy(d_c,   h_c,   sizeof(T)*n, cudaMemcpyHostToDevice);
+
+	// Ping-pong between d_ans and d_tmp so that a layer never reads
+	// and writes the same device buffer (the kernel provides no
+	// ordering between blocks).
+	T *d_in  = d_c;
+	T *d_out = d_ans;
+	mm = m;
+	nn = n;
+	for(int l = 0; l < NUM_LAYERS; l++){
+		cudaMemcpy(d_bdim, bdim[l], sizeof(T)*m*n, cudaMemcpyHostToDevice);
+		// Start timer
+		cudaEventRecord( start, 0 );
+		gemv<<<mm, THREAD>>>(mm, nn, d_bdim, d_in, d_out);
+		// End timer
+		cudaEventRecord( stop, 0 );
+		cudaEventSynchronize( stop );
+		cudaEventElapsedTime( &time2, start, stop );
+		time1 += time2;
+		d_in  = d_out;
+		d_out = (d_out == d_ans) ? d_tmp : d_ans;
+		int t = mm; mm = nn; nn = t;
+	}
+
+	cudaMemcpy(h_ans, d_in, sizeof(T)*m, cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_c,   d_c,  sizeof(T)*n, cudaMemcpyDeviceToHost);
+
+	// Compare the GPU results against the host reference
+	for(i = 0; i < m; i++)
+	{
+		if(ans[i] != h_ans[i])
+			printf("ERROR in Ans %d -> %d -- %d\n", i, ans[i], h_ans[i]);
+	}
+	for(i = 0; i < n; i++)
+	{
+		if(c[i] != h_c[i])
+			printf("ERROR in C %d -> %d -- %d\n", i, c[i], h_c[i]);
+	}
+	printf("Execution time = %f ms\n", time1);
+
+	for(int l = 0; l < NUM_LAYERS; l++)
+		free(bdim[l]);
+	free(bdim);
+	free(c);
+	free(ans);
+	free(h_c);
+	free(h_ans);
+	cudaFree(d_bdim);
+	cudaFree(d_c);
+	cudaFree(d_ans);
+	cudaFree(d_tmp);
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+
+	return 0;
+}
+
+__global__ void gemv(int m, int n, T* adim, T* b, T* d_ans)
+{
+	int i;
+	int div = n / THREAD;
+	__shared__ T tmp[THREAD];
+
+	// Each block reduces one row of adim against the input vector b.
+	tmp[threadIdx.x] = 0;
+
+	for(i = 0; i < div; i++){
+		tmp[threadIdx.x] += adim[blockIdx.x*n + i*THREAD + threadIdx.x] * b[i*THREAD + threadIdx.x];
+	}
+	// Leftover columns when n is not a multiple of THREAD
+	if(threadIdx.x < n % THREAD)
+		tmp[threadIdx.x] += adim[blockIdx.x*n + THREAD*div + threadIdx.x] * b[THREAD*div + threadIdx.x];
+
+	__syncthreads();
+
+	// Tree reduction in shared memory
+	for(i = THREAD / 2; i > 0; i >>= 1)
+	{
+		if(threadIdx.x < i)
+			tmp[threadIdx.x] += tmp[threadIdx.x + i];
+		__syncthreads();
+	}
+
+	// ReLU on the reduced dot product
+	if(threadIdx.x == 0)
+		d_ans[blockIdx.x] = max(0, tmp[0]);
+}
+
+void cgemv(int m, int n, T *adim, T *b, T *d_ans)
+{
+	int i, j;
+	// Scratch buffer so the routine stays correct when b and d_ans
+	// alias each other (the layer loop above reuses one buffer as
+	// both input and output).
+	T *tmp = (T*)malloc(sizeof(T)*m);
+
+	for(i = 0; i < m; i++){
+		tmp[i] = 0;
+		for(j = 0; j < n; j++)
+			tmp[i] += adim[i*n+j] * b[j];
+		tmp[i] = max(0, tmp[i]);
+	}
+	memcpy(d_ans, tmp, sizeof(T)*m);
+	free(tmp);
+}
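
Note: both baselines include ../../support/common.h for the element type T, the layer count NUM_LAYERS, and the max() helper used as the ReLU clamp; that header is not part of this diff. To build the files above in isolation, a minimal stand-in along the following lines should suffice. The concrete values here (32-bit integers, three layers) are assumptions inferred from how the code uses the names, not definitions taken from the suite.

/* Hypothetical stand-in for ../../support/common.h.
 * The real header ships with the benchmark suite; these definitions
 * only mirror how the baselines above use it. */
#ifndef COMMON_H_
#define COMMON_H_

#include <stdint.h>

typedef int32_t T;             /* element type used by both baselines (assumed) */
#define NUM_LAYERS 3           /* number of MLP layers (assumed) */
#define max(x, y) (((x) > (y)) ? (x) : (y))   /* ReLU clamp in mlp_host/gemv/cgemv */

#endif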
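Similarly, the Timer/start/stop/print calls in mlp_openmp.c come from ../../support/timer.h, which is also outside this diff. A sketch that matches those call sites, with deliberately simplified semantics (start() always resets its interval and ignores the rep flag; print() reports milliseconds divided by its rep argument), could look like this:

/* Hypothetical stand-in for ../../support/timer.h, inferred from the
 * call sites in mlp_openmp.c; the real header may accumulate across
 * repetitions differently. */
#ifndef TIMER_H_
#define TIMER_H_

#include <stdio.h>
#include <sys/time.h>

typedef struct Timer {
	struct timeval start_time[8];
	double         time_ms[8];
} Timer;

static inline void start(Timer *t, int i, int rep) {
	(void) rep;                     /* accumulation flag, unused in this sketch */
	t->time_ms[i] = 0.0;
	gettimeofday(&t->start_time[i], NULL);
}

static inline void stop(Timer *t, int i) {
	struct timeval now;
	gettimeofday(&now, NULL);
	t->time_ms[i] += (now.tv_sec  - t->start_time[i].tv_sec)  * 1000.0 +
	                 (now.tv_usec - t->start_time[i].tv_usec) / 1000.0;
}

static inline void print(Timer *t, int i, int rep) {
	printf("Time (ms): %f", t->time_ms[i] / rep);
}

#endif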