diff options
author | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
---|---|---|
committer | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
commit | 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch) | |
tree | fc6776a94549d2d4039898f183dbbeb2ce013ba9 /GEMV/baselines | |
parent | ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff) |
PrIM -- first commit
Diffstat (limited to 'GEMV/baselines')
-rw-r--r-- | GEMV/baselines/cpu/Makefile | 7 | ||||
-rw-r--r-- | GEMV/baselines/cpu/README | 9 | ||||
-rw-r--r-- | GEMV/baselines/cpu/gemv_openmp.c | 78 | ||||
-rw-r--r-- | GEMV/baselines/cpu/gemv_utils.h | 29 | ||||
-rw-r--r-- | GEMV/baselines/gpu/Makefile | 5 | ||||
-rw-r--r-- | GEMV/baselines/gpu/README | 9 | ||||
-rw-r--r-- | GEMV/baselines/gpu/gemv.cu | 152 |
7 files changed, 289 insertions, 0 deletions
diff --git a/GEMV/baselines/cpu/Makefile b/GEMV/baselines/cpu/Makefile
new file mode 100644
index 0000000..c779651
--- /dev/null
+++ b/GEMV/baselines/cpu/Makefile
@@ -0,0 +1,7 @@
+all:
+	gcc -o gemv -fopenmp gemv_openmp.c
+
+clean:
+	rm gemv
+
+
diff --git a/GEMV/baselines/cpu/README b/GEMV/baselines/cpu/README
new file mode 100644
index 0000000..92906c3
--- /dev/null
+++ b/GEMV/baselines/cpu/README
@@ -0,0 +1,9 @@
+Matrix-Vector Multiplication (GEMV)
+
+Compilation instructions:
+
+    make
+
+Execution instructions
+
+    ./gemv
diff --git a/GEMV/baselines/cpu/gemv_openmp.c b/GEMV/baselines/cpu/gemv_openmp.c
new file mode 100644
index 0000000..307e03b
--- /dev/null
+++ b/GEMV/baselines/cpu/gemv_openmp.c
@@ -0,0 +1,102 @@
+/*
+ * OpenMP CPU baseline for dense matrix-vector multiplication (GEMV).
+ *
+ * Fills a rows x cols matrix A with Hilbert entries and x with 1..cols,
+ * accumulates A*x into the zero-initialized vector b, and prints the time
+ * spent in gemv() followed by the sums of x and b.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include "../../support/timer.h"
+#include "gemv_utils.h"
+
+int main(int argc, char *argv[])
+{
+    const size_t rows = 20480;
+    const size_t cols = 8192;
+
+    double **A, *b, *x;
+
+    b = (double*) malloc(sizeof(double)*rows);
+    x = (double*) malloc(sizeof(double)*cols);
+    if (b == NULL || x == NULL) {
+        fprintf(stderr, "Failed to allocate the vectors\n");
+        return EXIT_FAILURE;
+    }
+
+    allocate_dense(rows, cols, &A);
+
+    make_hilbert_mat(rows,cols, &A);
+
+#pragma omp parallel
+    {
+#pragma omp for
+        for (size_t i = 0; i < cols; i++) {
+            x[i] = (double) i+1 ;
+        }
+
+#pragma omp for
+        for (size_t i = 0; i < rows; i++) {
+            b[i] = (double) 0.0;
+        }
+    }
+
+    Timer timer;
+    start(&timer, 0, 0);
+
+    gemv(A, x, rows, cols, &b);
+
+    stop(&timer, 0);
+
+    printf("Kernel ");
+    print(&timer, 0, 1);
+    printf("\n");
+
+#if 0
+    /* x holds cols elements; b holds rows elements. */
+    print_vec(x, cols);
+    print_mat(A, rows, cols);
+    print_vec(b, rows);
+#endif
+
+    printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x,cols), sum_vec(b,rows));
+
+    /* A[0] owns the contiguous element storage, A the row-pointer table. */
+    free(A[0]);
+    free(A);
+    free(b);
+    free(x);
+    return 0;
+}
+
+/* Accumulates A*x into *b: (*b)[i] += sum_j A[i][j] * x[j] for every row i. */
+void gemv(double** A, double* x, size_t rows, size_t cols, double** b) {
+#pragma omp parallel for
+    for (size_t i = 0; i < rows; i ++ ) {
+        /* Keep the running sum local so the inner loop does not reload (*b)[i]. */
+        double acc = (*b)[i];
+        for (size_t j = 0; j < cols; j ++ ) {
+            acc = acc + A[i][j]*x[j];
+        }
+        (*b)[i] = acc;
+    }
+}
+
+/* Fills the rows x cols matrix *A with A[i][j] = 1 / (i + j + 1). */
+void make_hilbert_mat(size_t rows, size_t cols, double*** A) {
+#pragma omp parallel for
+    for (size_t i = 0; i < rows; i++) {
+        for (size_t j = 0; j < cols; j++) {
+            (*A)[i][j] = 1.0/( (double) i + (double) j + 1.0);
+        }
+    }
+}
+
+/* Returns the sum of the first `rows` elements of vec (OpenMP reduction). */
+double sum_vec(double* vec, size_t rows) {
+    double sum = 0.0;
+    /* size_t index matches the type of the bound (the original used int). */
+#pragma omp parallel for reduction(+:sum)
+    for (size_t i = 0; i < rows; i++) sum = sum + vec[i];
+    return sum;
+}
diff --git a/GEMV/baselines/cpu/gemv_utils.h b/GEMV/baselines/cpu/gemv_utils.h
new file mode 100644
index 0000000..605f148
--- /dev/null
+++ b/GEMV/baselines/cpu/gemv_utils.h
@@ -0,0 +1,61 @@
+#ifndef GEMV_UTILS_H
+#define GEMV_UTILS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Allocates a rows x cols matrix of doubles as one contiguous element buffer
+ * plus a table of row pointers into it, and stores the table in *dense.
+ * Release with free((*dense)[0]) followed by free(*dense).
+ * Sets *dense to NULL for an empty matrix; exits if an allocation fails.
+ */
+void allocate_dense(size_t rows,size_t cols, double*** dense) {
+
+    if (rows == 0 || cols == 0) {
+        *dense = NULL;
+        return;
+    }
+
+    /* The table holds row pointers, so size it with sizeof(double*). */
+    *dense = malloc(sizeof(double*)*rows);
+    if (*dense == NULL) {
+        fprintf(stderr, "allocate_dense: out of memory\n");
+        exit(EXIT_FAILURE);
+    }
+
+    **dense = malloc(sizeof(double)*rows*cols);
+    if (**dense == NULL) {
+        fprintf(stderr, "allocate_dense: out of memory\n");
+        exit(EXIT_FAILURE);
+    }
+
+    for (size_t i=0; i < rows; i++ ) {
+        (*dense)[i] = (*dense)[0] + i*cols;
+    }
+
+}
+
+/* Prints A row by row, one matrix row per output line. */
+void print_mat(double** A, size_t rows, size_t cols) {
+    for (size_t i = 0; i < rows; i++) {
+        for (size_t j = 0; j < cols; j++) {
+            printf("%f ", A[i][j]);
+        }
+        printf("\n");
+    }
+}
+
+/* Prints the first `rows` elements of b, one per line. */
+void print_vec(double* b, size_t rows) {
+    for (size_t i = 0; i < rows; i++) {
+        printf("%f\n", b[i]);
+    }
+}
+
+/* Defined in gemv_openmp.c. */
+void gemv(double** A, double* x, size_t rows, size_t cols, double** b);
+void make_hilbert_mat(size_t rows, size_t cols, double*** A);
+double sum_vec(double* vec, size_t rows);
+
+#endif /* GEMV_UTILS_H */
diff --git a/GEMV/baselines/gpu/Makefile b/GEMV/baselines/gpu/Makefile
new file mode 100644
index 0000000..bd1be55
--- /dev/null
+++ b/GEMV/baselines/gpu/Makefile
@@ -0,0 +1,5 @@
+all:
+	/usr/local/cuda/bin/nvcc gemv.cu -I/usr/local/cuda/include -lm -o gemv
+
+clean:
+	rm gemv
diff --git a/GEMV/baselines/gpu/README b/GEMV/baselines/gpu/README
new file mode 100644
index 0000000..92906c3
--- /dev/null
+++ b/GEMV/baselines/gpu/README
@@ -0,0 +1,9 @@
+Matrix-Vector Multiplication (GEMV)
+
+Compilation instructions:
+
+    make
+
+Execution instructions
+
+    ./gemv
diff --git a/GEMV/baselines/gpu/gemv.cu b/GEMV/baselines/gpu/gemv.cu
new file mode 100644
index 0000000..5c0e240
--- /dev/null
+++ b/GEMV/baselines/gpu/gemv.cu
@@ -0,0 +1,167 @@
+/*
+ * GPU baseline for integer matrix-vector multiplication (GEMV).
+ *
+ * Multiplies an m x n all-ones matrix by an all-ones vector on the CPU
+ * (cgemv) and on the GPU (gemv), prints both results side by side and the
+ * kernel time measured with CUDA events.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <cuda.h>
+
+#define THREAD 128
+
+#define T int
+
+/* Aborts with file/line and the CUDA error string when a runtime call fails. */
+#define CUDA_CHECK(call) \
+do { \
+    cudaError_t err_ = (call); \
+    if (err_ != cudaSuccess) { \
+        fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+                cudaGetErrorString(err_)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while (0)
+
+__global__ void gemv(int m, int n, T *adim, T *b, T *d_ans);
+
+void cgemv(int m, int n, T *adim, T *b, T *d_ans);
+
+/* Returns the wall-clock time in seconds. */
+double gettime()
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1.0e-6;
+}
+
+int main(int argc, char **argv)
+{
+    /* for CPU */
+    int i;
+    size_t k;
+    T *bdim, *c, *ans, *h_ans;
+    int n = 8192;   /* matrix columns / input vector length */
+    int m = 20480;  /* matrix rows / output vector length */
+    size_t elems = (size_t)m * n;
+
+    bdim = (T*)malloc(sizeof(T) * elems);
+    c = (T*)malloc(sizeof(T) * n);
+    ans = (T*)malloc(sizeof(T) * m);
+    h_ans = (T*)malloc(sizeof(T) * m);
+    if (bdim == NULL || c == NULL || ans == NULL || h_ans == NULL) {
+        fprintf(stderr, "Host allocation failed\n");
+        return EXIT_FAILURE;
+    }
+
+    /* for GPU */
+    T *d_bdim, *d_c, *d_ans;
+    CUDA_CHECK(cudaMalloc((void **)&d_bdim, sizeof(T) * elems));
+    CUDA_CHECK(cudaMalloc((void **)&d_c, sizeof(T) * n));
+    CUDA_CHECK(cudaMalloc((void **)&d_ans, sizeof(T) * m));
+
+    for (i = 0; i < n; i++)
+        c[i] = 1;
+    for (k = 0; k < elems; k++)
+        bdim[k] = 1;
+
+    /* CPU reference */
+    cgemv(m, n, bdim, c, ans);
+
+    // Event creation
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+    float time1 = 0;
+
+    CUDA_CHECK(cudaMemcpy(d_bdim, bdim, sizeof(T) * elems, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_c, c, sizeof(T) * n, cudaMemcpyHostToDevice));
+
+    // Start timer
+    CUDA_CHECK(cudaEventRecord(start, 0));
+    gemv<<<m, THREAD>>>(m, n, d_bdim, d_c, d_ans);
+    CUDA_CHECK(cudaGetLastError());   // reports an invalid launch configuration
+    // End timer
+    CUDA_CHECK(cudaEventRecord(stop, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    CUDA_CHECK(cudaEventElapsedTime(&time1, start, stop));
+
+    CUDA_CHECK(cudaMemcpy(h_ans, d_ans, sizeof(T) * m, cudaMemcpyDeviceToHost));
+
+    for (i = 0; i < m; i++)
+        printf("%d -- %d\n", ans[i], h_ans[i]);
+
+    printf("Execution time = %f ms\n", time1);
+
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+    free(bdim);
+    free(c);
+    free(ans);
+    free(h_ans);
+    CUDA_CHECK(cudaFree(d_bdim));
+    CUDA_CHECK(cudaFree(d_c));
+    CUDA_CHECK(cudaFree(d_ans));
+
+    return 0;
+}
+
+/*
+ * Computes d_ans[row] = sum_j adim[row*n + j] * b[j] for a row-major m x n
+ * matrix. Launch with one block per row (gridDim.x >= m) and exactly THREAD
+ * threads per block; THREAD must be a power of two. Uses THREAD * sizeof(T)
+ * bytes of static shared memory.
+ */
+__global__ void gemv(int m, int n, T* adim, T* b, T* d_ans)
+{
+    int i;
+    int div = n/THREAD;
+    __shared__ T tmp[THREAD];
+
+    /* Whole-block exit, so no thread of a live block skips a barrier below. */
+    if (blockIdx.x >= m)
+        return;
+
+    /* 64-bit row offset so that row*n cannot wrap for large matrices. */
+    size_t row = (size_t)blockIdx.x * n;
+    T sum = 0;
+
+    for (i = 0; i < div; i++)
+        sum += adim[row + i*THREAD + threadIdx.x] * b[i*THREAD + threadIdx.x];
+    /* Tail columns when n is not a multiple of THREAD (bound is n, not m). */
+    if (threadIdx.x < n%THREAD)
+        sum += adim[row + THREAD*div + threadIdx.x] * b[THREAD*div + threadIdx.x];
+
+    tmp[threadIdx.x] = sum;
+    __syncthreads();
+
+    /* Tree reduction; the barrier is outside the branch so all threads reach it. */
+    for (i = THREAD / 2; i > 0; i = i / 2)
+    {
+        if (threadIdx.x < i)
+            tmp[threadIdx.x] += tmp[threadIdx.x + i];
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0)
+        d_ans[blockIdx.x] = tmp[0];
+}
+
+/*
+ * CPU reference: d_ans[i] = sum_j adim[i*n + j] * b[j] for a row-major m x n
+ * matrix. The result is assigned, so d_ans does not need to be initialized.
+ */
+void cgemv(int m, int n, T *adim, T *b, T *d_ans)
+{
+    int i, j;
+
+    for (i = 0; i < m; i++)
+    {
+        T sum = 0;
+        for (j = 0; j < n; j++)
+            sum += adim[(size_t)i*n + j] * b[j];
+        d_ans[i] = sum;
+    }
+}