diff options
Diffstat (limited to 'GEMV/baselines/gpu')
-rw-r--r-- | GEMV/baselines/gpu/Makefile | 5 | ||||
-rw-r--r-- | GEMV/baselines/gpu/README | 9 | ||||
-rw-r--r-- | GEMV/baselines/gpu/gemv.cu | 152 |
3 files changed, 166 insertions, 0 deletions
diff --git a/GEMV/baselines/gpu/Makefile b/GEMV/baselines/gpu/Makefile new file mode 100644 index 0000000..bd1be55 --- /dev/null +++ b/GEMV/baselines/gpu/Makefile @@ -0,0 +1,5 @@ +all: + /usr/local/cuda/bin/nvcc gemv.cu -I/usr/local/cuda/include -lm -o gemv + +clean: + rm gemv diff --git a/GEMV/baselines/gpu/README b/GEMV/baselines/gpu/README new file mode 100644 index 0000000..92906c3 --- /dev/null +++ b/GEMV/baselines/gpu/README @@ -0,0 +1,9 @@ +Matrix-Vector Multiplication (GEMV) + +Compilation instructions: + + make + +Execution instructions + + ./gemv diff --git a/GEMV/baselines/gpu/gemv.cu b/GEMV/baselines/gpu/gemv.cu new file mode 100644 index 0000000..5c0e240 --- /dev/null +++ b/GEMV/baselines/gpu/gemv.cu @@ -0,0 +1,152 @@ +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include <cuda.h> + +#define THREAD 128 + +#define T int + +__global__ void gemv(int m, int n, T *adim, T *b, T *d_ans); + +void cgemv(int m, int n, T *adim, T *b, T *d_ans); + +double gettime() +{ +struct timeval tv; +gettimeofday(&tv, NULL); +return tv.tv_sec + (double)tv.tv_usec*1.0e-6; +} + +int main(int argc, char **argv) +{ +/* for CPU */ +int i, j; +int *bdim, *c, *ans, *h_ans; +//double start, stop; +//double cpu_time, gpu_time; +int n = 8192; +int m = 20480; + +bdim = (T*)malloc(sizeof(T) *m*n); +c = (T*)malloc(sizeof(T) *n); +ans = (T*)malloc(sizeof(T) *m); +h_ans = (T*)malloc(sizeof(T) *m); + +/* for GPU */ +T *d_bdim, *d_c, *d_ans; +cudaMalloc((void **)&d_bdim, sizeof(T)*m*n); +cudaMalloc((void **)&d_c, sizeof(T)*n); +cudaMalloc((void **)&d_ans, sizeof(T)*m); + +for(i = 0; i < n; i++) +{ +c[i] = 1; +for(j = 0; j < m; j++) +bdim[i*m+j] = 1; +} + +//start = gettime(); +cgemv(m, n, bdim, c, ans); +//stop = gettime(); +//cpu_time=stop - start; + +// Event creation +cudaEvent_t start, stop; +cudaEventCreate(&start); +cudaEventCreate(&stop); +float time1 = 0; + + +cudaMemcpy(d_bdim, bdim, sizeof(T)*m*n, cudaMemcpyHostToDevice); +cudaMemcpy(d_c, c, sizeof(T)*n, cudaMemcpyHostToDevice); + +// Start timer +cudaEventRecord( start, 0 ); +//start = gettime(); +gemv<<<m, THREAD>>>(m, n, d_bdim, d_c, d_ans); +//stop = gettime(); +// End timer +cudaEventRecord( stop, 0 ); +cudaEventSynchronize( stop ); +cudaEventElapsedTime( &time1, start, stop ); + +//gpu_time=stop - start; + +cudaMemcpy(h_ans, d_ans, sizeof(T)*m, cudaMemcpyDeviceToHost); + +//printf("cpu_time : %.6f[sec]\n",cpu_time); +//printf("gpu_time : %.6f[sec]\n",gpu_time); +//printf("%f x\n", cpu_time / gpu_time); + + +for(i = 0; i < m; i++) +printf("%d -- %d\n", ans[i], h_ans[i]); + +printf("Execution time = %f ms\n", time1); + + +free(bdim); +free(c); +free(ans); +free(h_ans); +cudaFree(d_bdim); +cudaFree(d_c); +cudaFree(d_ans); + +return 0; +} + +__global__ void gemv(int m, int n, T* adim, T* b, T* d_ans) +{ +int i; +int div = n/THREAD; +__shared__ T tmp[THREAD]; + +tmp[threadIdx.x] = 0.0; + +for(i = 0; i < div; i++) +{ +tmp[threadIdx.x] += adim[blockIdx.x*n+i*THREAD+threadIdx.x] * b[i * THREAD + threadIdx.x]; +} +if(threadIdx.x < m%THREAD) +tmp[threadIdx.x] += adim[blockIdx.x*n+THREAD*div+threadIdx.x] * b[THREAD * div + threadIdx.x]; + +__syncthreads(); + +for(i = THREAD / 2; i > 31; i = i / 2) +{ +if(threadIdx.x < i) +tmp[threadIdx.x] += tmp[threadIdx.x + i]; +__syncthreads(); +} + +if(threadIdx.x < 16) +{ +tmp[threadIdx.x] += tmp[threadIdx.x + 16]; +__syncthreads(); +tmp[threadIdx.x] += tmp[threadIdx.x + 8]; +__syncthreads(); +tmp[threadIdx.x] += tmp[threadIdx.x + 4]; +__syncthreads(); +tmp[threadIdx.x] += tmp[threadIdx.x + 2]; +__syncthreads(); +tmp[threadIdx.x] += tmp[threadIdx.x + 1]; +__syncthreads(); +} + + +if(threadIdx.x == 0) +d_ans[blockIdx.x] = tmp[0]; + +} + +void cgemv(int m, int n, T *adim, T *b, T *d_ans) +{ +int i, j; + +for(i = 0; i < m; i++) +for(j = 0; j < n; j++) +d_ans[i] += adim[i*n+j] * b[j]; + +} |