diff options
author | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
---|---|---|
committer | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
commit | 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch) | |
tree | fc6776a94549d2d4039898f183dbbeb2ce013ba9 /GEMV/baselines/gpu/gemv.cu | |
parent | ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff) |
PrIM -- first commit
Diffstat (limited to 'GEMV/baselines/gpu/gemv.cu')
-rw-r--r-- | GEMV/baselines/gpu/gemv.cu | 152 |
1 files changed, 152 insertions, 0 deletions
diff --git a/GEMV/baselines/gpu/gemv.cu b/GEMV/baselines/gpu/gemv.cu new file mode 100644 index 0000000..5c0e240 --- /dev/null +++ b/GEMV/baselines/gpu/gemv.cu @@ -0,0 +1,152 @@ +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include <cuda.h> + +#define THREAD 128 + +#define T int + +__global__ void gemv(int m, int n, T *adim, T *b, T *d_ans); + +void cgemv(int m, int n, T *adim, T *b, T *d_ans); + +double gettime() +{ +struct timeval tv; +gettimeofday(&tv, NULL); +return tv.tv_sec + (double)tv.tv_usec*1.0e-6; +} + +int main(int argc, char **argv) +{ +/* for CPU */ +int i, j; +int *bdim, *c, *ans, *h_ans; +//double start, stop; +//double cpu_time, gpu_time; +int n = 8192; +int m = 20480; + +bdim = (T*)malloc(sizeof(T) *m*n); +c = (T*)malloc(sizeof(T) *n); +ans = (T*)malloc(sizeof(T) *m); +h_ans = (T*)malloc(sizeof(T) *m); + +/* for GPU */ +T *d_bdim, *d_c, *d_ans; +cudaMalloc((void **)&d_bdim, sizeof(T)*m*n); +cudaMalloc((void **)&d_c, sizeof(T)*n); +cudaMalloc((void **)&d_ans, sizeof(T)*m); + +for(i = 0; i < n; i++) +{ +c[i] = 1; +for(j = 0; j < m; j++) +bdim[i*m+j] = 1; +} + +//start = gettime(); +cgemv(m, n, bdim, c, ans); +//stop = gettime(); +//cpu_time=stop - start; + +// Event creation +cudaEvent_t start, stop; +cudaEventCreate(&start); +cudaEventCreate(&stop); +float time1 = 0; + + +cudaMemcpy(d_bdim, bdim, sizeof(T)*m*n, cudaMemcpyHostToDevice); +cudaMemcpy(d_c, c, sizeof(T)*n, cudaMemcpyHostToDevice); + +// Start timer +cudaEventRecord( start, 0 ); +//start = gettime(); +gemv<<<m, THREAD>>>(m, n, d_bdim, d_c, d_ans); +//stop = gettime(); +// End timer +cudaEventRecord( stop, 0 ); +cudaEventSynchronize( stop ); +cudaEventElapsedTime( &time1, start, stop ); + +//gpu_time=stop - start; + +cudaMemcpy(h_ans, d_ans, sizeof(T)*m, cudaMemcpyDeviceToHost); + +//printf("cpu_time : %.6f[sec]\n",cpu_time); +//printf("gpu_time : %.6f[sec]\n",gpu_time); +//printf("%f x\n", cpu_time / gpu_time); + + +for(i = 0; i < m; i++) +printf("%d -- %d\n", ans[i], h_ans[i]); + +printf("Execution time = %f ms\n", time1); + + +free(bdim); +free(c); +free(ans); +free(h_ans); +cudaFree(d_bdim); +cudaFree(d_c); +cudaFree(d_ans); + +return 0; +} + +__global__ void gemv(int m, int n, T* adim, T* b, T* d_ans) +{ +int i; +int div = n/THREAD; +__shared__ T tmp[THREAD]; + +tmp[threadIdx.x] = 0.0; + +for(i = 0; i < div; i++) +{ +tmp[threadIdx.x] += adim[blockIdx.x*n+i*THREAD+threadIdx.x] * b[i * THREAD + threadIdx.x]; +} +if(threadIdx.x < m%THREAD) +tmp[threadIdx.x] += adim[blockIdx.x*n+THREAD*div+threadIdx.x] * b[THREAD * div + threadIdx.x]; + +__syncthreads(); + +for(i = THREAD / 2; i > 31; i = i / 2) +{ +if(threadIdx.x < i) +tmp[threadIdx.x] += tmp[threadIdx.x + i]; +__syncthreads(); +} + +if(threadIdx.x < 16) +{ +tmp[threadIdx.x] += tmp[threadIdx.x + 16]; +__syncthreads(); +tmp[threadIdx.x] += tmp[threadIdx.x + 8]; +__syncthreads(); +tmp[threadIdx.x] += tmp[threadIdx.x + 4]; +__syncthreads(); +tmp[threadIdx.x] += tmp[threadIdx.x + 2]; +__syncthreads(); +tmp[threadIdx.x] += tmp[threadIdx.x + 1]; +__syncthreads(); +} + + +if(threadIdx.x == 0) +d_ans[blockIdx.x] = tmp[0]; + +} + +void cgemv(int m, int n, T *adim, T *b, T *d_ans) +{ +int i, j; + +for(i = 0; i < m; i++) +for(j = 0; j < n; j++) +d_ans[i] += adim[i*n+j] * b[j]; + +} |