PrIM -- first commit

author: Juan Gomez Luna <juan.gomez@safari.ethz.ch> 2021-06-16 19:46:05 +0200
committer: Juan Gomez Luna <juan.gomez@safari.ethz.ch> 2021-06-16 19:46:05 +0200
commit: 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
tree: fc6776a94549d2d4039898f183dbbeb2ce013ba9 /GEMV/baselines
parent: ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)
7 files changed, 289 insertions, 0 deletions
diff --git a/GEMV/baselines/cpu/Makefile b/GEMV/baselines/cpu/Makefile
new file mode 100644
index 0000000..c779651
--- /dev/null
+++ b/GEMV/baselines/cpu/Makefile
@@ -0,0 +1,7 @@
+# Build and clean rules for the OpenMP GEMV baseline.
+# Both targets are commands, not files, so declare them phony.
+.PHONY: all clean
+
+all:
+	gcc -o gemv -fopenmp gemv_openmp.c
+
+# -f keeps "make clean" from failing when the binary has not been built yet.
+clean:
+	rm -f gemv
+
+
diff --git a/GEMV/baselines/cpu/README b/GEMV/baselines/cpu/README
new file mode 100644
index 0000000..92906c3
--- /dev/null
+++ b/GEMV/baselines/cpu/README
@@ -0,0 +1,9 @@
+Matrix-Vector Multiplication (GEMV)
+
+Compilation instructions:
+
+    make
+
+Execution instructions:
+
+    ./gemv
diff --git a/GEMV/baselines/cpu/gemv_openmp.c b/GEMV/baselines/cpu/gemv_openmp.c
new file mode 100644
index 0000000..307e03b
--- /dev/null
+++ b/GEMV/baselines/cpu/gemv_openmp.c
@@ -0,0 +1,78 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "../../support/timer.h"
+#include "gemv_utils.h"
+
+/*
+ * Driver for the OpenMP GEMV baseline.
+ *
+ * Fills a rows x cols matrix A with 1/(i+j+1) via make_hilbert_mat(), sets
+ * x = (1, 2, ..., cols) and b = 0, runs gemv() once between the start()/stop()
+ * timer calls from support/timer.h, then prints the timer and the sums of x
+ * and b as a quick sanity check.
+ *
+ * Returns 0 on success, EXIT_FAILURE if a buffer cannot be allocated.
+ */
+int main(int argc, char *argv[])
+{
+  /* The problem size is fixed; command-line arguments are not used. */
+  (void) argc;
+  (void) argv;
+
+  const size_t rows = 20480;
+  const size_t cols = 8192;
+
+  double **A = NULL;
+  double *b = (double*) malloc(sizeof(double)*rows);
+  double *x = (double*) malloc(sizeof(double)*cols);
+
+  if (b == NULL || x == NULL) {
+    fprintf(stderr, "gemv: could not allocate the vectors\n");
+    free(b);
+    free(x);
+    return EXIT_FAILURE;
+  }
+
+  allocate_dense(rows, cols, &A);
+  /* allocate_dense() returns nothing, so inspect the pointers it filled in. */
+  if (A == NULL || A[0] == NULL) {
+    fprintf(stderr, "gemv: could not allocate the matrix\n");
+    free(A);
+    free(b);
+    free(x);
+    return EXIT_FAILURE;
+  }
+
+  make_hilbert_mat(rows,cols, &A);
+
+#pragma omp parallel
+  {
+#pragma omp for
+    for (size_t i = 0; i < cols; i++) {
+      x[i] = (double) i+1 ;
+    }
+
+#pragma omp for
+    for (size_t i = 0; i < rows; i++) {
+      b[i] = (double) 0.0;
+    }
+  }
+
+  /* Only the gemv() call sits between start() and stop(). */
+  Timer timer;
+  start(&timer, 0, 0);
+
+  gemv(A, x, rows, cols, &b);
+
+  stop(&timer, 0);
+
+  printf("Kernel ");
+  print(&timer, 0, 1);
+  printf("\n");
+
+#if 0
+  /* Debug dump; x holds cols elements, b holds rows elements. */
+  print_vec(x, cols);
+  print_mat(A, rows, cols);
+  print_vec(b, rows);
+#endif
+
+  printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x,cols), sum_vec(b,rows));
+
+  /* A[0] is the contiguous data block, A itself the row-pointer array. */
+  free(A[0]);
+  free(A);
+  free(b);
+  free(x);
+  return 0;
+}
+
+/*
+ * Dense matrix-vector product accumulated into *b:
+ *   (*b)[i] = (*b)[i] + sum over j of A[i][j] * x[j],  for 0 <= i < rows.
+ *
+ * A    : array of `rows` row pointers, each addressing `cols` doubles
+ * x    : input vector with `cols` elements
+ * b    : pointer to the output vector pointer; the vector has `rows` elements
+ *        and its incoming contents are the starting value of each sum
+ *
+ * Rows are split across OpenMP threads; each output element is written by
+ * exactly one loop iteration.
+ */
+void gemv(double** A, double* x, size_t rows, size_t cols, double** b) {
+  double *out = *b;
+#pragma omp parallel for
+  for (size_t r = 0; r < rows; r++) {
+    /* Same left-to-right summation order as a direct in-place update. */
+    double acc = out[r];
+    for (size_t c = 0; c < cols; c++) {
+      acc = acc + A[r][c]*x[c];
+    }
+    out[r] = acc;
+  }
+}
+
+/*
+ * Fill the rows x cols matrix behind *A with (*A)[i][j] = 1 / (i + j + 1),
+ * using zero-based i and j. The storage must already be allocated; rows are
+ * distributed across OpenMP threads.
+ */
+void make_hilbert_mat(size_t rows, size_t cols, double*** A) {
+  double **mat = *A;
+#pragma omp parallel for
+  for (size_t r = 0; r < rows; r++) {
+    for (size_t c = 0; c < cols; c++) {
+      const double denom = (double) r + (double) c + 1.0;
+      mat[r][c] = 1.0/denom;
+    }
+  }
+}
+
+/*
+ * Return the sum of the first `rows` elements of vec (0.0 when rows is 0).
+ * The additions are combined with an OpenMP reduction, so the summation
+ * order, and therefore the floating-point rounding, may vary with the
+ * number of threads.
+ */
+double sum_vec(double* vec, size_t rows) {
+  double sum = 0.0;
+  /* The index has the same type as the bound: an int index would be
+     compared as unsigned and would overflow on very long vectors. */
+#pragma omp parallel for reduction(+:sum)
+  for (size_t i = 0; i < rows; i++) sum = sum + vec[i];
+  return sum;
+}
diff --git a/GEMV/baselines/cpu/gemv_utils.h b/GEMV/baselines/cpu/gemv_utils.h
new file mode 100644
index 0000000..605f148
--- /dev/null
+++ b/GEMV/baselines/cpu/gemv_utils.h
@@ -0,0 +1,29 @@
+/*
+ * Allocate a rows x cols matrix of doubles as one contiguous data block plus
+ * an array of `rows` row pointers, and store the row-pointer array in *dense.
+ *
+ * (*dense)[i] points at element i*cols of the data block, so (*dense)[0] is
+ * the start of the block. The elements themselves are left uninitialized.
+ * Release with: free((*dense)[0]); free(*dense);
+ *
+ * On failure (rows or cols is 0, the byte count overflows size_t, or malloc
+ * returns NULL) nothing stays allocated and *dense is set to NULL.
+ */
+void allocate_dense(size_t rows,size_t  cols, double*** dense) {
+  double **row_ptrs;
+  double *data;
+
+  *dense = NULL;
+
+  if (rows == 0 || cols == 0) {
+    return;
+  }
+  /* Reject sizes whose total byte count does not fit in size_t. */
+  if (rows > ((size_t) -1) / sizeof(double) / cols) {
+    return;
+  }
+
+  /* The first array holds pointers, so size it by the pointer type. */
+  row_ptrs = malloc(sizeof(*row_ptrs)*rows);
+  if (row_ptrs == NULL) {
+    return;
+  }
+
+  data = malloc(sizeof(*data)*rows*cols);
+  if (data == NULL) {
+    free(row_ptrs);
+    return;
+  }
+
+  for (size_t i=0; i < rows; i++ ) {
+    row_ptrs[i] = data + i*cols;
+  }
+
+  *dense = row_ptrs;
+}
+
+/*
+ * Write the rows x cols matrix A to stdout, one matrix row per line, each
+ * value formatted with "%f" and followed by a single space.
+ */
+void print_mat(double** A, size_t rows, size_t cols) {
+  size_t r = 0;
+  while (r < rows) {
+    size_t c = 0;
+    while (c < cols) {
+      printf("%f ", A[r][c]);
+      ++c;
+    }
+    printf("\n");
+    ++r;
+  }
+}
+
+/*
+ * Write the first `rows` elements of b to stdout, one "%f"-formatted value
+ * per line.
+ */
+void print_vec(double* b, size_t rows) {
+  const double *cur = b;
+  const double *end = b + rows;
+  while (cur != end) {
+    printf("%f\n", *cur);
+    ++cur;
+  }
+}
+
+void gemv(double** A, double* x, size_t rows, size_t cols, double** b);
+void make_hilbert_mat(size_t rows, size_t cols, double*** A);
+double sum_vec(double* vec, size_t rows);
diff --git a/GEMV/baselines/gpu/Makefile b/GEMV/baselines/gpu/Makefile
new file mode 100644
index 0000000..bd1be55
--- /dev/null
+++ b/GEMV/baselines/gpu/Makefile
@@ -0,0 +1,5 @@
+# Build and clean rules for the CUDA GEMV baseline.
+# NVCC can be overridden, e.g. "make NVCC=nvcc", when CUDA is not installed
+# under /usr/local/cuda.
+NVCC ?= /usr/local/cuda/bin/nvcc
+
+# Both targets are commands, not files, so declare them phony.
+.PHONY: all clean
+
+all:
+	$(NVCC) gemv.cu -I/usr/local/cuda/include -lm -o gemv
+
+# -f keeps "make clean" from failing when the binary has not been built yet.
+clean:
+	rm -f gemv
diff --git a/GEMV/baselines/gpu/README b/GEMV/baselines/gpu/README
new file mode 100644
index 0000000..92906c3
--- /dev/null
+++ b/GEMV/baselines/gpu/README
@@ -0,0 +1,9 @@
+Matrix-Vector Multiplication (GEMV)
+
+Compilation instructions:
+
+    make
+
+Execution instructions:
+
+    ./gemv
diff --git a/GEMV/baselines/gpu/gemv.cu b/GEMV/baselines/gpu/gemv.cu
new file mode 100644
index 0000000..5c0e240
--- /dev/null
+++ b/GEMV/baselines/gpu/gemv.cu
@@ -0,0 +1,152 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <cuda.h>
+
+#define THREAD 128
+
+#define T int
+
+__global__ void gemv(int m, int n, T *adim, T *b, T *d_ans);
+
+void cgemv(int m, int n, T *adim, T *b, T *d_ans);
+
+/*
+ * Return the current time reported by gettimeofday() as seconds, with the
+ * microsecond field folded in as the fractional part.
+ */
+double gettime()
+{
+    struct timeval now;
+    gettimeofday(&now, NULL);
+
+    const double fraction = (double)now.tv_usec*1.0e-6;
+    return now.tv_sec + fraction;
+}
+
+/* Stop with a readable message when a CUDA runtime call reports an error. */
+#define CUDA_CHECK(call)                                                   \
+    do {                                                                   \
+        cudaError_t cuda_status_ = (call);                                 \
+        if (cuda_status_ != cudaSuccess) {                                 \
+            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
+                    __LINE__, cudaGetErrorString(cuda_status_));           \
+            exit(EXIT_FAILURE);                                            \
+        }                                                                  \
+    } while (0)
+
+/*
+ * Driver for the GPU GEMV baseline.
+ *
+ * Builds an m x n matrix and an n-element vector filled with ones, computes
+ * the product on the CPU with cgemv() and on the GPU with the gemv kernel
+ * (one block of THREAD threads per matrix row), prints both results side by
+ * side for every row, and reports the kernel time measured with CUDA events.
+ * Host-to-device and device-to-host copies are outside the timed region.
+ *
+ * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
+ */
+int main(int argc, char **argv)
+{
+    /* The problem size is fixed; command-line arguments are not used. */
+    (void)argc;
+    (void)argv;
+
+    const int n = 8192;   /* matrix columns / input vector length */
+    const int m = 20480;  /* matrix rows / output vector length */
+    const size_t matrixElems = (size_t)m * (size_t)n;
+    int i;
+
+    /* Host buffers. ans is zero-filled so the CPU reference starts from a
+       defined value instead of whatever malloc happens to return. */
+    T *bdim = (T*)malloc(sizeof(T) * matrixElems);
+    T *c = (T*)malloc(sizeof(T) * n);
+    T *ans = (T*)calloc(m, sizeof(T));
+    T *h_ans = (T*)malloc(sizeof(T) * m);
+    if (bdim == NULL || c == NULL || ans == NULL || h_ans == NULL) {
+        fprintf(stderr, "gemv: host allocation failed\n");
+        free(bdim);
+        free(c);
+        free(ans);
+        free(h_ans);
+        return EXIT_FAILURE;
+    }
+
+    /* Device buffers. */
+    T *d_bdim = NULL, *d_c = NULL, *d_ans = NULL;
+    CUDA_CHECK(cudaMalloc((void **)&d_bdim, sizeof(T) * matrixElems));
+    CUDA_CHECK(cudaMalloc((void **)&d_c, sizeof(T) * n));
+    CUDA_CHECK(cudaMalloc((void **)&d_ans, sizeof(T) * m));
+
+    /* Every matrix and vector element is 1. */
+    for (size_t k = 0; k < matrixElems; k++)
+        bdim[k] = 1;
+    for (i = 0; i < n; i++)
+        c[i] = 1;
+
+    /* CPU reference result. */
+    cgemv(m, n, bdim, c, ans);
+
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+    float time1 = 0;
+
+    CUDA_CHECK(cudaMemcpy(d_bdim, bdim, sizeof(T) * matrixElems, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_c, c, sizeof(T) * n, cudaMemcpyHostToDevice));
+
+    /* Timed region: the kernel only. */
+    CUDA_CHECK(cudaEventRecord(start, 0));
+    gemv<<<m, THREAD>>>(m, n, d_bdim, d_c, d_ans);
+    CUDA_CHECK(cudaEventRecord(stop, 0));
+    /* A launch does not return a status itself; fetch it explicitly. */
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    CUDA_CHECK(cudaEventElapsedTime(&time1, start, stop));
+
+    CUDA_CHECK(cudaMemcpy(h_ans, d_ans, sizeof(T) * m, cudaMemcpyDeviceToHost));
+
+    /* CPU result -- GPU result, one row per line. */
+    for (i = 0; i < m; i++)
+        printf("%d -- %d\n", ans[i], h_ans[i]);
+
+    printf("Execution time = %f ms\n", time1);
+
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+
+    free(bdim);
+    free(c);
+    free(ans);
+    free(h_ans);
+    CUDA_CHECK(cudaFree(d_bdim));
+    CUDA_CHECK(cudaFree(d_c));
+    CUDA_CHECK(cudaFree(d_ans));
+
+    return 0;
+}
+
+/*
+ * GEMV kernel: d_ans[row] = sum over col of adim[row*n + col] * b[col].
+ *
+ * Launch layout: one block per matrix row (gridDim.x >= m) with exactly
+ * THREAD threads per block; THREAD must be a power of two because of the
+ * halving reduction below. Uses THREAD * sizeof(T) bytes of static shared
+ * memory.
+ *
+ * m      : number of matrix rows (length of d_ans)
+ * n      : number of matrix columns (length of b); any n is handled, not
+ *          only multiples of THREAD
+ * adim   : matrix, m rows of n contiguous elements
+ * b      : input vector
+ * d_ans  : output vector, overwritten
+ */
+__global__ void gemv(int m, int n, T* adim, T* b, T* d_ans)
+{
+    __shared__ T tmp[THREAD];
+
+    const unsigned int tid = threadIdx.x;
+    const unsigned int row = blockIdx.x;
+
+    /* Each thread sums the columns tid, tid + THREAD, tid + 2*THREAD, ...
+       Consecutive threads read consecutive addresses, and the tail of a row
+       whose length is not a multiple of THREAD is covered by the same loop. */
+    T partial = 0;
+    if (row < (unsigned int)m)
+    {
+        const T *rowPtr = adim + (size_t)row * n;
+        for (int col = tid; col < n; col += THREAD)
+            partial += rowPtr[col] * b[col];
+    }
+    tmp[tid] = partial;
+    __syncthreads();
+
+    /* Tree reduction in shared memory. The barrier is outside the
+       conditional so every thread of the block reaches it, and it separates
+       each step's writes from the next step's reads. */
+    for (int stride = THREAD / 2; stride > 0; stride /= 2)
+    {
+        if (tid < stride)
+            tmp[tid] += tmp[tid + stride];
+        __syncthreads();
+    }
+
+    if (tid == 0 && row < (unsigned int)m)
+        d_ans[row] = tmp[0];
+}
+
+/*
+ * CPU reference GEMV: d_ans[i] = sum over j of adim[i*n + j] * b[j],
+ * for 0 <= i < m.
+ *
+ * m      : number of matrix rows (length of d_ans)
+ * n      : number of matrix columns (length of b)
+ * adim   : matrix, m rows of n contiguous elements
+ * b      : input vector
+ * d_ans  : output vector; despite the name it is dereferenced on the host
+ *          here. It is overwritten, so it need not be initialized by the
+ *          caller.
+ */
+void cgemv(int m, int n, T *adim, T *b, T *d_ans)
+{
+    for (int row = 0; row < m; row++)
+    {
+        /* size_t offset: row * n can exceed the int range for large inputs. */
+        const T *rowPtr = adim + (size_t)row * n;
+        T acc = 0;
+
+        for (int col = 0; col < n; col++)
+            acc += rowPtr[col] * b[col];
+
+        d_ans[row] = acc;
+    }
+}
author	Juan Gomez Luna <juan.gomez@safari.ethz.ch>	2021-06-16 19:46:05 +0200
committer	Juan Gomez Luna <juan.gomez@safari.ethz.ch>	2021-06-16 19:46:05 +0200
commit	3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
tree	fc6776a94549d2d4039898f183dbbeb2ce013ba9 /GEMV/baselines
parent	ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)