From 3de4b495fb176eba9a0eb517a4ce05903cb67acb Mon Sep 17 00:00:00 2001
From: Juan Gomez Luna <juan.gomez@safari.ethz.ch>
Date: Wed, 16 Jun 2021 19:46:05 +0200
Subject: PrIM -- first commit

---
 VA/baselines/gpu/vec_add.cu | 101 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 VA/baselines/gpu/vec_add.cu

(limited to 'VA/baselines/gpu/vec_add.cu')

diff --git a/VA/baselines/gpu/vec_add.cu b/VA/baselines/gpu/vec_add.cu
new file mode 100644
index 0000000..c0c259b
--- /dev/null
+++ b/VA/baselines/gpu/vec_add.cu
@@ -0,0 +1,101 @@
+/* File:     vec_add.cu
+ * Purpose:  Implement vector addition on a GPU using CUDA
+ *
+ * Compile:  nvcc [-g] [-G] -o vec_add vec_add.cu
+ * Run:      ./vec_add
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* Element-wise sum: the thread with global index i writes z[i] = x[i] + y[i] when i < n. */
+__global__ void Vec_add(unsigned int x[], unsigned int y[], unsigned int z[], int n) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;    /* surplus threads past the end of the vectors do nothing */
+    z[idx] = x[idx] + y[idx];
+}
+
+
+/* Exit with a diagnostic naming the failed step when a CUDA runtime call errors. */
+static void cuda_check(cudaError_t err, const char* what) {
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+/* Host driver: fills two n * m element vectors, adds them on the GPU with Vec_add
+ * and prints the kernel time measured between two CUDA events. Returns 0 on
+ * success; exits with EXIT_FAILURE when a host allocation or CUDA call fails. */
+int main(int argc, char* argv[]) {
+    /* Define vector length and launch geometry */
+    const int n = 2621440;
+    const int m = 320;
+    const int total = n * m;    /* 838,860,800 elements, still within int range */
+    const size_t size = (size_t)total * sizeof(unsigned int);
+    const int threads = 256;
+    const int blocks = (total + threads - 1) / threads;    /* round up to cover a partial tail */
+
+    // Allocate memory for the vectors on host memory.
+    unsigned int *h_x = (unsigned int*) malloc(size);
+    unsigned int *h_y = (unsigned int*) malloc(size);
+    unsigned int *h_z = (unsigned int*) malloc(size);
+    if (h_x == NULL || h_y == NULL || h_z == NULL) {
+        fprintf(stderr, "Host allocation of 3 x %zu bytes failed\n", size);
+        return EXIT_FAILURE;
+    }
+
+    for (int i = 0; i < total; i++) {
+        h_x[i] = i+1;
+        h_y[i] = n-i;    /* negative values wrap mod 2^32, so each sum equals n + 1 */
+    }
+
+    printf("Input size = %d\n", total);
+
+    // Event creation
+    cudaEvent_t start, stop;
+    float time1 = 0;
+    cuda_check(cudaEventCreate(&start), "cudaEventCreate start");
+    cuda_check(cudaEventCreate(&stop), "cudaEventCreate stop");
+
+    /* Allocate vectors in device memory */
+    unsigned int *d_x, *d_y, *d_z;
+    cuda_check(cudaMalloc(&d_x, size), "cudaMalloc d_x");
+    cuda_check(cudaMalloc(&d_y, size), "cudaMalloc d_y");
+    cuda_check(cudaMalloc(&d_z, size), "cudaMalloc d_z");
+
+    /* Copy vectors from host memory to device memory */
+    cuda_check(cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice), "copy h_x to device");
+    cuda_check(cudaMemcpy(d_y, h_y, size, cudaMemcpyHostToDevice), "copy h_y to device");
+
+    // Start timer
+    cuda_check(cudaEventRecord( start, 0 ), "cudaEventRecord start");
+
+    /* Kernel Call */
+    Vec_add<<<blocks, threads>>>(d_x, d_y, d_z, total);
+    cuda_check(cudaGetLastError(), "Vec_add launch");    /* the launch itself returns no status */
+
+    // End timer
+    cuda_check(cudaEventRecord( stop, 0 ), "cudaEventRecord stop");
+    cuda_check(cudaEventSynchronize( stop ), "cudaEventSynchronize");
+    cuda_check(cudaEventElapsedTime( &time1, start, stop ), "cudaEventElapsedTime");
+
+    cuda_check(cudaMemcpy(h_z, d_z, size, cudaMemcpyDeviceToHost), "copy d_z to host");
+
+    printf("Execution time = %f ms\n", time1);
+
+    /* Release the timing events and device memory */
+    cuda_check(cudaEventDestroy(start), "cudaEventDestroy start");
+    cuda_check(cudaEventDestroy(stop), "cudaEventDestroy stop");
+    cuda_check(cudaFree(d_x), "cudaFree d_x");
+    cuda_check(cudaFree(d_y), "cudaFree d_y");
+    cuda_check(cudaFree(d_z), "cudaFree d_z");
+    /* Free host memory */
+    free(h_x);
+    free(h_y);
+    free(h_z);
+
+    return 0;
+}  /* main */
-- 
cgit v1.2.3