diff options
author | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
---|---|---|
committer | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
commit | 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch) | |
tree | fc6776a94549d2d4039898f183dbbeb2ce013ba9 /VA/baselines/gpu/vec_add.cu | |
parent | ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff) |
PrIM -- first commit
Diffstat (limited to 'VA/baselines/gpu/vec_add.cu')
-rw-r--r-- | VA/baselines/gpu/vec_add.cu | 101 |
1 files changed, 101 insertions, 0 deletions
/* File:     vec_add.cu
 * Purpose:  Implement vector addition on a gpu using cuda
 *
 * Compile:  nvcc [-g] [-G] -o vec_add vec_add.cu
 * Run:      ./vec_add
 */

#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Number of threads per block used for the kernel launch. */
#define THREADS_PER_BLOCK 256

/* Abort with a readable message if a CUDA runtime call fails.
 * An unchecked failure would otherwise surface later as a confusing
 * error on an unrelated call, or as a silently wrong result. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Kernel:  Vec_add
 * Purpose: Compute z[i] = x[i] + y[i] for 0 <= i < n, one element per thread.
 *
 * Launch:  1D grid of 1D blocks. Each thread handles exactly one index, so
 *          the launch must supply at least n threads in total
 *          (gridDim.x * blockDim.x >= n). Surplus threads do nothing.
 * Args:    x, y  input vectors with at least n elements each
 *          z     output vector with at least n elements
 *          n     number of elements to add
 */
__global__ void Vec_add(unsigned int x[], unsigned int y[], unsigned int z[], int n) {
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    /* Guard the tail: the last block may extend past the end of the data. */
    if (thread_id < n) {
        z[thread_id] = x[thread_id] + y[thread_id];
    }
}


/* Host driver: builds two input vectors of n * m elements, adds them on the
 * GPU, and prints the kernel execution time measured with CUDA events.
 * Returns 0 on success and EXIT_FAILURE if the problem size is not
 * representable or a host allocation fails. Any CUDA runtime error
 * terminates the program through CUDA_CHECK. */
int main(int argc, char* argv[]) {
    unsigned int *h_x, *h_y, *h_z;
    unsigned int *d_x, *d_y, *d_z;

    /* Define vector length */
    const int n = 2621440;
    const int m = 320;

    /* Do the size arithmetic in size_t so the product cannot overflow int,
     * then make sure it still fits the kernel's int element count. */
    const size_t total = (size_t)n * (size_t)m;
    if (total > (size_t)INT_MAX) {
        fprintf(stderr, "Vector length %zu does not fit in an int\n", total);
        return EXIT_FAILURE;
    }
    const int count = (int)total;
    const size_t size = total * sizeof(unsigned int);

    // Allocate memory for the vectors on host memory.
    h_x = (unsigned int*) malloc(size);
    h_y = (unsigned int*) malloc(size);
    h_z = (unsigned int*) malloc(size);
    if (h_x == NULL || h_y == NULL || h_z == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes per vector failed\n", size);
        free(h_x);  /* free(NULL) is a no-op, so this is safe for any subset */
        free(h_y);
        free(h_z);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < count; i++) {
        h_x[i] = i + 1;
        /* n - i becomes negative once i > n; the conversion to unsigned
         * wraps modulo 2^32, which is the intended input pattern. */
        h_y[i] = n - i;
    }

    printf("Input size = %d\n", count);

    // Print original vectors.
    /*printf("h_x = ");
    for (int i = 0; i < m; i++){
        printf("%u ", h_x[i]);
    }
    printf("\n\n");
    printf("h_y = ");
    for (int i = 0; i < m; i++){
        printf("%u ", h_y[i]);
    }
    printf("\n\n");*/

    // Event creation
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    float time1 = 0;

    /* Allocate vectors in device memory */
    CUDA_CHECK(cudaMalloc(&d_x, size));
    CUDA_CHECK(cudaMalloc(&d_y, size));
    CUDA_CHECK(cudaMalloc(&d_z, size));

    /* Copy vectors from host memory to device memory */
    CUDA_CHECK(cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, h_y, size, cudaMemcpyHostToDevice));

    /* Ceiling division so a trailing partial block is still launched when
     * the element count is not a multiple of the block size. */
    const unsigned int blocks =
        (unsigned int)((total + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);

    // Start timer
    CUDA_CHECK(cudaEventRecord(start, 0));

    /* Kernel Call */
    Vec_add<<<blocks, THREADS_PER_BLOCK>>>(d_x, d_y, d_z, count);
    /* A launch returns no status of its own; configuration errors are
     * reported through cudaGetLastError(). */
    CUDA_CHECK(cudaGetLastError());

    // End timer
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time1, start, stop));

    CUDA_CHECK(cudaMemcpy(h_z, d_z, size, cudaMemcpyDeviceToHost));
    /*printf("The sum is: \n");
    for (int i = 0; i < m; i++){
        printf("%u ", h_z[i]);
    }
    printf("\n");*/

    printf("Execution time = %f ms\n", time1);

    /* Release the timing events */
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    /* Free device memory */
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    CUDA_CHECK(cudaFree(d_z));
    /* Free host memory */
    free(h_x);
    free(h_y);
    free(h_z);

    return 0;
}  /* main */