diff options
Diffstat (limited to 'VA/baselines')
-rw-r--r-- | VA/baselines/cpu/Makefile | 5 | ||||
-rw-r--r-- | VA/baselines/cpu/README | 9 | ||||
-rw-r--r-- | VA/baselines/cpu/app_baseline.c | 155 | ||||
-rw-r--r-- | VA/baselines/gpu/Makefile | 5 | ||||
-rw-r--r-- | VA/baselines/gpu/README | 9 | ||||
-rw-r--r-- | VA/baselines/gpu/vec_add.cu | 130 |
6 files changed, 313 insertions, 0 deletions
diff --git a/VA/baselines/cpu/Makefile b/VA/baselines/cpu/Makefile
new file mode 100644
index 0000000..f320d87
--- /dev/null
+++ b/VA/baselines/cpu/Makefile
@@ -0,0 +1,5 @@
+all:
+	gcc -o va -fopenmp app_baseline.c
+
+clean:
+	rm va
diff --git a/VA/baselines/cpu/README b/VA/baselines/cpu/README
new file mode 100644
index 0000000..1b979ac
--- /dev/null
+++ b/VA/baselines/cpu/README
@@ -0,0 +1,9 @@
+Vector addition (VA)
+
+Compilation instructions
+
+    make
+
+Execution instructions
+
+    ./va -t 4
diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c
new file mode 100644
--- /dev/null
+++ b/VA/baselines/cpu/app_baseline.c
@@ -0,0 +1,155 @@
+/**
+* @file app_baseline.c
+* @brief OpenMP CPU baseline for vector addition (VA).
+*
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include <omp.h>
+#include "../../support/timer.h"
+
+// Input vectors (A, B) and output vector (C). They are unsigned so that the
+// element-wise sum wraps around instead of overflowing a signed integer.
+static uint32_t *A;
+static uint32_t *B;
+static uint32_t *C;
+
+/**
+* @brief allocates the three vectors and fills A and B with pseudo-random values
+* @param nr_elements how many 32-bit elements each vector holds
+* @return the address of input vector A
+*/
+void *create_test_file(unsigned int nr_elements) {
+    srand(0);
+    printf("nr_elements\t%u\t", nr_elements);
+    A = (uint32_t*) malloc(nr_elements * sizeof(uint32_t));
+    B = (uint32_t*) malloc(nr_elements * sizeof(uint32_t));
+    C = (uint32_t*) malloc(nr_elements * sizeof(uint32_t));
+    if (A == NULL || B == NULL || C == NULL) {
+        fprintf(stderr, "\nFailed to allocate vectors of %u elements\n", nr_elements);
+        exit(EXIT_FAILURE);
+    }
+
+    for (unsigned int i = 0; i < nr_elements; i++) {
+        A[i] = (uint32_t) rand();
+        B[i] = (uint32_t) rand();
+    }
+
+    return A;
+}
+
+/**
+* @brief compute C = A + B in the host using t OpenMP threads
+* @param nr_elements number of elements in each vector
+* @param t number of OpenMP threads
+*/
+static void vector_addition_host(unsigned int nr_elements, int t) {
+    omp_set_num_threads(t);
+    #pragma omp parallel for
+    for (unsigned int i = 0; i < nr_elements; i++) {
+        C[i] = A[i] + B[i];
+    }
+}
+
+// Params ---------------------------------------------------------------------
+typedef struct Params {
+    int input_size;   // number of elements per vector (-i)
+    int n_warmup;     // untimed warmup iterations (-w)
+    int n_reps;       // timed repetitions (-e)
+    int n_threads;    // OpenMP threads (-t)
+}Params;
+
+/**
+* @brief prints the command-line help to stderr
+*/
+void usage() {
+    fprintf(stderr,
+        "\nUsage: ./program [options]"
+        "\n"
+        "\nGeneral options:"
+        "\n -h help"
+        "\n -t <T> # of threads (default=5)"
+        "\n -w <W> # of untimed warmup iterations (default=1)"
+        "\n -e <E> # of timed repetition iterations (default=3)"
+        "\n"
+        "\nBenchmark-specific options:"
+        "\n -i <I> input size (default=16M elements)"
+        "\n");
+}
+
+/**
+* @brief parses the command line into a Params struct, starting from the defaults
+* @return the parsed parameters; exits on -h, on an unknown option, or on a
+*         non-positive input size or thread count
+*/
+struct Params input_params(int argc, char **argv) {
+    struct Params p;
+    p.input_size = 16777216;
+    p.n_warmup = 1;
+    p.n_reps = 3;
+    p.n_threads = 5;
+
+    int opt;
+    while((opt = getopt(argc, argv, "hi:w:e:t:")) >= 0) {
+        switch(opt) {
+        case 'h':
+            usage();
+            exit(0);
+            break;
+        case 'i': p.input_size = atoi(optarg); break;
+        case 'w': p.n_warmup = atoi(optarg); break;
+        case 'e': p.n_reps = atoi(optarg); break;
+        case 't': p.n_threads = atoi(optarg); break;
+        default:
+            fprintf(stderr, "\nUnrecognized option!\n");
+            usage();
+            exit(EXIT_FAILURE);
+        }
+    }
+    // Checked explicitly (not with assert) so that NDEBUG builds validate too.
+    if (p.input_size <= 0 || p.n_threads <= 0) {
+        fprintf(stderr, "\nInvalid input size or # of threads!\n");
+        usage();
+        exit(EXIT_FAILURE);
+    }
+
+    return p;
+}
+
+/**
+* @brief Main of the Host Application.
+*/
+int main(int argc, char **argv) {
+
+    struct Params p = input_params(argc, argv);
+
+    const unsigned int file_size = p.input_size;
+
+    // Allocate the vectors and fill the inputs with pseudo-random data.
+    create_test_file(file_size);
+
+    // NOTE: n_warmup and n_reps are parsed but not used here; the kernel is
+    // timed exactly once.
+    Timer timer;
+    start(&timer, 0, 0);
+
+    vector_addition_host(file_size, p.n_threads);
+
+    stop(&timer, 0);
+    printf("Kernel ");
+    print(&timer, 0, 1);
+    printf("\n");
+
+    free(A);
+    free(B);
+    free(C);
+
+    return 0;
+}
diff --git a/VA/baselines/gpu/Makefile b/VA/baselines/gpu/Makefile
new file mode 100644
index 0000000..0b822b6
--- /dev/null
+++ b/VA/baselines/gpu/Makefile
@@ -0,0 +1,5 @@
+all:
+	/usr/local/cuda/bin/nvcc vec_add.cu -I/usr/local/cuda/include -lm -o va
+
+clean:
+	rm va
diff --git a/VA/baselines/gpu/README b/VA/baselines/gpu/README
new file mode 100644
index 0000000..3cfa0c6
--- /dev/null
+++ b/VA/baselines/gpu/README
@@ -0,0 +1,9 @@
+Vector addition (VA)
+
+Compilation instructions
+
+    make
+
+Execution instructions
+
+    ./va
diff --git a/VA/baselines/gpu/vec_add.cu b/VA/baselines/gpu/vec_add.cu
new file mode 100644
--- /dev/null
+++ b/VA/baselines/gpu/vec_add.cu
@@ -0,0 +1,130 @@
+/* File: vec_add.cu
+ * Purpose: Implement vector addition on a gpu using cuda
+ *
+ * Compile: nvcc [-g] [-G] -o va vec_add.cu
+ * Run: ./va
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* Abort with file/line and the CUDA error string when a runtime call fails. */
+#define CUDA_CHECK(call)                                                  \
+    do {                                                                  \
+        cudaError_t err_ = (call);                                        \
+        if (err_ != cudaSuccess) {                                        \
+            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
+                    cudaGetErrorString(err_));                            \
+            exit(EXIT_FAILURE);                                           \
+        }                                                                 \
+    } while (0)
+
+/* Threads per block used for the kernel launch. */
+#define BLOCK_SIZE 256
+
+/* Element-wise z = x + y over the first n elements.
+ * Expects a 1D grid of 1D blocks with at least n threads in total; threads
+ * whose global index is >= n do nothing. Uses no shared memory.
+ */
+__global__ void Vec_add(const unsigned int x[], const unsigned int y[], unsigned int z[], int n) {
+    /* 64-bit index so that blockIdx.x * blockDim.x cannot wrap for large grids. */
+    long long thread_id = (long long) blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_id < n){
+        z[thread_id] = x[thread_id] + y[thread_id];
+    }
+}
+
+
+int main(int argc, char* argv[]) {
+    int n, m;
+    unsigned int *h_x, *h_y, *h_z;
+    unsigned int *d_x, *d_y, *d_z;
+    size_t size;
+
+    /* Define vector length: n * m elements in total */
+    n = 2621440;
+    m = 320;
+    const int total = n * m;
+    size = (size_t) total * sizeof(unsigned int);
+
+    // Allocate memory for the vectors on host memory.
+    h_x = (unsigned int*) malloc(size);
+    h_y = (unsigned int*) malloc(size);
+    h_z = (unsigned int*) malloc(size);
+    if (h_x == NULL || h_y == NULL || h_z == NULL) {
+        fprintf(stderr, "Failed to allocate %zu bytes per host vector\n", size);
+        return EXIT_FAILURE;
+    }
+
+    for (int i = 0; i < total; i++) {
+        h_x[i] = i+1;
+        h_y[i] = n-i;
+    }
+
+    printf("Input size = %d\n", total);
+
+    // Print original vectors.
+    /*printf("h_x = ");
+    for (int i = 0; i < m; i++){
+        printf("%u ", h_x[i]);
+    }
+    printf("\n\n");
+    printf("h_y = ");
+    for (int i = 0; i < m; i++){
+        printf("%u ", h_y[i]);
+    }
+    printf("\n\n");*/
+
+    // Event creation
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+    float time1 = 0;
+
+    /* Allocate vectors in device memory */
+    CUDA_CHECK(cudaMalloc(&d_x, size));
+    CUDA_CHECK(cudaMalloc(&d_y, size));
+    CUDA_CHECK(cudaMalloc(&d_z, size));
+
+    /* Copy vectors from host memory to device memory */
+    CUDA_CHECK(cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_y, h_y, size, cudaMemcpyHostToDevice));
+
+    // Start timer
+    CUDA_CHECK(cudaEventRecord( start, 0 ));
+
+    /* Kernel Call: the block count is rounded up so that a vector length that
+     * is not a multiple of BLOCK_SIZE is still fully covered */
+    int blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    Vec_add<<<blocks, BLOCK_SIZE>>>(d_x, d_y, d_z, total);
+    CUDA_CHECK(cudaGetLastError());
+
+    // End timer
+    CUDA_CHECK(cudaEventRecord( stop, 0 ));
+    CUDA_CHECK(cudaEventSynchronize( stop ));
+    CUDA_CHECK(cudaEventElapsedTime( &time1, start, stop ));
+
+    CUDA_CHECK(cudaMemcpy(h_z, d_z, size, cudaMemcpyDeviceToHost));
+    /*printf("The sum is: \n");
+    for (int i = 0; i < m; i++){
+        printf("%u ", h_z[i]);
+    }
+    printf("\n");*/
+
+    printf("Execution time = %f ms\n", time1);
+
+    /* Free events and device memory */
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+    CUDA_CHECK(cudaFree(d_x));
+    CUDA_CHECK(cudaFree(d_y));
+    CUDA_CHECK(cudaFree(d_z));
+    /* Free host memory */
+    free(h_x);
+    free(h_y);
+    free(h_z);
+
+    return 0;
+} /* main */