diff options
Diffstat (limited to 'UNI/baselines/gpu/unique.cu')
-rw-r--r-- | UNI/baselines/gpu/unique.cu | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/UNI/baselines/gpu/unique.cu b/UNI/baselines/gpu/unique.cu new file mode 100644 index 0000000..12d21fc --- /dev/null +++ b/UNI/baselines/gpu/unique.cu @@ -0,0 +1,197 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2015 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ +/* + In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15 + + Copyright (c) 2015 University of Illinois at Urbana-Champaign. + All rights reserved. + + Permission to use, copy, modify and distribute this software and its documentation for + educational purpose is hereby granted without fee, provided that the above copyright + notice and this permission notice appear in all copies of this software and that you do + not sell the software. + + THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR + OTHERWISE. + + Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu) +*/ + +#include "ds.h" +#include "kernel.cu" + +// Sequential CPU version +void cpu_unique(T* output, T* input, int elements){ + int j = 0; + output[j] = input[j]; + j++; + for (int i = 1; i < elements; i++){ + if (input[i] != input[i-1]){ + output[j] = input[i]; + j++; + } + } +} + +int main(int argc, char **argv){ + + // Syntax verification + if (argc != 4) { + printf("Wrong format\n"); + printf("Syntax: %s <Device Input (%% elements) numElements>\n",argv[0]); + exit(1); + } + int device = atoi(argv[1]); + int input = atoi(argv[2]); + int numElements = atoi(argv[3]); + size_t size = numElements * sizeof(T); + + // Set device + cudaDeviceProp device_properties; + cudaGetDeviceProperties(&device_properties,device); + cudaSetDevice(device); + + printf("DS Unique on %s\n", device_properties.name); + printf("Thread block size = %d\n", L_DIM); + printf("Coarsening factor = %d\n", REGS); +#ifdef FLOAT + printf("Single precision array: %d elements\n", numElements); +#elif INT + printf("Integer array: %d elements\n", numElements); +#else + printf("Double precision array: %d elements\n", numElements); +#endif + + // Event creation + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + float time1 = 0; + float time2 = 0; + + // Allocate the host input vector A + T *h_A = (T*)malloc(size); + + // Allocate the host output vectors + T *h_B = (T*)malloc(size); + T *h_C = (T*)malloc(size); + + // Allocate the device input vector A + T *d_A = NULL; + cudaMalloc((void **)&d_A, size); + +#define WARMUP 0 +#define REP 1 + int value1 = 0; + int value2 = 1; + int value3 = 2; + int value4 = 3; + unsigned int flagM = 0; + for(int iteration = 0; iteration < REP+WARMUP; iteration++){ + // Initialize the host input vectors + srand(2014); + for(int i = 0; i < numElements; i++){ + h_A[i] = value1; + if(i >= numElements/4 && i < numElements/2) h_A[i] = value2; + if(i >= numElements/2 && i < 3*numElements/4) h_A[i] = value3; + if(i >= 3*numElements/4 && i < numElements) h_A[i] = value4; + } + int M = (numElements * input)/100; + int m = M; + while(m>0){ + int x = (int)(numElements*(((float)rand()/(float)RAND_MAX))); + if(h_A[x]==value1 || h_A[x]==value2 || h_A[x]==value3 || h_A[x]==value4){ + h_A[x] = x+2; + m--; + } + } + +#if PRINT + printf("\n"); + for(int i = 0; i < numElements; ++i){ + printf("%d ",*(h_A+i)); + } + printf("\n"); +#endif + + // Copy the host input vector A in host memory to the device input vector in device memory + cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); + + int ldim = L_DIM; + // Atomic flags + unsigned int* d_flags = NULL; + int num_flags = numElements % (ldim * REGS) == 0 ? numElements / (ldim * REGS) : numElements / (ldim * REGS) + 1; + unsigned int *flags = (unsigned int *)calloc(sizeof(unsigned int), num_flags + 2); + flags[0] = 1; + flags[num_flags + 1] = 0; + cudaMalloc((void **)&d_flags, (num_flags + 2) * sizeof(unsigned int)); + cudaMemcpy(d_flags, flags, (num_flags + 2) * sizeof(unsigned int), cudaMemcpyHostToDevice); + free(flags); + // Number of work-groups/thread blocks + int num_wg = num_flags; + + // Start timer + cudaEventRecord( start, 0 ); + + // Kernel launch + unique<<<num_wg, ldim>>>(d_A, d_A, numElements, d_flags); + + cudaMemcpy(&flagM, d_flags + num_flags, sizeof(unsigned int), cudaMemcpyDeviceToHost); + + // End timer + cudaEventRecord( stop, 0 ); + cudaEventSynchronize( stop ); + cudaEventElapsedTime( &time1, start, stop ); + if(iteration >= WARMUP) time2 += time1; + + if(iteration == REP+WARMUP-1){ + float timer = time2 / REP; + double bw = (double)((numElements + flagM) * sizeof(T)) / (double)(timer * 1000000.0); + printf("Execution time = %f ms, Throughput = %f GB/s\n", timer, bw); + } + + // Free flags + cudaFree(d_flags); + } + // Copy to host memory + cudaMemcpy(h_B, d_A, size, cudaMemcpyDeviceToHost); + + // CPU execution for comparison + cpu_unique(h_C, h_A, numElements); + + // Verify that the result vector is correct +#if PRINT + for(int i = 0; i < numElements; ++i){ + printf("%d ",*(h_B+i)); + } + printf("\n"); + for(int i = 0; i < numElements; ++i){ + printf("%d ",*(h_C+i)); + } + printf("\n"); +#endif + for (int i = 0; i < flagM - 1; ++i){ + if (h_B[i] != h_C[i]){ + fprintf(stderr, "Result verification failed at element %d!\n", i); + exit(EXIT_FAILURE); + } + } + printf("Test PASSED\n"); + + // Free device global memory + cudaFree(d_A); + cudaEventDestroy(start); + cudaEventDestroy(stop); + // Free host memory + free(h_A); + free(h_B); + free(h_C); + + return 0; +} |