summaryrefslogtreecommitdiff
path: root/UNI/baselines/gpu/unique.cu
diff options
context:
space:
mode:
Diffstat (limited to 'UNI/baselines/gpu/unique.cu')
-rw-r--r--UNI/baselines/gpu/unique.cu197
1 files changed, 197 insertions, 0 deletions
diff --git a/UNI/baselines/gpu/unique.cu b/UNI/baselines/gpu/unique.cu
new file mode 100644
index 0000000..12d21fc
--- /dev/null
+++ b/UNI/baselines/gpu/unique.cu
@@ -0,0 +1,197 @@
+/***************************************************************************
+ *cr
+ *cr (C) Copyright 2015 The Board of Trustees of the
+ *cr University of Illinois
+ *cr All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+ In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15
+
+ Copyright (c) 2015 University of Illinois at Urbana-Champaign.
+ All rights reserved.
+
+ Permission to use, copy, modify and distribute this software and its documentation for
+ educational purpose is hereby granted without fee, provided that the above copyright
+ notice and this permission notice appear in all copies of this software and that you do
+ not sell the software.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR
+ OTHERWISE.
+
+ Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+#include "ds.h"
+#include "kernel.cu"
+
+// Sequential CPU version
+void cpu_unique(T* output, T* input, int elements){
+ int j = 0;
+ output[j] = input[j];
+ j++;
+ for (int i = 1; i < elements; i++){
+ if (input[i] != input[i-1]){
+ output[j] = input[i];
+ j++;
+ }
+ }
+}
+
+int main(int argc, char **argv){
+
+ // Syntax verification
+ if (argc != 4) {
+ printf("Wrong format\n");
+ printf("Syntax: %s <Device Input (%% elements) numElements>\n",argv[0]);
+ exit(1);
+ }
+ int device = atoi(argv[1]);
+ int input = atoi(argv[2]);
+ int numElements = atoi(argv[3]);
+ size_t size = numElements * sizeof(T);
+
+ // Set device
+ cudaDeviceProp device_properties;
+ cudaGetDeviceProperties(&device_properties,device);
+ cudaSetDevice(device);
+
+ printf("DS Unique on %s\n", device_properties.name);
+ printf("Thread block size = %d\n", L_DIM);
+ printf("Coarsening factor = %d\n", REGS);
+#ifdef FLOAT
+ printf("Single precision array: %d elements\n", numElements);
+#elif INT
+ printf("Integer array: %d elements\n", numElements);
+#else
+ printf("Double precision array: %d elements\n", numElements);
+#endif
+
+ // Event creation
+ cudaEvent_t start, stop;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
+
+ float time1 = 0;
+ float time2 = 0;
+
+ // Allocate the host input vector A
+ T *h_A = (T*)malloc(size);
+
+ // Allocate the host output vectors
+ T *h_B = (T*)malloc(size);
+ T *h_C = (T*)malloc(size);
+
+ // Allocate the device input vector A
+ T *d_A = NULL;
+ cudaMalloc((void **)&d_A, size);
+
+#define WARMUP 0
+#define REP 1
+ int value1 = 0;
+ int value2 = 1;
+ int value3 = 2;
+ int value4 = 3;
+ unsigned int flagM = 0;
+ for(int iteration = 0; iteration < REP+WARMUP; iteration++){
+ // Initialize the host input vectors
+ srand(2014);
+ for(int i = 0; i < numElements; i++){
+ h_A[i] = value1;
+ if(i >= numElements/4 && i < numElements/2) h_A[i] = value2;
+ if(i >= numElements/2 && i < 3*numElements/4) h_A[i] = value3;
+ if(i >= 3*numElements/4 && i < numElements) h_A[i] = value4;
+ }
+ int M = (numElements * input)/100;
+ int m = M;
+ while(m>0){
+ int x = (int)(numElements*(((float)rand()/(float)RAND_MAX)));
+ if(h_A[x]==value1 || h_A[x]==value2 || h_A[x]==value3 || h_A[x]==value4){
+ h_A[x] = x+2;
+ m--;
+ }
+ }
+
+#if PRINT
+ printf("\n");
+ for(int i = 0; i < numElements; ++i){
+ printf("%d ",*(h_A+i));
+ }
+ printf("\n");
+#endif
+
+ // Copy the host input vector A in host memory to the device input vector in device memory
+ cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
+
+ int ldim = L_DIM;
+ // Atomic flags
+ unsigned int* d_flags = NULL;
+ int num_flags = numElements % (ldim * REGS) == 0 ? numElements / (ldim * REGS) : numElements / (ldim * REGS) + 1;
+ unsigned int *flags = (unsigned int *)calloc(sizeof(unsigned int), num_flags + 2);
+ flags[0] = 1;
+ flags[num_flags + 1] = 0;
+ cudaMalloc((void **)&d_flags, (num_flags + 2) * sizeof(unsigned int));
+ cudaMemcpy(d_flags, flags, (num_flags + 2) * sizeof(unsigned int), cudaMemcpyHostToDevice);
+ free(flags);
+ // Number of work-groups/thread blocks
+ int num_wg = num_flags;
+
+ // Start timer
+ cudaEventRecord( start, 0 );
+
+ // Kernel launch
+ unique<<<num_wg, ldim>>>(d_A, d_A, numElements, d_flags);
+
+ cudaMemcpy(&flagM, d_flags + num_flags, sizeof(unsigned int), cudaMemcpyDeviceToHost);
+
+ // End timer
+ cudaEventRecord( stop, 0 );
+ cudaEventSynchronize( stop );
+ cudaEventElapsedTime( &time1, start, stop );
+ if(iteration >= WARMUP) time2 += time1;
+
+ if(iteration == REP+WARMUP-1){
+ float timer = time2 / REP;
+ double bw = (double)((numElements + flagM) * sizeof(T)) / (double)(timer * 1000000.0);
+ printf("Execution time = %f ms, Throughput = %f GB/s\n", timer, bw);
+ }
+
+ // Free flags
+ cudaFree(d_flags);
+ }
+ // Copy to host memory
+ cudaMemcpy(h_B, d_A, size, cudaMemcpyDeviceToHost);
+
+ // CPU execution for comparison
+ cpu_unique(h_C, h_A, numElements);
+
+ // Verify that the result vector is correct
+#if PRINT
+ for(int i = 0; i < numElements; ++i){
+ printf("%d ",*(h_B+i));
+ }
+ printf("\n");
+ for(int i = 0; i < numElements; ++i){
+ printf("%d ",*(h_C+i));
+ }
+ printf("\n");
+#endif
+ for (int i = 0; i < flagM - 1; ++i){
+ if (h_B[i] != h_C[i]){
+ fprintf(stderr, "Result verification failed at element %d!\n", i);
+ exit(EXIT_FAILURE);
+ }
+ }
+ printf("Test PASSED\n");
+
+ // Free device global memory
+ cudaFree(d_A);
+ cudaEventDestroy(start);
+ cudaEventDestroy(stop);
+ // Free host memory
+ free(h_A);
+ free(h_B);
+ free(h_C);
+
+ return 0;
+}