summaryrefslogtreecommitdiff
path: root/SEL/baselines/gpu/select.cu
diff options
context:
space:
mode:
Diffstat (limited to 'SEL/baselines/gpu/select.cu')
-rw-r--r--SEL/baselines/gpu/select.cu242
1 files changed, 242 insertions, 0 deletions
diff --git a/SEL/baselines/gpu/select.cu b/SEL/baselines/gpu/select.cu
new file mode 100644
index 0000000..b3c60d6
--- /dev/null
+++ b/SEL/baselines/gpu/select.cu
@@ -0,0 +1,242 @@
+/***************************************************************************
+ *cr
+ *cr (C) Copyright 2015 The Board of Trustees of the
+ *cr University of Illinois
+ *cr All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+ In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15
+
+ Copyright (c) 2015 University of Illinois at Urbana-Champaign.
+ All rights reserved.
+
+ Permission to use, copy, modify and distribute this software and its documentation for
+ educational purpose is hereby granted without fee, provided that the above copyright
+ notice and this permission notice appear in all copies of this software and that you do
+ not sell the software.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR
+ OTHERWISE.
+
+ Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+#include "ds.h"
+
+// Sample predicate for partition (only for INT)
+struct is_even{
+ __host__ __device__
+ bool operator()(const T &x){
+ return (x % 2) == 0;
+ }
+};
+
+#include "kernel.cu"
+
+// Sequential CPU version
+void cpu_copy_if(T* output, T* input, int elements, struct is_even pred){
+ int pos = 0;
+ for (int i = 0; i < elements; i++){
+ if(pred(input[i])){
+ output[pos] = input[i];
+ pos++;
+ }
+ }
+}
+void cpu_remove_if(T* input, int elements, struct is_even pred){
+ int pos = 0;
+ for (int i = 0; i < elements; i++){
+ if(!pred(input[i])){
+ input[pos] = input[i];
+ pos++;
+ }
+ }
+}
+
+int main(int argc, char **argv){
+
+ // Syntax verification
+ if (argc != 4) {
+ printf("Wrong format\n");
+ printf("Syntax: %s <Device Input (%% elements) numElements>\n",argv[0]);
+ exit(1);
+ }
+ int device = atoi(argv[1]);
+ int input = atoi(argv[2]);
+ int numElements = atoi(argv[3]);
+ size_t size = numElements * sizeof(T);
+
+ // Set device
+ cudaDeviceProp device_properties;
+ cudaGetDeviceProperties(&device_properties,device);
+ cudaSetDevice(device);
+
+ printf("DS Select on %s\n", device_properties.name);
+ printf("Thread block size = %d\n", L_DIM);
+ printf("Coarsening factor = %d\n", REGS);
+#ifdef FLOAT
+ printf("Single precision array: %d elements\n", numElements);
+#elif INT
+ printf("Integer array: %d elements\n", numElements);
+#else
+ printf("Double precision array: %d elements\n", numElements);
+#endif
+
+ // Event creation
+ cudaEvent_t start, stop;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
+
+ float time1 = 0;
+ float time2 = 0;
+ float time3 = 0;
+
+ // Allocate the host input vector A
+ T *h_A = (T*)malloc(size);
+
+ // Allocate the host output vectors
+ T *h_B = (T*)malloc(size);
+ T *h_C = (T*)malloc(size);
+ T *h_D = (T*)malloc(size);
+
+ // Allocate the device input vector A and output vector B
+ T *d_A = NULL;
+ cudaMalloc((void **)&d_A, size);
+ T *d_B = NULL;
+ cudaMalloc((void **)&d_B, size);
+
+#define WARMUP 2
+#define REP 10
+ unsigned int flagM1 = 0;
+ unsigned int flagM2 = 0;
+ for(int iteration = 0; iteration < REP+WARMUP; iteration++){
+ // Initialize the host input vectors
+ srand(2014);
+ for(int i = 0; i < numElements; i++)
+ h_A[i] = i % 2 != 0 ? i:i+1;
+ int M = (numElements * input)/100;
+ int m = M;
+ while(m>0){
+ int x = (int)(numElements*(((float)rand()/(float)RAND_MAX)));
+ if(h_A[x] % 2 != 0){
+ h_A[x] = x * 2;
+ m--;
+ }
+ }
+
+#if PRINT
+ for(int i = 0; i < numElements; ++i){
+ printf("%d ",*(h_A+i));
+ }
+ printf("\n");
+#endif
+
+ // Copy the host input vector A in host memory to the device input vector in device memory
+ cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
+
+ int ldim = L_DIM;
+ // Atomic flags
+ unsigned int* d_flags = NULL;
+ const int num_flags = numElements % (ldim * REGS) == 0 ? numElements / (ldim * REGS) : numElements / (ldim * REGS) + 1;
+ unsigned int *flags = (unsigned int *)calloc(sizeof(unsigned int), num_flags + 2);
+ flags[0] = 1;
+ flags[num_flags + 1] = 0;
+ cudaMalloc((void **)&d_flags, (num_flags + 2) * sizeof(unsigned int));
+ cudaMemcpy(d_flags, flags, (num_flags + 2) * sizeof(unsigned int), cudaMemcpyHostToDevice);
+ // Number of work-groups/thread blocks
+ int num_wg = num_flags;
+
+ // Start timer
+ cudaEventRecord( start, 0 );
+
+ // Kernel launch (Copy_if)
+ select_copy_if<<<num_wg, ldim>>>(d_B, d_A, numElements, d_flags, is_even());
+
+ cudaMemcpy(&flagM1, d_flags + num_flags, sizeof(unsigned int), cudaMemcpyDeviceToHost);
+
+ // Stop timer
+ cudaEventRecord( stop, 0 );
+ cudaEventSynchronize( stop );
+ cudaEventElapsedTime( &time1, start, stop );
+ if(iteration >= WARMUP) time2 += time1;
+
+ if(iteration == REP+WARMUP-1){
+ float timer = time2 / REP;
+ double bw = (double)((numElements + flagM1) * sizeof(T)) / (double)(timer * 1000000.0);
+ printf("Copy_if - Execution time = %f ms, Throughput = %f GB/s\n", timer, bw);
+ }
+
+ // Atomic flags
+ cudaMemcpy(d_flags, flags, (num_flags + 2) * sizeof(unsigned int), cudaMemcpyHostToDevice);
+ free(flags);
+
+ // Start timer
+ cudaEventRecord( start, 0 );
+
+ // Kernel launch (Remove_if)
+ select_remove_if<<<num_wg, ldim>>>(d_A, d_A, numElements, d_flags, is_even());
+
+ cudaMemcpy(&flagM2, d_flags + num_flags, sizeof(unsigned int), cudaMemcpyDeviceToHost);
+
+ // End timer
+ cudaEventRecord( stop, 0 );
+ cudaEventSynchronize( stop );
+ cudaEventElapsedTime( &time1, start, stop );
+ if(iteration >= WARMUP) time3 += time1;
+
+ if(iteration == REP+WARMUP-1){
+ float timer = time3 / REP;
+ double bw = (double)((numElements + flagM2) * sizeof(T)) / (double)(timer * 1000000.0);
+ printf("Remove_if - Execution time = %f ms, Throughput = %f GB/s\n", timer, bw);
+ }
+
+ // Free flags
+ cudaFree(d_flags);
+ }
+ // Copy to host memory
+ cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost);
+ cudaMemcpy(h_C, d_A, size, cudaMemcpyDeviceToHost);
+
+ // CPU execution for comparison
+ cpu_copy_if(h_D, h_A, numElements, is_even());
+ cpu_remove_if(h_A, numElements, is_even());
+
+ // Verify that the result vector is correct
+#if PRINT
+ for(int i = 0; i < numElements; ++i){
+ printf("%d ",*(h_B+i));
+ }
+ printf("\n");
+ for(int i = 0; i < numElements; ++i){
+ printf("%d ",*(h_D+i));
+ }
+ printf("\n");
+#endif
+ for (int i = 0; i < flagM1 - 1; ++i){
+ if (h_B[i] != h_D[i]){
+ fprintf(stderr, "Copy_if - Result verification failed at element %d!\n", i);
+ exit(EXIT_FAILURE);
+ }
+ }
+ for (int i = 0; i < flagM2 - 1; ++i){
+ if (h_C[i] != h_A[i]){
+ fprintf(stderr, "Remove_if - Result verification failed at element %d!\n", i);
+ exit(EXIT_FAILURE);
+ }
+ }
+ printf("Test PASSED\n");
+
+ // Free device global memory
+ cudaFree(d_A);
+ cudaFree(d_B);
+ cudaEventDestroy(start);
+ cudaEventDestroy(stop);
+ // Free host memory
+ free(h_A);
+ free(h_B);
+ free(h_C);
+ free(h_D);
+
+ return 0;
+}