1 files changed, 242 insertions, 0 deletions
diff --git a/SEL/baselines/gpu/select.cu b/SEL/baselines/gpu/select.cu
new file mode 100644
index 0000000..b3c60d6
--- /dev/null
+++ b/SEL/baselines/gpu/select.cu
@@ -0,0 +1,242 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2015 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+  In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15
+
+  Copyright (c) 2015 University of Illinois at Urbana-Champaign. 
+  All rights reserved.
+
+  Permission to use, copy, modify and distribute this software and its documentation for 
+  educational purpose is hereby granted without fee, provided that the above copyright 
+  notice and this permission notice appear in all copies of this software and that you do 
+  not sell the software.
+
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR 
+  OTHERWISE.
+
+  Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+#include "ds.h"
+
+// Sample predicate for partition (only for INT)
+struct is_even{
+  __host__ __device__
+  bool operator()(const T &x){
+    return (x % 2) == 0;
+  }
+};
+
+#include "kernel.cu"
+
+// Sequential CPU version
+void cpu_copy_if(T* output, T* input, int elements, struct is_even pred){
+  int pos = 0;
+  for (int i = 0; i < elements; i++){
+    if(pred(input[i])){
+	  output[pos] = input[i];
+      pos++;
+    }
+  }
+}
+void cpu_remove_if(T* input, int elements, struct is_even pred){
+  int pos = 0;
+  for (int i = 0; i < elements; i++){
+    if(!pred(input[i])){
+	  input[pos] = input[i];
+      pos++;
+    }
+  }
+}
+
+int main(int argc, char **argv){
+
+  // Syntax verification
+  if (argc != 4) {
+      printf("Wrong format\n");
+      printf("Syntax: %s <Device Input (%% elements) numElements>\n",argv[0]);
+      exit(1);
+  }
+  int device = atoi(argv[1]);
+  int input = atoi(argv[2]);
+  int numElements = atoi(argv[3]);
+  size_t size = numElements * sizeof(T);
+
+  // Set device
+  cudaDeviceProp device_properties;
+  cudaGetDeviceProperties(&device_properties,device);
+  cudaSetDevice(device);
+
+  printf("DS Select on %s\n", device_properties.name);
+  printf("Thread block size = %d\n", L_DIM);
+  printf("Coarsening factor = %d\n", REGS);
+#ifdef FLOAT
+  printf("Single precision array: %d elements\n", numElements);
+#elif INT
+  printf("Integer array: %d elements\n", numElements);
+#else
+  printf("Double precision array: %d elements\n", numElements);
+#endif
+
+  // Event creation
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  float time1 = 0;
+  float time2 = 0;
+  float time3 = 0;
+
+  // Allocate the host input vector A
+  T *h_A = (T*)malloc(size);
+
+  // Allocate the host output vectors
+  T *h_B = (T*)malloc(size);
+  T *h_C = (T*)malloc(size);
+  T *h_D = (T*)malloc(size);
+
+  // Allocate the device input vector A and output vector B
+  T *d_A = NULL;
+  cudaMalloc((void **)&d_A, size);
+  T *d_B = NULL;
+  cudaMalloc((void **)&d_B, size);
+
+#define WARMUP 2
+#define REP 10
+  unsigned int flagM1 = 0;
+  unsigned int flagM2 = 0;
+  for(int iteration = 0; iteration < REP+WARMUP; iteration++){
+    // Initialize the host input vectors
+    srand(2014);
+    for(int i = 0; i < numElements; i++)
+        h_A[i] = i % 2 != 0 ? i:i+1;
+    int M = (numElements * input)/100;
+    int m = M;
+    while(m>0){
+        int x = (int)(numElements*(((float)rand()/(float)RAND_MAX)));
+        if(h_A[x] % 2 != 0){
+            h_A[x] = x * 2;
+            m--;
+        }
+    }
+
+#if PRINT
+    for(int i = 0; i < numElements; ++i){
+        printf("%d ",*(h_A+i));
+    }
+    printf("\n");
+#endif
+
+    // Copy the host input vector A in host memory to the device input vector in device memory
+    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
+
+    int ldim = L_DIM;
+    // Atomic flags
+    unsigned int* d_flags = NULL;
+    const int num_flags = numElements % (ldim * REGS) == 0 ? numElements / (ldim * REGS) : numElements / (ldim * REGS) + 1;
+    unsigned int *flags = (unsigned int *)calloc(sizeof(unsigned int), num_flags + 2);
+    flags[0] = 1;
+    flags[num_flags + 1] = 0;
+    cudaMalloc((void **)&d_flags, (num_flags + 2) * sizeof(unsigned int));
+    cudaMemcpy(d_flags, flags, (num_flags + 2) * sizeof(unsigned int), cudaMemcpyHostToDevice);
+    // Number of work-groups/thread blocks
+    int num_wg = num_flags;
+
+    // Start timer
+    cudaEventRecord( start, 0 );
+
+    // Kernel launch (Copy_if)
+    select_copy_if<<<num_wg, ldim>>>(d_B, d_A, numElements, d_flags, is_even());
+
+    cudaMemcpy(&flagM1, d_flags + num_flags, sizeof(unsigned int), cudaMemcpyDeviceToHost);
+
+    // Stop timer
+    cudaEventRecord( stop, 0 );
+    cudaEventSynchronize( stop );
+    cudaEventElapsedTime( &time1, start, stop );
+    if(iteration >= WARMUP) time2 += time1;
+
+    if(iteration == REP+WARMUP-1){
+      float timer = time2 / REP;
+      double bw = (double)((numElements + flagM1) * sizeof(T)) / (double)(timer * 1000000.0);
+      printf("Copy_if - Execution time = %f ms, Throughput = %f GB/s\n", timer, bw);
+    }
+
+    // Atomic flags
+    cudaMemcpy(d_flags, flags, (num_flags + 2) * sizeof(unsigned int), cudaMemcpyHostToDevice);
+    free(flags);
+
+    // Start timer
+    cudaEventRecord( start, 0 );
+
+    // Kernel launch (Remove_if)
+    select_remove_if<<<num_wg, ldim>>>(d_A, d_A, numElements, d_flags, is_even()); 
+
+    cudaMemcpy(&flagM2, d_flags + num_flags, sizeof(unsigned int), cudaMemcpyDeviceToHost);
+
+    // End timer
+    cudaEventRecord( stop, 0 );
+    cudaEventSynchronize( stop );
+    cudaEventElapsedTime( &time1, start, stop );
+    if(iteration >= WARMUP) time3 += time1;
+
+    if(iteration == REP+WARMUP-1){
+      float timer = time3 / REP;
+      double bw = (double)((numElements + flagM2) * sizeof(T)) / (double)(timer * 1000000.0);
+      printf("Remove_if - Execution time = %f ms, Throughput = %f GB/s\n", timer, bw);
+    }
+
+    // Free flags
+    cudaFree(d_flags);
+  }
+  // Copy to host memory
+  cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_C, d_A, size, cudaMemcpyDeviceToHost);
+
+  // CPU execution for comparison
+  cpu_copy_if(h_D, h_A, numElements, is_even());
+  cpu_remove_if(h_A, numElements, is_even());
+
+  // Verify that the result vector is correct
+#if PRINT
+  for(int i = 0; i < numElements; ++i){
+     printf("%d ",*(h_B+i));
+  }
+  printf("\n");
+  for(int i = 0; i < numElements; ++i){
+      printf("%d ",*(h_D+i));
+  }
+  printf("\n");
+#endif
+  for (int i = 0; i < flagM1 - 1; ++i){
+      if (h_B[i] != h_D[i]){
+          fprintf(stderr, "Copy_if - Result verification failed at element %d!\n", i);
+          exit(EXIT_FAILURE);
+      }
+  }
+  for (int i = 0; i < flagM2 - 1; ++i){
+      if (h_C[i] != h_A[i]){
+          fprintf(stderr, "Remove_if - Result verification failed at element %d!\n", i);
+          exit(EXIT_FAILURE);
+      }
+  }
+  printf("Test PASSED\n");
+
+  // Free device global memory
+  cudaFree(d_A);
+  cudaFree(d_B);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  // Free host memory
+  free(h_A);
+  free(h_B);
+  free(h_C);
+  free(h_D);
+
+  return 0;
+}