1 files changed, 91 insertions, 0 deletions
diff --git a/UNI/baselines/gpu/kernel.cu b/UNI/baselines/gpu/kernel.cu
new file mode 100644
index 0000000..5cd9ea4
--- /dev/null
+++ b/UNI/baselines/gpu/kernel.cu
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2015 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+  In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15
+
+  Copyright (c) 2015 University of Illinois at Urbana-Champaign. 
+  All rights reserved.
+
+  Permission to use, copy, modify and distribute this software and its documentation for 
+  educational purpose is hereby granted without fee, provided that the above copyright 
+  notice and this permission notice appear in all copies of this software and that you do 
+  not sell the software.
+
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR 
+  OTHERWISE.
+
+  Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+__device__ T warp_up(T reg, int delta){
+  __shared__ volatile T R[L_DIM];
+
+  R[threadIdx.x] = reg;
+
+  return (lane_id() - delta >= 0 ? R[threadIdx.x - delta] : 0);
+}
+__device__ T __shuffle_up(T* matrix, int my_s, int pos, T regi, int i){
+#if (__CUDA_ARCH__ >= 300 )
+  T p = __shfl_up(regi, 1);
+#else
+  T p = warp_up(regi, 1);
+#endif
+  if(lane_id() == 0 && i > 0)
+    p = matrix[pos - 1]; 
+  if(lane_id() == 0 && threadIdx.x != 0 && i == 0)
+    p = matrix[pos - 1];
+  if(my_s > 0 && threadIdx.x == 0 && i == 0)
+    p = matrix[pos - 1];
+  if(my_s == 0 && threadIdx.x == 0 && i == 0)
+    p = -1;
+  return p;
+}
+
+__global__ void unique(T *matrix_out, T *matrix,
+    int size,
+    volatile unsigned int *flags)
+{
+  __shared__ int count; // Counter for number of non-zero elements per block
+  const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1;
+
+  // Dynamic allocation of runtime workgroup id
+  if (threadIdx.x == 0) count = 0;
+  const int my_s = dynamic_wg_id(flags, num_flags);
+
+  int local_cnt = 0;
+  // Declare on-chip memory
+  T reg[REGS];
+  int pos = my_s * REGS * blockDim.x + threadIdx.x;
+  // Load in on-chip memory
+  #pragma unroll
+  for (int j = 0; j < REGS; j++){
+    if (pos < size){
+      reg[j] = matrix[pos];
+      if(reg[j] != __shuffle_up(matrix, my_s, pos, reg[j], j))
+        local_cnt++;
+      else
+        reg[j] = -1;
+    }
+    else
+      reg[j] = -1;
+    pos += blockDim.x;
+  }
+  reduction<int>(&count, local_cnt);
+
+  // Set global synch
+  ds_sync_irregular(flags, my_s, &count);
+
+  // Store to global memory 
+  #pragma unroll
+  for (int j = 0; j < REGS; j++){
+    pos = block_binary_prefix_sums(&count, reg[j] >= 0);
+    if (reg[j] >= 0){
+      matrix_out[pos] = reg[j];
+    }
+  }
+}