Diffstat (limited to 'SEL/baselines/gpu/kernel.cu')
-rw-r--r--  SEL/baselines/gpu/kernel.cu  113
1 file changed, 113 insertions, 0 deletions
diff --git a/SEL/baselines/gpu/kernel.cu b/SEL/baselines/gpu/kernel.cu
new file mode 100644
index 0000000..607dfd1
--- /dev/null
+++ b/SEL/baselines/gpu/kernel.cu
@@ -0,0 +1,113 @@
+/***************************************************************************
+ *cr
+ *cr (C) Copyright 2015 The Board of Trustees of the
+ *cr University of Illinois
+ *cr All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+ In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15
+
+ Copyright (c) 2015 University of Illinois at Urbana-Champaign.
+ All rights reserved.
+
+ Permission to use, copy, modify and distribute this software and its documentation for
+ educational purpose is hereby granted without fee, provided that the above copyright
+ notice and this permission notice appear in all copies of this software and that you do
+ not sell the software.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, EXPRESS, IMPLIED OR
+ OTHERWISE.
+
+ Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+__global__ void select_remove_if(T *matrix_out, T *matrix,
+ int size,
+ volatile unsigned int *flags,
+ struct is_even pred)
+{
+  __shared__ int count; // Per-block count of elements kept (those not matching the predicate)
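+  // One tile = blockDim.x * REGS elements; num_flags = number of tiles (ceiling division)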
+ const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1;
+
+ // Dynamic allocation of runtime workgroup id
+ if (threadIdx.x == 0) count = 0;
+ const int my_s = dynamic_wg_id(flags, num_flags);
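+  // Tile indices are handed out dynamically in the order blocks start executing, so the chained inter-block synchronization below cannot deadlock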
+
+ int local_cnt = 0;
+  // Per-thread registers hold this block's tile of the input
+ T reg[REGS];
+ int pos = my_s * REGS * blockDim.x + threadIdx.x;
+  // Load the tile and evaluate the predicate; removed or out-of-range elements are marked -1 (assumes -1 is not a valid input value)
+ #pragma unroll
+ for (int j = 0; j < REGS; j++){
+ if (pos < size){
+ reg[j] = matrix[pos];
+ if(!pred(reg[j]))
+ local_cnt++;
+ else
+ reg[j] = -1;
+ }
+ else
+ reg[j] = -1;
+ pos += blockDim.x;
+ }
+ reduction<int>(&count, local_cnt);
+
+  // Chained inter-block synchronization: after this call, count holds this block's starting offset in matrix_out
+ ds_sync_irregular(flags, my_s, &count);
+
+  // Compact the kept elements into their final positions in matrix_out
+ #pragma unroll
+ for (int j = 0; j < REGS; j++){
+ pos = block_binary_prefix_sums(&count, reg[j] >= 0);
+ if (reg[j] >= 0){
+ matrix_out[pos] = reg[j];
+ }
+ }
+}
+
+__global__ void select_copy_if(T *matrix_out, T *matrix,
+ int size,
+ volatile unsigned int *flags,
+ struct is_even pred)
+{
+  __shared__ int count; // Per-block count of elements kept (those matching the predicate)
+ const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1;
+
+ // Dynamic allocation of runtime workgroup id
+ if (threadIdx.x == 0) count = 0;
+ const int my_s = dynamic_wg_id(flags, num_flags);
+
+ int local_cnt = 0;
+  // Per-thread registers hold this block's tile of the input
+ T reg[REGS];
+ int pos = my_s * REGS * blockDim.x + threadIdx.x;
+  // Load the tile and evaluate the predicate; elements not selected, or out of range, are marked -1 (assumes -1 is not a valid input value)
+ #pragma unroll
+ for (int j = 0; j < REGS; j++){
+ if (pos < size){
+ reg[j] = matrix[pos];
+ if(pred(reg[j]))
+ local_cnt++;
+ else
+ reg[j] = -1;
+ }
+ else
+ reg[j] = -1;
+ pos += blockDim.x;
+ }
+ reduction<int>(&count, local_cnt);
+
+  // Chained inter-block synchronization: after this call, count holds this block's starting offset in matrix_out
+ ds_sync_irregular(flags, my_s, &count);
+
+  // Compact the kept elements into their final positions in matrix_out
+ #pragma unroll
+ for (int j = 0; j < REGS; j++){
+ pos = block_binary_prefix_sums(&count, reg[j] >= 0);
+ if (reg[j] >= 0){
+ matrix_out[pos] = reg[j];
+ }
+ }
+}
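
Note: the kernels above rely on helpers that are not part of this change (REGS, dynamic_wg_id, reduction, ds_sync_irregular, block_binary_prefix_sums, and the is_even predicate are defined elsewhere in the baseline). As a reading aid, the following is a minimal, hypothetical sketch of a block-level binary prefix sum with the same calling convention as block_binary_prefix_sums(), together with a single-block demo. It is not the repository's implementation; the sketch function, the demo kernel, their names, and the launch configuration are illustrative assumptions.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical sketch (not the baseline's implementation): returns this thread's
// output slot for a 0/1 'bit' and advances the running per-block offset '*count'.
__device__ int block_binary_prefix_sums_sketch(int *count, int bit)
{
    extern __shared__ int scan[];            // one int per thread (dynamic shared memory)
    scan[threadIdx.x] = bit;
    __syncthreads();

    // Hillis-Steele inclusive scan over the 0/1 flags
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        int v = (threadIdx.x >= stride) ? scan[threadIdx.x - stride] : 0;
        __syncthreads();
        scan[threadIdx.x] += v;
        __syncthreads();
    }

    // Output slot = running offset + exclusive prefix of this thread's bit
    int pos = *count + scan[threadIdx.x] - bit;
    __syncthreads();

    // Last thread advances the running offset by this round's total
    if (threadIdx.x == blockDim.x - 1) *count += scan[threadIdx.x];
    __syncthreads();
    return pos;
}

// Single-block demo: keep the even values of a small array, preserving order.
__global__ void demo_keep_even(int *out, const int *in, int n)
{
    __shared__ int count;                    // running output offset for this block
    if (threadIdx.x == 0) count = 0;
    __syncthreads();

    int tid = threadIdx.x;
    int v = (tid < n) ? in[tid] : -1;
    int keep = (tid < n) && (v % 2 == 0);
    int pos = block_binary_prefix_sums_sketch(&count, keep);
    if (keep) out[pos] = v;
}

int main()
{
    const int n = 8;
    int h_in[n] = {3, 4, 7, 10, 5, 6, 9, 2};
    int h_out[n] = {0};
    int *d_in, *d_out;
    cudaMalloc(&d_in, n * sizeof(int));
    cudaMalloc(&d_out, n * sizeof(int));
    cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);

    // One block of n threads; dynamic shared memory sized for the scan buffer
    demo_keep_even<<<1, n, n * sizeof(int)>>>(d_out, d_in, n);
    cudaMemcpy(h_out, d_out, n * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 4; ++i) printf("%d ", h_out[i]);   // expected: 4 10 6 2
    printf("\n");
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

Because the offset is advanced inside the call, the same calling convention lets each of the REGS store rounds in the kernels above continue from where the previous round left off.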