Diffstat (limited to 'TRNS/baselines/gpu/kernel.cu')
 TRNS/baselines/gpu/kernel.cu | 170 +++++++++++++++++++++++++++++++++++
 1 file changed, 170 insertions(+)
diff --git a/TRNS/baselines/gpu/kernel.cu b/TRNS/baselines/gpu/kernel.cu
new file mode 100644
index 0000000..f2251cd
--- /dev/null
+++ b/TRNS/baselines/gpu/kernel.cu
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by:    IMPACT Research Group
+ *                  University of Cordoba and University of Illinois
+ *                  http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ *      > Redistributions of source code must retain the above copyright notice,
+ *        this list of conditions and the following disclaimers.
+ *      > Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimers in the
+ *        documentation and/or other materials provided with the distribution.
+ *      > Neither the names of IMPACT Research Group, University of Cordoba,
+ *        University of Illinois nor the names of its contributors may be used
+ *        to endorse or promote products derived from this Software without
+ *        specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include "support/common.h"
+
+extern __shared__ int l_mem[];
+
+// GPU kernel ------------------------------------------------------------------------------------------
+__global__ void PTTWAC_soa_asta(int A, int B, int b, T *input, int *finished, int *head) {
+
+    int* done = l_mem;
+    int* gid_ = &done[1];
+
+    const int tid = threadIdx.x;
+    int m = A * B - 1;
+
+    if(tid == 0) // Dynamic fetch
+        gid_[0] = atomicAdd(&head[0], 1);
+    __syncthreads();
+
+    while(gid_[0] < m) {
+        int next_in_cycle = (gid_[0] * A) - m * (gid_[0] / B);
+        if(next_in_cycle == gid_[0]) {
+            if(tid == 0) // Dynamic fetch
+                gid_[0] = atomicAdd(&head[0], 1);
+            __syncthreads();
+            continue;
+        }
+        T data1, data2, data3, data4;
+        int i = tid;
+        if(i < b)
+            data1 = input[gid_[0] * b + i];
+        i += blockDim.x;
+        if(i < b)
+            data2 = input[gid_[0] * b + i];
+        i += blockDim.x;
+        if(i < b)
+            data3 = input[gid_[0] * b + i];
+        i += blockDim.x;
+        if(i < b)
+            data4 = input[gid_[0] * b + i];
+
+        if(tid == 0) {
+            //make sure the read is not cached
+            done[0] = atomicAdd(&finished[gid_[0]], 0);
+        }
+        __syncthreads();
+
+        for(; done[0] == 0; next_in_cycle = (next_in_cycle * A) - m * (next_in_cycle / B)) {
+            T backup1, backup2, backup3, backup4;
+            i = tid;
+            if(i < b)
+                backup1 = input[next_in_cycle * b + i];
+            i += blockDim.x;
+            if(i < b)
+                backup2 = input[next_in_cycle * b + i];
+            i += blockDim.x;
+            if(i < b)
+                backup3 = input[next_in_cycle * b + i];
+            i += blockDim.x;
+            if(i < b)
+                backup4 = input[next_in_cycle * b + i];
+
+            if(tid == 0) {
+                done[0] = atomicExch(&finished[next_in_cycle], (int)1);
+            }
+            __syncthreads();
+
+            if(!done[0]) {
+                i = tid;
+                if(i < b)
+                    input[next_in_cycle * b + i] = data1;
+                i += blockDim.x;
+                if(i < b)
+                    input[next_in_cycle * b + i] = data2;
+                i += blockDim.x;
+                if(i < b)
+                    input[next_in_cycle * b + i] = data3;
+                i += blockDim.x;
+                if(i < b)
+                    input[next_in_cycle * b + i] = data4;
+            }
+            i = tid;
+            if(i < b)
+                data1 = backup1;
+            i += blockDim.x;
+            if(i < b)
+                data2 = backup2;
+            i += blockDim.x;
+            if(i < b)
+                data3 = backup3;
+            i += blockDim.x;
+            if(i < b)
+                data4 = backup4;
+        }
+
+        if(tid == 0) // Dynamic fetch
+            gid_[0] = atomicAdd(&head[0], 1);
+        __syncthreads();
+    }
+}
+
+cudaError_t call_PTTWAC_soa_asta(int blocks, int threads, int A, int B, int b, T *input,
+    int *finished, int *head, int l_mem_size){
+    dim3 dimGrid(blocks);
+    dim3 dimBlock(threads);
+    PTTWAC_soa_asta<<<dimGrid, dimBlock, l_mem_size>>>(A, B, b, input,
+        finished, head);
+    cudaError_t err = cudaGetLastError();
+    return err;
+}
+
+__global__ void BS_marshal(T *input, int tile_size, int width) {
+
+    T* tile = (T*)l_mem;
+
+    int tidx = threadIdx.x;
+    int m = width*tile_size-1;
+    int bid = blockIdx.x;
+
+    input += tile_size*width*bid;
+    for (int i = tidx; i < tile_size*width; i+=blockDim.x) {
+        int next = (i * tile_size)-m*(i/width);
+        tile[next] = input[i];
+    }
+    __syncthreads();
+    for (int i = tidx; i < tile_size*width; i+=blockDim.x) {
+        input[i] = tile[i];
+    }
+}
+
+cudaError_t call_BS_marshal(int blocks, int threads, int m, int n, T *input, int l_mem_size){
+    dim3 dimGrid(blocks);
+    dim3 dimBlock(threads);
+    BS_marshal<<<dimGrid, dimBlock, l_mem_size>>>(input, m, n);
+    cudaError_t err = cudaGetLastError();
+    return err;
+}
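
Both kernels in this patch use the same cycle-following index mapping: with m = A*B - 1, the element at linear tile position i moves to next = (i * A) - m * (i / B), which for 0 < i < m equals (i * A) mod m, the classic permutation for in-place transposition of an A x B matrix (positions 0 and m are fixed points). The sketch below is not part of the patch; it is a minimal CPU illustration of that index math, and the shape A = 3, B = 4, the initial values, and all names in it are assumptions chosen for the example.

// Hypothetical reference (not in the commit): transpose an A x B array in
// place by following the cycles of next = (i*A) mod (A*B - 1), the same
// mapping PTTWAC_soa_asta and BS_marshal compute per tile.
#include <cstdio>
#include <vector>

int main() {
    const int A = 3, B = 4;                  // assumed matrix shape
    const int m = A * B - 1;
    std::vector<int> v(A * B);
    for (int i = 0; i < A * B; ++i) v[i] = i;  // v[r*B + c] = r*B + c
    std::vector<bool> moved(A * B, false);
    moved[0] = moved[m] = true;              // 0 and m map to themselves
    for (int start = 1; start < m; ++start) {
        if (moved[start]) continue;
        int cur = start;
        int carry = v[start];                // value travelling along the cycle
        do {
            int next = (cur * A) - m * (cur / B);  // == (cur*A) mod m
            std::swap(carry, v[next]);       // drop the value at its destination
            moved[next] = true;
            cur = next;
        } while (cur != start);
    }
    // v now holds the B x A transpose: v[c*A + r] == r*B + c
    for (int r = 0; r < B; ++r) {
        for (int c = 0; c < A; ++c) printf("%3d", v[r * A + c]);
        printf("\n");
    }
    return 0;
}

The kernel parallelizes exactly this walk: each block claims a cycle element via the atomicAdd on head, carries up to 4*blockDim.x tile elements in registers (data1..data4), and uses the atomicExch on finished to ensure only one block writes each tile.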
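
The commit ships no host driver, so the following is a usage sketch only, not the benchmark's actual host code. It assumes support/common.h typedefs T (float is assumed here), that `finished` holds one zero-initialised flag per tile and `head` is the zero-initialised dynamic-fetch counter, that threads >= ceil(b/4) so four strided passes cover each b-element tile, and that l_mem_size is 2*sizeof(int) for PTTWAC_soa_asta (the `done` and `gid_` words overlaid on l_mem); for BS_marshal it would be tile_size*width*sizeof(T) instead. The grid and buffer sizes below are arbitrary assumptions.

// Hypothetical driver, assuming it is compiled together with kernel.cu.
#include <cstdio>
#include <cuda_runtime.h>
#include "support/common.h"   // assumed to define T

cudaError_t call_PTTWAC_soa_asta(int blocks, int threads, int A, int B, int b,
    T *input, int *finished, int *head, int l_mem_size);

int main() {
    const int A = 64, B = 128, b = 32;       // assumed tile grid and tile size
    const int tiles = A * B;
    T *d_input; int *d_finished, *d_head;
    cudaMalloc(&d_input, sizeof(T) * (size_t)tiles * b);
    cudaMalloc(&d_finished, sizeof(int) * tiles);
    cudaMalloc(&d_head, sizeof(int));
    cudaMemset(d_finished, 0, sizeof(int) * tiles);  // no tile moved yet
    cudaMemset(d_head, 0, sizeof(int));              // dynamic fetch starts at 0
    // ... copy the A*B tiles of b elements into d_input here ...
    cudaError_t err = call_PTTWAC_soa_asta(/*blocks=*/64, /*threads=*/(b + 3) / 4,
                                           A, B, b, d_input, d_finished, d_head,
                                           /*l_mem_size=*/2 * sizeof(int));
    if (err != cudaSuccess) printf("launch failed: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();
    cudaFree(d_input); cudaFree(d_finished); cudaFree(d_head);
    return 0;
}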