summaryrefslogtreecommitdiff
path: root/TRNS/baselines/gpu/kernel.cu
diff options
context:
space:
mode:
authorJuan Gomez Luna <juan.gomez@safari.ethz.ch>2021-06-16 19:46:05 +0200
committerJuan Gomez Luna <juan.gomez@safari.ethz.ch>2021-06-16 19:46:05 +0200
commit3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
treefc6776a94549d2d4039898f183dbbeb2ce013ba9 /TRNS/baselines/gpu/kernel.cu
parentef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)
PrIM -- first commit
Diffstat (limited to 'TRNS/baselines/gpu/kernel.cu')
-rw-r--r--TRNS/baselines/gpu/kernel.cu170
1 files changed, 170 insertions, 0 deletions
diff --git a/TRNS/baselines/gpu/kernel.cu b/TRNS/baselines/gpu/kernel.cu
new file mode 100644
index 0000000..f2251cd
--- /dev/null
+++ b/TRNS/baselines/gpu/kernel.cu
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include "support/common.h"
+
+extern __shared__ int l_mem[];
+
+// GPU kernel ------------------------------------------------------------------------------------------
+__global__ void PTTWAC_soa_asta(int A, int B, int b, T *input, int *finished, int *head) {
+
+ int* done = l_mem;
+ int* gid_ = &done[1];
+
+ const int tid = threadIdx.x;
+ int m = A * B - 1;
+
+ if(tid == 0) // Dynamic fetch
+ gid_[0] = atomicAdd(&head[0], 1);
+ __syncthreads();
+
+ while(gid_[0] < m) {
+ int next_in_cycle = (gid_[0] * A) - m * (gid_[0] / B);
+ if(next_in_cycle == gid_[0]) {
+ if(tid == 0) // Dynamic fetch
+ gid_[0] = atomicAdd(&head[0], 1);
+ __syncthreads();
+ continue;
+ }
+ T data1, data2, data3, data4;
+ int i = tid;
+ if(i < b)
+ data1 = input[gid_[0] * b + i];
+ i += blockDim.x;
+ if(i < b)
+ data2 = input[gid_[0] * b + i];
+ i += blockDim.x;
+ if(i < b)
+ data3 = input[gid_[0] * b + i];
+ i += blockDim.x;
+ if(i < b)
+ data4 = input[gid_[0] * b + i];
+
+ if(tid == 0) {
+ //make sure the read is not cached
+ done[0] = atomicAdd(&finished[gid_[0]], 0);
+ }
+ __syncthreads();
+
+ for(; done[0] == 0; next_in_cycle = (next_in_cycle * A) - m * (next_in_cycle / B)) {
+ T backup1, backup2, backup3, backup4;
+ i = tid;
+ if(i < b)
+ backup1 = input[next_in_cycle * b + i];
+ i += blockDim.x;
+ if(i < b)
+ backup2 = input[next_in_cycle * b + i];
+ i += blockDim.x;
+ if(i < b)
+ backup3 = input[next_in_cycle * b + i];
+ i += blockDim.x;
+ if(i < b)
+ backup4 = input[next_in_cycle * b + i];
+
+ if(tid == 0) {
+ done[0] = atomicExch(&finished[next_in_cycle], (int)1);
+ }
+ __syncthreads();
+
+ if(!done[0]) {
+ i = tid;
+ if(i < b)
+ input[next_in_cycle * b + i] = data1;
+ i += blockDim.x;
+ if(i < b)
+ input[next_in_cycle * b + i] = data2;
+ i += blockDim.x;
+ if(i < b)
+ input[next_in_cycle * b + i] = data3;
+ i += blockDim.x;
+ if(i < b)
+ input[next_in_cycle * b + i] = data4;
+ }
+ i = tid;
+ if(i < b)
+ data1 = backup1;
+ i += blockDim.x;
+ if(i < b)
+ data2 = backup2;
+ i += blockDim.x;
+ if(i < b)
+ data3 = backup3;
+ i += blockDim.x;
+ if(i < b)
+ data4 = backup4;
+ }
+
+ if(tid == 0) // Dynamic fetch
+ gid_[0] = atomicAdd(&head[0], 1);
+ __syncthreads();
+ }
+}
+
+cudaError_t call_PTTWAC_soa_asta(int blocks, int threads, int A, int B, int b, T *input,
+ int *finished, int *head, int l_mem_size){
+ dim3 dimGrid(blocks);
+ dim3 dimBlock(threads);
+ PTTWAC_soa_asta<<<dimGrid, dimBlock, l_mem_size>>>(A, B, b, input,
+ finished, head);
+ cudaError_t err = cudaGetLastError();
+ return err;
+}
+
+__global__ void BS_marshal(T *input, int tile_size, int width) {
+
+ T* tile = (T*)l_mem;
+
+ int tidx = threadIdx.x;
+ int m = width*tile_size-1;
+ int bid = blockIdx.x;
+
+ input += tile_size*width*bid;
+ for (int i = tidx; i < tile_size*width; i+=blockDim.x) {
+ int next = (i * tile_size)-m*(i/width);
+ tile[next] = input[i];
+ }
+ __syncthreads();
+ for (int i = tidx; i < tile_size*width; i+=blockDim.x) {
+ input[i] = tile[i];
+ }
+}
+
+cudaError_t call_BS_marshal(int blocks, int threads, int m, int n, T *input, int l_mem_size){
+ dim3 dimGrid(blocks);
+ dim3 dimBlock(threads);
+ BS_marshal<<<dimGrid, dimBlock, l_mem_size>>>(input, m, n);
+ cudaError_t err = cudaGetLastError();
+ return err;
+}