/*
 * Copyright (c) 2016 University of Cordoba and University of Illinois
 * All rights reserved.
 *
 * Developed by:    IMPACT Research Group
 *                  University of Cordoba and University of Illinois
 *                  http://impact.crhc.illinois.edu/
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * with the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 *      > Redistributions of source code must retain the above copyright notice,
 *        this list of conditions and the following disclaimers.
 *      > Redistributions in binary form must reproduce the above copyright
 *        notice, this list of conditions and the following disclaimers in the
 *        documentation and/or other materials provided with the distribution.
 *      > Neither the names of IMPACT Research Group, University of Cordoba,
 *        University of Illinois nor the names of its contributors may be used
 *        to endorse or promote products derived from this Software without
 *        specific prior written permission.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
 * THE SOFTWARE.
 *
 */

#include "support/common.h"

extern __shared__ int l_mem[];

// GPU kernel ------------------------------------------------------------------------------------------
// In-place PTTWAC transposition (SoA -> ASTA): each block claims a starting tile through the global
// `head` counter, then shifts tiles of b elements along that permutation cycle, using the `finished`
// flags to detect tiles that have already been moved.
__global__ void PTTWAC_soa_asta(int A, int B, int b, T *input, int *finished, int *head) {
    int *done = l_mem;
    int *gid_ = &done[1];

    const int tid = threadIdx.x;
    int       m   = A * B - 1;

    if(tid == 0) // Dynamic fetch
        gid_[0] = atomicAdd(&head[0], 1);
    __syncthreads();

    while(gid_[0] < m) {
        int next_in_cycle = (gid_[0] * A) - m * (gid_[0] / B);
        if(next_in_cycle == gid_[0]) {
            if(tid == 0) // Dynamic fetch
                gid_[0] = atomicAdd(&head[0], 1);
            __syncthreads();
            continue;
        }

        // Each thread keeps up to four elements of the current tile in registers
        T   data1, data2, data3, data4;
        int i = tid;
        if(i < b)
            data1 = input[gid_[0] * b + i];
        i += blockDim.x;
        if(i < b)
            data2 = input[gid_[0] * b + i];
        i += blockDim.x;
        if(i < b)
            data3 = input[gid_[0] * b + i];
        i += blockDim.x;
        if(i < b)
            data4 = input[gid_[0] * b + i];

        if(tid == 0) {
            // Make sure the read is not cached
            done[0] = atomicAdd(&finished[gid_[0]], 0);
        }
        __syncthreads();

        // Follow the cycle until a tile that has already been moved is reached
        for(; done[0] == 0; next_in_cycle = (next_in_cycle * A) - m * (next_in_cycle / B)) {
            T backup1, backup2, backup3, backup4;
            i = tid;
            if(i < b)
                backup1 = input[next_in_cycle * b + i];
            i += blockDim.x;
            if(i < b)
                backup2 = input[next_in_cycle * b + i];
            i += blockDim.x;
            if(i < b)
                backup3 = input[next_in_cycle * b + i];
            i += blockDim.x;
            if(i < b)
                backup4 = input[next_in_cycle * b + i];

            if(tid == 0) {
                done[0] = atomicExch(&finished[next_in_cycle], (int)1);
            }
            __syncthreads();

            if(!done[0]) {
                i = tid;
                if(i < b)
                    input[next_in_cycle * b + i] = data1;
                i += blockDim.x;
                if(i < b)
                    input[next_in_cycle * b + i] = data2;
                i += blockDim.x;
                if(i < b)
                    input[next_in_cycle * b + i] = data3;
                i += blockDim.x;
                if(i < b)
                    input[next_in_cycle * b + i] = data4;
            }

            i = tid;
            if(i < b)
                data1 = backup1;
            i += blockDim.x;
            if(i < b)
                data2 = backup2;
            i += blockDim.x;
            if(i < b)
                data3 = backup3;
            i += blockDim.x;
            if(i < b)
                data4 = backup4;
        }

        if(tid == 0) // Dynamic fetch
            gid_[0] = atomicAdd(&head[0], 1);
        __syncthreads();
    }
}

cudaError_t call_PTTWAC_soa_asta(int blocks, int threads, int A, int B, int b, T *input,
    int *finished, int *head, int l_mem_size) {
    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);
    PTTWAC_soa_asta<<<dimGrid, dimBlock, l_mem_size>>>(A, B, b, input, finished, head);
    cudaError_t err = cudaGetLastError();
    return err;
}

// Barrier-synchronized marshaling: each block permutes one tile of tile_size*width elements through
// shared memory, then writes it back to the same location in global memory
__global__ void BS_marshal(T *input, int tile_size, int width) {
    T  *tile = (T *)l_mem;
    int tidx = threadIdx.x;
    int m    = width * tile_size - 1;
    int bid  = blockIdx.x;

    input += tile_size * width * bid;
    for(int i = tidx; i < tile_size * width; i += blockDim.x) {
        int next   = (i * tile_size) - m * (i / width);
        tile[next] = input[i];
    }
    __syncthreads();
    for(int i = tidx; i < tile_size * width; i += blockDim.x) {
        input[i] = tile[i];
    }
}

cudaError_t call_BS_marshal(int blocks, int threads, int m, int n, T *input, int l_mem_size) {
    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);
    BS_marshal<<<dimGrid, dimBlock, l_mem_size>>>(input, m, n);
    cudaError_t err = cudaGetLastError();
    return err;
}
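
// ------------------------------------------------------------------------------------------------------
// Illustrative host-side sketch (not part of the original benchmark's host code): one possible way to
// set up the work list and invoke call_PTTWAC_soa_asta for an in-place transposition of an A x B array
// of tiles of b elements each. The helper name example_transpose, the grid/block sizing, and the
// cudaMalloc/cudaMemset setup below are assumptions for illustration only; the real host program may
// configure these differently.
cudaError_t example_transpose(T *d_input, int A, int B, int b) {
    int *d_finished = NULL; // per-tile "already moved" flags, cleared to 0
    int *d_head     = NULL; // global counter used by the kernel for dynamic work fetching
    cudaError_t err;

    err = cudaMalloc((void **)&d_finished, A * B * sizeof(int));
    if(err != cudaSuccess) return err;
    err = cudaMalloc((void **)&d_head, sizeof(int));
    if(err != cudaSuccess) { cudaFree(d_finished); return err; }
    cudaMemset(d_finished, 0, A * B * sizeof(int));
    cudaMemset(d_head, 0, sizeof(int));

    // Each thread moves up to four tile elements, so blockDim.x must cover ceil(b / 4)
    const int threads    = (b + 3) / 4;
    const int blocks     = A * B;                 // assumption: one block per potential cycle start
    const int l_mem_size = 2 * (int)sizeof(int);  // shared memory for done[0] and gid_[0]

    err = call_PTTWAC_soa_asta(blocks, threads, A, B, b, d_input, d_finished, d_head, l_mem_size);
    if(err == cudaSuccess)
        err = cudaDeviceSynchronize();

    cudaFree(d_finished);
    cudaFree(d_head);
    return err;
}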