Diffstat (limited to 'SEL/baselines/gpu/kernel.cu')
-rw-r--r-- | SEL/baselines/gpu/kernel.cu | 113
1 file changed, 113 insertions, 0 deletions
diff --git a/SEL/baselines/gpu/kernel.cu b/SEL/baselines/gpu/kernel.cu
new file mode 100644
index 0000000..607dfd1
--- /dev/null
+++ b/SEL/baselines/gpu/kernel.cu
@@ -0,0 +1,113 @@
+/***************************************************************************
+ *cr
+ *cr (C) Copyright 2015 The Board of Trustees of the
+ *cr University of Illinois
+ *cr All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+  In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP'15
+
+  Copyright (c) 2015 University of Illinois at Urbana-Champaign.
+  All rights reserved.
+
+  Permission to use, copy, modify and distribute this software and its documentation for
+  educational purpose is hereby granted without fee, provided that the above copyright
+  notice and this permission notice appear in all copies of this software and that you do
+  not sell the software.
+
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, EXPRESS, IMPLIED OR
+  OTHERWISE.
+
+  Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+__global__ void select_remove_if(T *matrix_out, T *matrix,
+                                 int size,
+                                 volatile unsigned int *flags,
+                                 struct is_even pred)
+{
+    __shared__ int count; // Number of elements this block keeps (those failing the predicate)
+    const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1;
+
+    // Dynamic allocation of runtime workgroup id
+    if (threadIdx.x == 0) count = 0;
+    const int my_s = dynamic_wg_id(flags, num_flags);
+
+    int local_cnt = 0;
+    // Per-thread on-chip storage for this block's tile
+    T reg[REGS];
+    int pos = my_s * REGS * blockDim.x + threadIdx.x;
+    // Load tile into registers; -1 marks removed or out-of-range slots (input is assumed non-negative)
+    #pragma unroll
+    for (int j = 0; j < REGS; j++){
+        if (pos < size){
+            reg[j] = matrix[pos];
+            if(!pred(reg[j]))
+                local_cnt++;
+            else
+                reg[j] = -1;
+        }
+        else
+            reg[j] = -1;
+        pos += blockDim.x;
+    }
+    reduction<int>(&count, local_cnt);
+
+    // Global synchronization: publish this block's count so later blocks know their offsets
+    ds_sync_irregular(flags, my_s, &count);
+
+    // Store surviving elements to global memory at their prefix-sum positions
+    #pragma unroll
+    for (int j = 0; j < REGS; j++){
+        pos = block_binary_prefix_sums(&count, reg[j] >= 0);
+        if (reg[j] >= 0){
+            matrix_out[pos] = reg[j];
+        }
+    }
+}
+
+__global__ void select_copy_if(T *matrix_out, T *matrix,
+                               int size,
+                               volatile unsigned int *flags,
+                               struct is_even pred)
+{
+    __shared__ int count; // Number of elements this block keeps (those satisfying the predicate)
+    const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1;
+
+    // Dynamic allocation of runtime workgroup id
+    if (threadIdx.x == 0) count = 0;
+    const int my_s = dynamic_wg_id(flags, num_flags);
+
+    int local_cnt = 0;
+    // Per-thread on-chip storage for this block's tile
+    T reg[REGS];
+    int pos = my_s * REGS * blockDim.x + threadIdx.x;
+    // Load tile into registers; -1 marks rejected or out-of-range slots (input is assumed non-negative)
+    #pragma unroll
+    for (int j = 0; j < REGS; j++){
+        if (pos < size){
+            reg[j] = matrix[pos];
+            if(pred(reg[j]))
+                local_cnt++;
+            else
+                reg[j] = -1;
+        }
+        else
+            reg[j] = -1;
+        pos += blockDim.x;
+    }
+    reduction<int>(&count, local_cnt);
+
+    // Global synchronization: publish this block's count so later blocks know their offsets
+    ds_sync_irregular(flags, my_s, &count);
+
+    // Store selected elements to global memory at their prefix-sum positions
+    #pragma unroll
+    for (int j = 0; j < REGS; j++){
+        pos = block_binary_prefix_sums(&count, reg[j] >= 0);
+        if (reg[j] >= 0){
+            matrix_out[pos] = reg[j];
+        }
+    }
+}
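
Both kernels follow the same three-phase pattern: each block claims a tile through a dynamically assigned block ID, counts the elements that survive the predicate, synchronizes with preceding blocks so per-block counts become global output offsets, and scatters survivors with a block-level binary prefix sum. The helpers used above (dynamic_wg_id, reduction, ds_sync_irregular, block_binary_prefix_sums) and the T/REGS definitions live in the repo's shared headers and are not part of this diff. The sketch below is a hypothetical, self-contained single-block version of the same count/scan/scatter idea: the is_even functor, kernel name compact_one_block, and the Hillis-Steele scan are illustrative assumptions, not the repo's actual implementation.

#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK 256

struct is_even {
    __host__ __device__ bool operator()(int x) const { return (x & 1) == 0; }
};

// Single-block stream compaction: keep elements satisfying pred,
// write them contiguously to out, and report how many were kept.
__global__ void compact_one_block(int *out, const int *in, int n,
                                  int *out_count, is_even pred)
{
    __shared__ int scan[BLOCK];
    int tid  = threadIdx.x;
    int keep = (tid < n) && pred(in[tid]);   // binary flag per thread

    // Inclusive shared-memory scan of the flags (Hillis-Steele).
    scan[tid] = keep;
    __syncthreads();
    for (int offset = 1; offset < BLOCK; offset <<= 1) {
        int v = (tid >= offset) ? scan[tid - offset] : 0;
        __syncthreads();
        scan[tid] += v;
        __syncthreads();
    }

    // Exclusive position = inclusive sum minus own flag; scatter survivors.
    if (keep) out[scan[tid] - 1] = in[tid];
    if (tid == BLOCK - 1) *out_count = scan[tid];
}

int main()
{
    const int n = BLOCK;
    int h_in[n], h_out[n], h_cnt = 0;
    for (int i = 0; i < n; i++) h_in[i] = i;

    int *d_in, *d_out, *d_cnt;
    cudaMalloc(&d_in,  n * sizeof(int));
    cudaMalloc(&d_out, n * sizeof(int));
    cudaMalloc(&d_cnt, sizeof(int));
    cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);

    compact_one_block<<<1, BLOCK>>>(d_out, d_in, n, d_cnt, is_even());

    cudaMemcpy(&h_cnt, d_cnt, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_out, d_out, h_cnt * sizeof(int), cudaMemcpyDeviceToHost);
    printf("kept %d elements; first three: %d %d %d\n",
           h_cnt, h_out[0], h_out[1], h_out[2]);

    cudaFree(d_in); cudaFree(d_out); cudaFree(d_cnt);
    return 0;
}

Compiled with nvcc and run on one block of 256 inputs, this prints "kept 128 elements; first three: 0 2 4". The kernels in the diff generalize the same idea across many blocks: ds_sync_irregular chains the per-block counts so each block's prefix-sum positions land at the correct global offset in matrix_out.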