Diffstat (limited to 'SEL/baselines/gpu/ds.h')
-rw-r--r--  SEL/baselines/gpu/ds.h  293
1 file changed, 293 insertions(+), 0 deletions(-)
diff --git a/SEL/baselines/gpu/ds.h b/SEL/baselines/gpu/ds.h
new file mode 100644
index 0000000..1fa73de
--- /dev/null
+++ b/SEL/baselines/gpu/ds.h
@@ -0,0 +1,293 @@
+/***************************************************************************
+ *cr
+ *cr (C) Copyright 2015 The Board of Trustees of the
+ *cr University of Illinois
+ *cr All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+ In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15
+
+ Copyright (c) 2015 University of Illinois at Urbana-Champaign.
+ All rights reserved.
+
+ Permission to use, copy, modify and distribute this software and its documentation for
+ educational purpose is hereby granted without fee, provided that the above copyright
+ notice and this permission notice appear in all copies of this software and that you do
+ not sell the software.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, EXPRESS, IMPLIED OR
+ OTHERWISE.
+
+ Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+#include <iostream>
+#include <fstream>
+#include <cstdlib>
+#include <ctime>
+#include <cstdio>
+#include <math.h>
+#include <sys/time.h>
+#include <vector>
+
+#ifdef FLOAT
+#define T float
+#elif INT
+#define T int
+#elif INT64
+#define T int64_t
+#else
+#define T double
+#endif
+
+#ifdef THREADS
+#define L_DIM THREADS
+#else
+#define L_DIM 1024
+#endif
+
+#ifdef COARSENING
+#define REGS COARSENING
+#else
+#ifdef FLOAT
+#define REGS 16
+#elif INT
+#define REGS 16
+#else
+#define REGS 8
+#endif
+#endif
+
+#ifdef ATOMIC
+#define ATOM 1
+#else
+#define ATOM 0
+#endif
+
+#define WARP_SIZE 32
+
+#define PRINT 0
+
+// Dynamic allocation of runtime workgroup id
+__device__ int dynamic_wg_id(volatile unsigned int *flags, const int num_flags){
+ __shared__ int gid_;
+ if (threadIdx.x == 0) gid_ = atomicAdd((unsigned int*)&flags[num_flags + 1], 1);
+ __syncthreads();
+ int my_s = gid_;
+ return my_s;
+}
+
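+// Illustrative addition (not part of the original DS sources): a possible
+// host-side helper that sets up the flags array consumed by dynamic_wg_id()
+// and the ds_sync*() routines below. The layout assumed here is one sync flag
+// per work-group in flags[0..num_flags], with the dynamic work-group counter
+// at flags[num_flags + 1]; flags[0] is pre-set to 1 so the first work-group is
+// never blocked (for the irregular variants this encodes an initial count of 0).
+// It assumes this header is included from a .cu file compiled with nvcc.
+static inline unsigned int* ds_alloc_flags_sketch(int num_flags){
+	unsigned int *flags;
+	size_t bytes = (num_flags + 2) * sizeof(unsigned int);
+	cudaMalloc((void**)&flags, bytes);
+	cudaMemset(flags, 0, bytes);
+	const unsigned int first = 1;	// unblock work-group 0
+	cudaMemcpy(flags, &first, sizeof(unsigned int), cudaMemcpyHostToDevice);
+	return flags;
+}
+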
+// Set global synchronization (regular DS)
+__device__ void ds_sync(volatile unsigned int *flags, const int my_s){
+#if ATOM
+ if (threadIdx.x == 0){
+ while (atomicOr((unsigned int*)&flags[my_s], 0) == 0){}
+ atomicOr((unsigned int*)&flags[my_s + 1], 1);
+ }
+#else
+ if (threadIdx.x == 0){
+ while (flags[my_s] == 0){}
+ flags[my_s + 1] = 1;
+ }
+#endif
+ __syncthreads();
+}
+
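+// Illustrative addition (not part of the original DS sources): a minimal
+// sketch of how dynamic_wg_id() and ds_sync() typically compose in a regular
+// DS kernel, here an in-place left shift by 'shift' elements. The kernel name
+// and shift semantics are assumptions for this example, not the authors' code.
+// Each work-group first loads its tile into registers, then waits for its
+// predecessor before overwriting memory that earlier work-groups read from.
+__global__ void ds_shift_left_sketch(T *data, int n, int shift,
+                                     volatile unsigned int *flags, int num_flags){
+	// Work-groups take consecutive logical ids so tiles are processed in order.
+	const int my_s = dynamic_wg_id(flags, num_flags);
+	const int tile = my_s * blockDim.x * REGS;
+
+	// Load up to REGS elements per thread into registers.
+	T reg[REGS];
+	#pragma unroll
+	for (int j = 0; j < REGS; j++){
+		int idx = tile + j * blockDim.x + threadIdx.x;
+		reg[j] = (idx < n) ? data[idx] : (T)0;
+	}
+
+	// Wait until the preceding work-group has loaded its tile, then unblock the next one.
+	ds_sync(flags, my_s);
+
+	// Slide the tile 'shift' positions to the left (the first 'shift' elements are overwritten).
+	#pragma unroll
+	for (int j = 0; j < REGS; j++){
+		int idx = tile + j * blockDim.x + threadIdx.x;
+		if (idx < n && idx - shift >= 0) data[idx - shift] = reg[j];
+	}
+}
+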
+// Set global synchronization (irregular DS)
+__device__ void ds_sync_irregular(volatile unsigned int *flags, const int my_s, int *count){
+#if ATOM
+ if (threadIdx.x == 0){
+ while (atomicOr((unsigned int*)&flags[my_s], 0) == 0){}
+ int flag = flags[my_s];
+ atomicAdd((unsigned int*)&flags[my_s + 1], flag + *count);
+ *count = flag - 1;
+ }
+#else
+ if (threadIdx.x == 0){
+ while (flags[my_s] == 0){}
+ int flag = flags[my_s];
+ flags[my_s + 1] = flag + *count;
+ *count = flag - 1;
+ }
+#endif
+ __syncthreads();
+}
+
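+// Editorial note (illustrative): in the irregular variants, flags[s] stores
+// "number of elements produced by all work-groups before s" + 1, so that 0
+// can keep meaning "not ready". On entry *count holds this work-group's own
+// element count (e.g. computed by reduction() below and visible to thread 0);
+// on return it holds the running total of the preceding work-groups, i.e.
+// this work-group's global output offset.
+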
+// Set global synchronization (irregular DS Partition)
+__device__ void ds_sync_irregular_partition(volatile unsigned int *flags1, volatile unsigned int *flags2, const int my_s, int *count1, int *count2){
+#if ATOM
+ if (threadIdx.x == 0){
+ while (atomicOr((unsigned int*)&flags1[my_s], 0) == 0){}
+ int flag2 = flags2[my_s];
+		atomicAdd((unsigned int*)&flags2[my_s + 1], flag2 + *count2);
+		int flag1 = flags1[my_s];
+		atomicAdd((unsigned int*)&flags1[my_s + 1], flag1 + *count1);
+ *count1 = flag1 - 1;
+ *count2 = flag2 - 1;
+ }
+#else
+ if (threadIdx.x == 0){
+ while (flags1[my_s] == 0){}
+ int flag2 = flags2[my_s];
+ flags2[my_s + 1] = flag2 + *count2;
+ int flag1 = flags1[my_s];
+ flags1[my_s + 1] = flag1 + *count1;
+ *count1 = flag1 - 1;
+ *count2 = flag2 - 1;
+ }
+#endif
+ __syncthreads();
+}
+
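+// Editorial note (illustrative): the partition variant keeps two running
+// totals, one per output stream (elements that satisfy the predicate and
+// elements that do not), so on return *count1 and *count2 hold two
+// independent output offsets for this work-group.
+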
+// Block-wide reduction (based on CUDA SDK reduce6)
+template <class S>
+__device__ void reduction(S *count, S local_cnt){
+ __shared__ S sdata[L_DIM];
+
+ unsigned int tid = threadIdx.x;
+ S mySum = local_cnt;
+
+ // each thread puts its local sum into shared memory
+ sdata[tid] = local_cnt;
+ __syncthreads();
+
+ // do reduction in shared mem
+ if ((blockDim.x >= 1024) && (tid < 512)){
+ sdata[tid] = mySum = mySum + sdata[tid + 512];
+ }
+ __syncthreads();
+
+ if ((blockDim.x >= 512) && (tid < 256)){
+ sdata[tid] = mySum = mySum + sdata[tid + 256];
+ }
+ __syncthreads();
+
+ if ((blockDim.x >= 256) && (tid < 128)){
+ sdata[tid] = mySum = mySum + sdata[tid + 128];
+ }
+ __syncthreads();
+
+ if ((blockDim.x >= 128) && (tid < 64)){
+ sdata[tid] = mySum = mySum + sdata[tid + 64];
+ }
+ __syncthreads();
+
+#if (__CUDA_ARCH__ >= 300 )
+ if ( tid < 32 ){
+ // Fetch final intermediate sum from 2nd warp
+ if (blockDim.x >= 64) mySum += sdata[tid + 32];
+ // Reduce final warp using shuffle
+ #pragma unroll
+ for (int offset = WARP_SIZE/2; offset > 0; offset /= 2){
+ //mySum += __shfl_down(mySum, offset);
+ mySum += __shfl_xor(mySum, offset);
+ }
+ }
+#else
+ // fully unroll reduction within a single warp
+ if ((blockDim.x >= 64) && (tid < 32)){
+ sdata[tid] = mySum = mySum + sdata[tid + 32];
+ }
+ __syncthreads();
+
+ if ((blockDim.x >= 32) && (tid < 16)){
+ sdata[tid] = mySum = mySum + sdata[tid + 16];
+ }
+ __syncthreads();
+
+ if ((blockDim.x >= 16) && (tid < 8)){
+ sdata[tid] = mySum = mySum + sdata[tid + 8];
+ }
+ __syncthreads();
+
+ if ((blockDim.x >= 8) && (tid < 4)){
+ sdata[tid] = mySum = mySum + sdata[tid + 4];
+ }
+ __syncthreads();
+
+ if ((blockDim.x >= 4) && (tid < 2)){
+ sdata[tid] = mySum = mySum + sdata[tid + 2];
+ }
+ __syncthreads();
+
+ if ((blockDim.x >= 2) && ( tid < 1)){
+ sdata[tid] = mySum = mySum + sdata[tid + 1];
+ }
+ __syncthreads();
+#endif
+
+ // write result for this block to global mem
+ if (tid == 0) *count = mySum;
+}
+
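+// Illustrative usage (not part of the original sources): reduction() leaves
+// the block-wide sum in *count on thread 0 only, which is exactly what the
+// ds_sync_irregular() routines read. A minimal sketch, assuming local_cnt is
+// each thread's private element count and count lives in shared memory:
+//
+//	__shared__ int count;
+//	reduction<int>(&count, local_cnt);        // thread 0 now holds the block sum
+//	ds_sync_irregular(flags, my_s, &count);   // count becomes this block's offset
+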
+// Binary prefix-sum (GPU Computing Gems)
+__device__ inline int lane_id(void) { return threadIdx.x % WARP_SIZE; }
+__device__ inline int warp_id(void) { return threadIdx.x / WARP_SIZE; }
+
+__device__ unsigned int warp_prefix_sums(bool p){
+ unsigned int b = __ballot(p);
+ return __popc(b & ((1 << lane_id()) - 1));
+}
+
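+// Editorial note: __ballot(p) returns a 32-bit mask with one bit set per lane
+// whose predicate is true; masking with (1 << lane_id()) - 1 and counting the
+// remaining bits with __popc() gives the number of true predicates in lower
+// lanes, i.e. the exclusive prefix sum of the 0/1 predicate within the warp.
+// For example, if lanes 0, 2 and 5 hold true, lane 5 sees __popc(0b000101) = 2.
+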
+__device__ int warp_scan(int val, volatile int *s_data){
+#if (__CUDA_ARCH__ < 300 )
+ int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
+ s_data[idx] = 0;
+ idx += WARP_SIZE;
+ int t = s_data[idx] = val;
+ s_data[idx] = t = t + s_data[idx - 1];
+ s_data[idx] = t = t + s_data[idx - 2];
+ s_data[idx] = t = t + s_data[idx - 4];
+ s_data[idx] = t = t + s_data[idx - 8];
+ s_data[idx] = t = t + s_data[idx - 16];
+ return s_data[idx - 1];
+#else
+ int x = val;
+ #pragma unroll
+ for(int offset = 1; offset < 32; offset <<= 1){
+ // From GTC: Kepler shuffle tips and tricks:
+#if 0
+ int y = __shfl_up(x, offset);
+ if(lane_id() >= offset)
+ x += y;
+#else
+ asm volatile("{"
+ " .reg .s32 r0;"
+ " .reg .pred p;"
+ " shfl.up.b32 r0|p, %0, %1, 0x0;"
+ " @p add.s32 r0, r0, %0;"
+ " mov.s32 %0, r0;"
+ "}" : "+r"(x) : "r"(offset));
+#endif
+ }
+ return x - val;
+#endif
+}
+
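+// Editorial note: both paths of warp_scan() return the exclusive prefix sum of
+// val across the calling warp. The pre-Kepler path uses 2 * WARP_SIZE ints of
+// shared scratch per warp (indices 0..63 for the single warp that calls it
+// below); the Kepler path builds an inclusive shuffle scan and subtracts val.
+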
+__device__ int block_binary_prefix_sums(int* count, int x){
+
+ __shared__ int sdata[L_DIM];
+
+ // A. Exclusive scan within each warp
+ int warpPrefix = warp_prefix_sums(x);
+
+ // B. Store in shared memory
+ if(lane_id() == WARP_SIZE - 1)
+ sdata[warp_id()] = warpPrefix + x;
+ __syncthreads();
+
+ // C. One warp scans in shared memory
+ if(threadIdx.x < WARP_SIZE)
+ sdata[threadIdx.x] = warp_scan(sdata[threadIdx.x], sdata);
+ __syncthreads();
+
+	// D. Each thread calculates its final value
+ int thread_out_element = warpPrefix + sdata[warp_id()];
+ int output = thread_out_element + *count;
+ __syncthreads();
+ if(threadIdx.x == blockDim.x - 1)
+ *count += (thread_out_element + x);
+
+ return output;
+}
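+
+// Illustrative addition (not part of the original DS sources): a minimal
+// sketch of how the pieces above typically compose into a DS Select / stream
+// compaction kernel. The kernel name, predicate and parameter list are
+// assumptions for this example; the actual SEL baseline kernels are not
+// defined in this header.
+__global__ void ds_select_sketch(T *data, int n, T threshold,
+                                 volatile unsigned int *flags, int num_flags){
+	// Work-groups take consecutive logical ids so tiles are consumed in order.
+	const int my_s = dynamic_wg_id(flags, num_flags);
+	const int tile = my_s * blockDim.x * REGS;
+
+	// Load the tile into registers and evaluate the (assumed) predicate.
+	T reg[REGS];
+	int keep[REGS];
+	int local_cnt = 0;
+	#pragma unroll
+	for (int j = 0; j < REGS; j++){
+		int idx = tile + j * blockDim.x + threadIdx.x;
+		reg[j]  = (idx < n) ? data[idx] : (T)0;
+		keep[j] = (idx < n) && (reg[j] > threshold);
+		local_cnt += keep[j];
+	}
+
+	// Count the kept elements block-wide, then trade that count for this
+	// work-group's global output offset via the irregular adjacent sync.
+	__shared__ int offset;
+	reduction<int>(&offset, local_cnt);
+	ds_sync_irregular(flags, my_s, &offset);
+
+	// Compact the kept elements in place; offset advances after each pass.
+	#pragma unroll
+	for (int j = 0; j < REGS; j++){
+		int pos = block_binary_prefix_sums(&offset, keep[j]);
+		if (keep[j]) data[pos] = reg[j];
+	}
+}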