1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
/***************************************************************************
*cr
*cr (C) Copyright 2015 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15
Copyright (c) 2015 University of Illinois at Urbana-Champaign.
All rights reserved.
Permission to use, copy, modify and distribute this software and its documentation for
educational purpose is hereby granted without fee, provided that the above copyright
notice and this permission notice appear in all copies of this software and that you do
not sell the software.
THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR
OTHERWISE.
Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
*/
__device__ T warp_up(T reg, int delta){
__shared__ volatile T R[L_DIM];
R[threadIdx.x] = reg;
return (lane_id() - delta >= 0 ? R[threadIdx.x - delta] : 0);
}
__device__ T __shuffle_up(T* matrix, int my_s, int pos, T regi, int i){
#if (__CUDA_ARCH__ >= 300 )
T p = __shfl_up(regi, 1);
#else
T p = warp_up(regi, 1);
#endif
if(lane_id() == 0 && i > 0)
p = matrix[pos - 1];
if(lane_id() == 0 && threadIdx.x != 0 && i == 0)
p = matrix[pos - 1];
if(my_s > 0 && threadIdx.x == 0 && i == 0)
p = matrix[pos - 1];
if(my_s == 0 && threadIdx.x == 0 && i == 0)
p = -1;
return p;
}
__global__ void unique(T *matrix_out, T *matrix,
int size,
volatile unsigned int *flags)
{
__shared__ int count; // Counter for number of non-zero elements per block
const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1;
// Dynamic allocation of runtime workgroup id
if (threadIdx.x == 0) count = 0;
const int my_s = dynamic_wg_id(flags, num_flags);
int local_cnt = 0;
// Declare on-chip memory
T reg[REGS];
int pos = my_s * REGS * blockDim.x + threadIdx.x;
// Load in on-chip memory
#pragma unroll
for (int j = 0; j < REGS; j++){
if (pos < size){
reg[j] = matrix[pos];
if(reg[j] != __shuffle_up(matrix, my_s, pos, reg[j], j))
local_cnt++;
else
reg[j] = -1;
}
else
reg[j] = -1;
pos += blockDim.x;
}
reduction<int>(&count, local_cnt);
// Set global synch
ds_sync_irregular(flags, my_s, &count);
// Store to global memory
#pragma unroll
for (int j = 0; j < REGS; j++){
pos = block_binary_prefix_sums(&count, reg[j] >= 0);
if (reg[j] >= 0){
matrix_out[pos] = reg[j];
}
}
}
|