/***************************************************************************
*cr
*cr (C) Copyright 2015 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15
Copyright (c) 2015 University of Illinois at Urbana-Champaign.
All rights reserved.
Permission to use, copy, modify and distribute this software and its documentation for
educational purpose is hereby granted without fee, provided that the above copyright
notice and this permission notice appear in all copies of this software and that you do
not sell the software.
THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, EXPRESS, IMPLIED OR
OTHERWISE.
Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
*/
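/*
  These kernels depend on definitions that live elsewhere in the codebase:
  the element type T, the per-thread tile width REGS, the is_even predicate
  functor, and the device helpers dynamic_wg_id, reduction, ds_sync_irregular
  and block_binary_prefix_sums. As a rough sketch only -- the repository's
  actual definitions may differ -- the type-level pieces could look like:

      typedef int T;   // element type; must be signed, since -1 is used as a sentinel
      #define REGS 16  // elements processed per thread (assumed value)

      struct is_even {
          __host__ __device__ bool operator()(const T &x) const {
              return (x % 2) == 0;
          }
      };
*/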
// Removes every element for which pred holds, compacting the survivors
// into matrix_out in their original order.
__global__ void select_remove_if(T *matrix_out, T *matrix,
                                 int size,
                                 volatile unsigned int *flags,
                                 struct is_even pred)
{
    __shared__ int count; // Elements kept by this block; later becomes the block's global output offset
    // One sync flag per tile of blockDim.x * REGS elements (rounded up)
    const int num_flags = size % (blockDim.x * REGS) == 0 ?
                          size / (blockDim.x * REGS) :
                          size / (blockDim.x * REGS) + 1;
    if (threadIdx.x == 0) count = 0;
    // Dynamic allocation of runtime workgroup id
    const int my_s = dynamic_wg_id(flags, num_flags);
    int local_cnt = 0;
    // Per-thread tile held in registers (on-chip memory)
    T reg[REGS];
    int pos = my_s * REGS * blockDim.x + threadIdx.x;
    // Load the tile; removed and out-of-range slots get the sentinel -1
    // (this assumes T is signed and -1 never occurs among kept values)
    #pragma unroll
    for (int j = 0; j < REGS; j++){
        if (pos < size){
            reg[j] = matrix[pos];
            if (!pred(reg[j]))
                local_cnt++; // survives removal
            else
                reg[j] = -1;
        }
        else
            reg[j] = -1;
        pos += blockDim.x;
    }
    // Block-wide count of kept elements
    reduction<int>(&count, local_cnt);
    // Inter-block synchronization: wait on the preceding block's flag and
    // turn count into the number of elements kept by all earlier blocks
    ds_sync_irregular(flags, my_s, &count);
    // Store: an intra-block binary prefix sum over the keep flag gives each
    // survivor its final position relative to the block's offset
    #pragma unroll
    for (int j = 0; j < REGS; j++){
        pos = block_binary_prefix_sums(&count, reg[j] >= 0);
        if (reg[j] >= 0){
            matrix_out[pos] = reg[j];
        }
    }
}
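/*
  Worked example of the two-level offset computation above, assuming that
  ds_sync_irregular leaves the running total of all preceding blocks in
  count: if three blocks keep 5, 3 and 7 elements, count becomes 0, 5 and 8
  in blocks 0, 1 and 2 respectively, and block_binary_prefix_sums adds each
  kept element's intra-block rank to that base, so the survivors land
  contiguously at matrix_out[0..14].
*/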
// Copies every element for which pred holds into matrix_out in their
// original order; identical to select_remove_if except that the predicate
// selects the elements to keep rather than the ones to drop.
__global__ void select_copy_if(T *matrix_out, T *matrix,
                               int size,
                               volatile unsigned int *flags,
                               struct is_even pred)
{
    __shared__ int count; // Elements kept by this block; later becomes the block's global output offset
    // One sync flag per tile of blockDim.x * REGS elements (rounded up)
    const int num_flags = size % (blockDim.x * REGS) == 0 ?
                          size / (blockDim.x * REGS) :
                          size / (blockDim.x * REGS) + 1;
    if (threadIdx.x == 0) count = 0;
    // Dynamic allocation of runtime workgroup id
    const int my_s = dynamic_wg_id(flags, num_flags);
    int local_cnt = 0;
    // Per-thread tile held in registers (on-chip memory)
    T reg[REGS];
    int pos = my_s * REGS * blockDim.x + threadIdx.x;
    // Load the tile; rejected and out-of-range slots get the sentinel -1
    #pragma unroll
    for (int j = 0; j < REGS; j++){
        if (pos < size){
            reg[j] = matrix[pos];
            if (pred(reg[j]))
                local_cnt++; // selected for copying
            else
                reg[j] = -1;
        }
        else
            reg[j] = -1;
        pos += blockDim.x;
    }
    // Block-wide count of kept elements
    reduction<int>(&count, local_cnt);
    // Inter-block synchronization: count becomes the block's global output offset
    ds_sync_irregular(flags, my_s, &count);
    // Store the selected elements via intra-block binary prefix sums
    #pragma unroll
    for (int j = 0; j < REGS; j++){
        pos = block_binary_prefix_sums(&count, reg[j] >= 0);
        if (reg[j] >= 0){
            matrix_out[pos] = reg[j];
        }
    }
}
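// A hypothetical host-side driver (not part of the original code) sketching
// how select_copy_if might be launched. The sizing and seeding of the flags
// array are assumptions: the ds_sync scheme needs block 0's predecessor flag
// pre-set so the chain can start, and the helpers may reserve extra slots
// (e.g., for the dynamic block-id counter); the exact protocol is defined by
// the ds_sync/dynamic_wg_id implementations.
void run_select_copy_if(T *d_out, T *d_in, int size)
{
    const int threads = 256;                       // assumed block size
    const int tile    = threads * REGS;            // elements per block
    const int blocks  = (size + tile - 1) / tile;  // one tile per block

    unsigned int *d_flags;
    cudaMalloc(&d_flags, (blocks + 2) * sizeof(unsigned int)); // +2 slots assumed
    cudaMemset(d_flags, 0, (blocks + 2) * sizeof(unsigned int));
    const unsigned int seed = 1;                   // assumed seed for block 0's predecessor
    cudaMemcpy(d_flags, &seed, sizeof(unsigned int), cudaMemcpyHostToDevice);

    select_copy_if<<<blocks, threads>>>(d_out, d_in, size, d_flags, is_even());
    cudaDeviceSynchronize();
    cudaFree(d_flags);
}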