path: root/SEL/baselines/gpu/kernel.cu
/***************************************************************************
 *cr
 *cr            (C) Copyright 2015 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/*
  In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP’15

  Copyright (c) 2015 University of Illinois at Urbana-Champaign. 
  All rights reserved.

  Permission to use, copy, modify and distribute this software and its documentation for 
  educational purpose is hereby granted without fee, provided that the above copyright 
  notice and this permission notice appear in all copies of this software and that you do 
  not sell the software.

  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, EXPRESS, IMPLIED OR 
  OTHERWISE.

  Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
*/
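
/*
  The two kernels below perform predicate-based stream compaction:
  select_remove_if keeps the elements for which the predicate is false, and
  select_copy_if keeps the elements for which it is true. Each thread block
  obtains a dynamic workgroup id, loads a tile of REGS elements per thread into
  registers, counts its kept elements, synchronizes with preceding blocks
  through the flags array to learn its global output offset, and scatters the
  kept elements with a block-level binary prefix sum.

  T, REGS, dynamic_wg_id, reduction, ds_sync_irregular, block_binary_prefix_sums
  and struct is_even are defined elsewhere in the benchmark. For the predicate,
  a functor of roughly this shape is assumed (hypothetical sketch only):

    struct is_even {
      __host__ __device__ bool operator()(const T &x) const { return (x % 2) == 0; }
    };
*/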

__global__ void select_remove_if(T *matrix_out, T *matrix,
    int size,
    volatile unsigned int *flags,
    struct is_even pred)
{
  __shared__ int count; // Counter for the number of elements this block keeps
  const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1; // Number of tiles of blockDim.x * REGS elements (ceiling division)

  // Dynamic allocation of runtime workgroup id
  if (threadIdx.x == 0) count = 0;
  const int my_s = dynamic_wg_id(flags, num_flags);

  int local_cnt = 0;
  // Declare on-chip memory
  T reg[REGS];
  int pos = my_s * REGS * blockDim.x + threadIdx.x;
  // Load in on-chip memory
  #pragma unroll
  for (int j = 0; j < REGS; j++){
    if (pos < size){
      reg[j] = matrix[pos];
      if(!pred(reg[j]))
        local_cnt++;  // Element fails the predicate, so it is kept
      else
        reg[j] = -1;  // Discarded element; the -1 sentinel assumes non-negative input values
    }
    else
      reg[j] = -1;    // Out-of-range lane: marked with the sentinel as well
    pos += blockDim.x;
  }
  reduction<int>(&count, local_cnt);

  // Global synchronization: accumulate the counts of preceding tiles so that count becomes this block's output offset
  ds_sync_irregular(flags, my_s, &count);

  // Store kept elements to global memory at their compacted positions
  #pragma unroll
  for (int j = 0; j < REGS; j++){
    pos = block_binary_prefix_sums(&count, reg[j] >= 0); // Global output index for this element
    if (reg[j] >= 0){
      matrix_out[pos] = reg[j];
    }
  }
}

__global__ void select_copy_if(T *matrix_out, T *matrix,
    int size,
    volatile unsigned int *flags,
    struct is_even pred)
{
  __shared__ int count; // Counter for the number of elements this block keeps
  const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1; // Number of tiles of blockDim.x * REGS elements (ceiling division)

  // Dynamic allocation of runtime workgroup id
  if (threadIdx.x == 0) count = 0;
  const int my_s = dynamic_wg_id(flags, num_flags);

  int local_cnt = 0;
  // Declare on-chip memory
  T reg[REGS];
  int pos = my_s * REGS * blockDim.x + threadIdx.x;
  // Load in on-chip memory
  #pragma unroll
  for (int j = 0; j < REGS; j++){
    if (pos < size){
      reg[j] = matrix[pos];
      if(pred(reg[j]))
        local_cnt++;  // Element satisfies the predicate, so it is kept
      else
        reg[j] = -1;  // Discarded element; the -1 sentinel assumes non-negative input values
    }
    else
      reg[j] = -1;    // Out-of-range lane: marked with the sentinel as well
    pos += blockDim.x;
  }
  reduction<int>(&count, local_cnt);

  // Global synchronization: accumulate the counts of preceding tiles so that count becomes this block's output offset
  ds_sync_irregular(flags, my_s, &count);

  // Store kept elements to global memory at their compacted positions
  #pragma unroll
  for (int j = 0; j < REGS; j++){
    pos = block_binary_prefix_sums(&count, reg[j] >= 0); // Global output index for this element
    if (reg[j] >= 0){
      matrix_out[pos] = reg[j];
    }
  }
}
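
/*
  Illustrative host-side driver (a minimal sketch, not part of this file; the
  real launch code lives elsewhere in the benchmark). It assumes the usual DS
  conventions: one block per tile of blockDim.x * REGS elements, and a flags
  array with one word per tile plus a couple of extra words for the dynamic
  workgroup-id counter, zero-initialized except for the first flag, which is
  seeded so the block that grabs tile 0 can proceed. The block size, the flags
  array size, and the seeding below are assumptions; the authoritative layout
  is defined by dynamic_wg_id and ds_sync_irregular in the benchmark sources.
*/
void launch_select_copy_if(T *matrix_out, T *matrix, int size)
{
  const int threads    = 256;                       // Assumed block size
  const int tile       = threads * REGS;            // Elements processed per block
  const int num_blocks = (size + tile - 1) / tile;  // One tile per block

  unsigned int *flags;
  const size_t flags_bytes = (num_blocks + 2) * sizeof(unsigned int); // Assumed size
  cudaMalloc((void **)&flags, flags_bytes);
  cudaMemset(flags, 0, flags_bytes);
  const unsigned int first_flag = 1;                // Assumed seeding convention for tile 0
  cudaMemcpy(flags, &first_flag, sizeof(first_flag), cudaMemcpyHostToDevice);

  struct is_even pred;
  select_copy_if<<<num_blocks, threads>>>(matrix_out, matrix, size, flags, pred);
  cudaDeviceSynchronize();

  cudaFree(flags);
}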