// BS/baselines/gpu/binary_search.cu — GPU binary-search baseline.
#include <cuda.h>
#include <limits.h>
#include "binary_search.h"

#include <chrono>
#include <iostream>

#define BLOCKDIM 512
#define SEARCH_CHUNK 16
#define BLOCK_CHUNK (BLOCKDIM*SEARCH_CHUNK)


/* Searches a sorted array `arr` (length `len`) for the values in `querys`
   (length `num_querys`).  When any query value is found, `*flag` is set and
   its index in `arr` is written to `*res`.

   Layout: each block covers BLOCK_CHUNK consecutive elements of `arr`, and
   each thread binary-searches its own SEARCH_CHUNK-element slice of that
   range.  The host launches gridSize = ceil(len / BLOCK_CHUNK) blocks of
   BLOCKDIM threads, so every element is covered exactly once.

   NOTE: the original version staged each slice in an `int` shared array and
   padded out-of-range slots with INT_MAX.  Both steps silently truncate
   `long int` data (and a correct `long int` staging buffer would need
   BLOCK_CHUNK * 8 = 64 KB of shared memory, over the default 48 KB static
   limit), so this version searches global memory directly with full-width
   comparisons and clamps the search range to `len` instead of padding.

   NOTE(review): as in the original, concurrent finds race on `*flag`/`*res`
   without atomics; any single written index is valid, but which query "wins"
   when several match is nondeterministic. */
__global__ void search_kernel(const long int *arr,
    const long int len, const long int *querys, const long int num_querys, long int *res, bool *flag)
{
    /* First element of this thread's slice; cast before multiplying so the
       index arithmetic cannot overflow 32 bits on large inputs. */
    const long int base = (long int)BLOCK_CHUNK * blockIdx.x
                        + (long int)threadIdx.x * SEARCH_CHUNK;

    for(long int q = 0; q < num_querys; ++q)
    {
        /* Another thread already found a match: stop searching. */
        if(*flag) {
            return;
        }

        const long int search = querys[q];

        /* Binary-search this thread's slice, clamped to the array bounds
           (the last slice of the last block may be partial). */
        long int L = 0;
        long int R = SEARCH_CHUNK - 1;
        if(base + R >= len) {
            R = len - base - 1;   /* negative when base >= len: loop skipped */
        }

        while(L <= R)
        {
            const long int m = (L + R) / 2;
            const long int v = arr[base + m];

            if(v < search) {
                L = m + 1;
            }
            else if(v > search) {
                R = m - 1;
            }
            else {
                *flag = true;
                *res = base + m;  /* global index of the match */
                break;
            }
        }
    }
}



/* Host wrapper: copies `arr` (sorted, length `len`) and `querys` (length
   `num_querys`) to the device, launches search_kernel, and returns the index
   of a found query value in `arr`, or -1 if no query matched (or a CUDA
   error occurred).  Prints the kernel execution time to stdout. */
int binary_search(const long int *arr, const long int len, const long int *querys, const long int num_querys)
{
    long int *d_arr, *d_querys, *d_res;
    bool *d_flag;

    size_t arr_size = len * sizeof(long int);
    size_t querys_size = num_querys * sizeof(long int);
    size_t res_size = sizeof(long int);
    size_t flag_size = sizeof(bool);

    cudaMalloc(&d_arr, arr_size);
    cudaMalloc(&d_querys, querys_size);
    cudaMalloc(&d_res, res_size);
    cudaMalloc(&d_flag, flag_size);

    cudaMemcpy(d_arr, arr, arr_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_querys, querys, querys_size, cudaMemcpyHostToDevice);
    cudaMemset(d_flag, 0, flag_size);

    /* Set res to -1 (every byte 0xFF is -1 in two's complement), so that a
       return value of -1 indicates that no query was found. */
    cudaMemset(d_res, -1, res_size);

    int blockSize = BLOCKDIM;
    int gridSize = (len-1)/BLOCK_CHUNK + 1;   /* ceil(len / BLOCK_CHUNK) */

    auto start = std::chrono::high_resolution_clock::now();
    search_kernel<<<gridSize,blockSize>>>(d_arr, len, d_querys, num_querys, d_res, d_flag);
    /* Launch-configuration errors surface via cudaGetLastError(); in-kernel
       execution errors surface at the synchronizing call. */
    cudaError_t launch_err = cudaGetLastError();
    cudaError_t sync_err = cudaDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();
    std::cout << "Kernel Time: " <<
        std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count() <<
        " ms" << std::endl;

    long int res = -1;
    if(launch_err != cudaSuccess) {
        std::cerr << "Kernel launch failed: "
                  << cudaGetErrorString(launch_err) << std::endl;
    }
    else if(sync_err != cudaSuccess) {
        std::cerr << "Kernel execution failed: "
                  << cudaGetErrorString(sync_err) << std::endl;
    }
    else {
        cudaMemcpy(&res, d_res, res_size, cudaMemcpyDeviceToHost);
    }

    /* Release device memory (the original leaked all four allocations). */
    cudaFree(d_arr);
    cudaFree(d_querys);
    cudaFree(d_res);
    cudaFree(d_flag);

    return res;
}