diff options
author | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
---|---|---|
committer | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
commit | 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch) | |
tree | fc6776a94549d2d4039898f183dbbeb2ce013ba9 /NW/baselines/gpu/needle.cu | |
parent | ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff) |
PrIM -- first commit
Diffstat (limited to 'NW/baselines/gpu/needle.cu')
-rw-r--r-- | NW/baselines/gpu/needle.cu | 266 |
1 files changed, 266 insertions, 0 deletions
diff --git a/NW/baselines/gpu/needle.cu b/NW/baselines/gpu/needle.cu new file mode 100644 index 0000000..697a9e7 --- /dev/null +++ b/NW/baselines/gpu/needle.cu @@ -0,0 +1,266 @@ +#define LIMIT -999 +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include "needle.h" +#include <cuda.h> +#include <sys/time.h> + +// includes, kernels +#include "needle_kernel.cu" + +#ifdef TIMING +#include "timing.h" + +struct timeval tv; +struct timeval tv_total_start, tv_total_end; +struct timeval tv_h2d_start, tv_h2d_end; +struct timeval tv_d2h_start, tv_d2h_end; +struct timeval tv_kernel_start, tv_kernel_end; +struct timeval tv_mem_alloc_start, tv_mem_alloc_end; +struct timeval tv_close_start, tv_close_end; +float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0, + d2h_time = 0, close_time = 0, total_time = 0; +#endif + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +void runTest( int argc, char** argv); + + +int blosum62[24][24] = { + { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, + {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, + {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, + {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, + {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, + {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, + {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, + {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, + {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, + {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, + {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, + {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, + {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, + { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, + { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, + {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, + {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, + { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, + {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, + {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1} +}; + +double gettime() { + struct timeval t; + gettimeofday(&t,NULL); + return t.tv_sec+t.tv_usec*1e-6; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// + int +main( int argc, char** argv) +{ + + printf("WG size of kernel = %d \n", BLOCK_SIZE); + + runTest( argc, argv); + + return EXIT_SUCCESS; +} + +void usage(int argc, char **argv) +{ + fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]); + fprintf(stderr, "\t<dimension> - x and y dimensions\n"); + fprintf(stderr, "\t<penalty> - penalty(positive integer)\n"); + exit(1); +} + +void runTest( int argc, char** argv) +{ + int max_rows, max_cols, penalty; + int *input_itemsets, *output_itemsets, *referrence; + int *matrix_cuda, *referrence_cuda; + int size; + + + // the lengths of the two sequences should be able to divided by 16. + // And at current stage max_rows needs to equal max_cols + if (argc == 3) + { + max_rows = atoi(argv[1]); + max_cols = atoi(argv[1]); + penalty = atoi(argv[2]); + } + else{ + usage(argc, argv); + } + + if(atoi(argv[1])%16!=0){ + fprintf(stderr,"The dimension values must be a multiple of 16\n"); + exit(1); + } + + + max_rows = max_rows + 1; + max_cols = max_cols + 1; + referrence = (int *)malloc( max_rows * max_cols * sizeof(int) ); + input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + + + if (!input_itemsets) + fprintf(stderr, "error: can not allocate memory"); + + srand ( 7 ); + + + for (int i = 0 ; i < max_cols; i++){ + for (int j = 0 ; j < max_rows; j++){ + input_itemsets[i*max_cols+j] = 0; + } + } + + printf("Start Needleman-Wunsch\n"); + + for( int i=1; i< max_rows ; i++){ //please define your own sequence. + input_itemsets[i*max_cols] = rand() % 10 + 1; + } + for( int j=1; j< max_cols ; j++){ //please define your own sequence. + input_itemsets[j] = rand() % 10 + 1; + } + + + for (int i = 1 ; i < max_cols; i++){ + for (int j = 1 ; j < max_rows; j++){ + referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]]; + } + } + + for( int i = 1; i< max_rows ; i++) + input_itemsets[i*max_cols] = -i * penalty; + for( int j = 1; j< max_cols ; j++) + input_itemsets[j] = -j * penalty; + + + size = max_cols * max_rows; + cudaMalloc((void**)& referrence_cuda, sizeof(int)*size); + cudaMalloc((void**)& matrix_cuda, sizeof(int)*size); + + cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, cudaMemcpyHostToDevice); + cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, cudaMemcpyHostToDevice); + + dim3 dimGrid; + dim3 dimBlock(BLOCK_SIZE, 1); + int block_width = ( max_cols - 1 )/BLOCK_SIZE; + +#ifdef TIMING + gettimeofday(&tv_kernel_start, NULL); +#endif + + printf("Processing top-left matrix\n"); + //process top-left matrix + for( int i = 1 ; i <= block_width ; i++){ + dimGrid.x = i; + dimGrid.y = 1; + needle_cuda_shared_1<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda + ,max_cols, penalty, i, block_width); + } + printf("Processing bottom-right matrix\n"); + //process bottom-right matrix + for( int i = block_width - 1 ; i >= 1 ; i--){ + dimGrid.x = i; + dimGrid.y = 1; + needle_cuda_shared_2<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda + ,max_cols, penalty, i, block_width); + } + +#ifdef TIMING + gettimeofday(&tv_kernel_end, NULL); + tvsub(&tv_kernel_end, &tv_kernel_start, &tv); + kernel_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; +#endif + + cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost); + + //#define TRACEBACK +#ifdef TRACEBACK + + FILE *fpo = fopen("result.txt","w"); + fprintf(fpo, "print traceback value GPU:\n"); + + for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){ + int nw, n, w, traceback; + if ( i == max_rows - 2 && j == max_rows - 2 ) + fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element + if ( i == 0 && j == 0 ) + break; + if ( i > 0 && j > 0 ){ + nw = output_itemsets[(i - 1) * max_cols + j - 1]; + w = output_itemsets[ i * max_cols + j - 1 ]; + n = output_itemsets[(i - 1) * max_cols + j]; + } + else if ( i == 0 ){ + nw = n = LIMIT; + w = output_itemsets[ i * max_cols + j - 1 ]; + } + else if ( j == 0 ){ + nw = w = LIMIT; + n = output_itemsets[(i - 1) * max_cols + j]; + } + else{ + } + + //traceback = maximum(nw, w, n); + int new_nw, new_w, new_n; + new_nw = nw + referrence[i * max_cols + j]; + new_w = w - penalty; + new_n = n - penalty; + + traceback = maximum(new_nw, new_w, new_n); + if(traceback == new_nw) + traceback = nw; + if(traceback == new_w) + traceback = w; + if(traceback == new_n) + traceback = n; + + fprintf(fpo, "%d ", traceback); + + if(traceback == nw ) + {i--; j--; continue;} + + else if(traceback == w ) + {j--; continue;} + + else if(traceback == n ) + {i--; continue;} + + else + ; + } + + fclose(fpo); + +#endif + + cudaFree(referrence_cuda); + cudaFree(matrix_cuda); + + free(referrence); + free(input_itemsets); + free(output_itemsets); + +#ifdef TIMING + printf("Exec: %f\n", kernel_time); +#endif +} + |