summaryrefslogtreecommitdiff
path: root/NW/baselines/gpu/needle.cu
diff options
context:
space:
mode:
authorJuan Gomez Luna <juan.gomez@safari.ethz.ch>2021-06-16 19:46:05 +0200
committerJuan Gomez Luna <juan.gomez@safari.ethz.ch>2021-06-16 19:46:05 +0200
commit3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
treefc6776a94549d2d4039898f183dbbeb2ce013ba9 /NW/baselines/gpu/needle.cu
parentef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)
PrIM -- first commit
Diffstat (limited to 'NW/baselines/gpu/needle.cu')
-rw-r--r--NW/baselines/gpu/needle.cu266
1 files changed, 266 insertions, 0 deletions
diff --git a/NW/baselines/gpu/needle.cu b/NW/baselines/gpu/needle.cu
new file mode 100644
index 0000000..697a9e7
--- /dev/null
+++ b/NW/baselines/gpu/needle.cu
@@ -0,0 +1,266 @@
+#define LIMIT -999
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "needle.h"
+#include <cuda.h>
+#include <sys/time.h>
+
+// includes, kernels
+#include "needle_kernel.cu"
+
+#ifdef TIMING
+#include "timing.h"
+
+struct timeval tv;
+struct timeval tv_total_start, tv_total_end;
+struct timeval tv_h2d_start, tv_h2d_end;
+struct timeval tv_d2h_start, tv_d2h_end;
+struct timeval tv_kernel_start, tv_kernel_end;
+struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
+struct timeval tv_close_start, tv_close_end;
+float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
+ d2h_time = 0, close_time = 0, total_time = 0;
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest( int argc, char** argv);
+
+
+int blosum62[24][24] = {
+ { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4},
+ {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4},
+ {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4},
+ {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4},
+ { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
+ {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4},
+ {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
+ { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4},
+ {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4},
+ {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4},
+ {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4},
+ {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4},
+ {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4},
+ {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4},
+ {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
+ { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4},
+ { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4},
+ {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4},
+ {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4},
+ { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4},
+ {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4},
+ {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
+ { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4},
+ {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}
+};
+
+double gettime() {
+ struct timeval t;
+ gettimeofday(&t,NULL);
+ return t.tv_sec+t.tv_usec*1e-6;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+ int
+main( int argc, char** argv)
+{
+
+ printf("WG size of kernel = %d \n", BLOCK_SIZE);
+
+ runTest( argc, argv);
+
+ return EXIT_SUCCESS;
+}
+
+void usage(int argc, char **argv)
+{
+ fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
+ fprintf(stderr, "\t<dimension> - x and y dimensions\n");
+ fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
+ exit(1);
+}
+
+void runTest( int argc, char** argv)
+{
+ int max_rows, max_cols, penalty;
+ int *input_itemsets, *output_itemsets, *referrence;
+ int *matrix_cuda, *referrence_cuda;
+ int size;
+
+
+ // the lengths of the two sequences should be able to divided by 16.
+ // And at current stage max_rows needs to equal max_cols
+ if (argc == 3)
+ {
+ max_rows = atoi(argv[1]);
+ max_cols = atoi(argv[1]);
+ penalty = atoi(argv[2]);
+ }
+ else{
+ usage(argc, argv);
+ }
+
+ if(atoi(argv[1])%16!=0){
+ fprintf(stderr,"The dimension values must be a multiple of 16\n");
+ exit(1);
+ }
+
+
+ max_rows = max_rows + 1;
+ max_cols = max_cols + 1;
+ referrence = (int *)malloc( max_rows * max_cols * sizeof(int) );
+ input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+ output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+
+
+ if (!input_itemsets)
+ fprintf(stderr, "error: can not allocate memory");
+
+ srand ( 7 );
+
+
+ for (int i = 0 ; i < max_cols; i++){
+ for (int j = 0 ; j < max_rows; j++){
+ input_itemsets[i*max_cols+j] = 0;
+ }
+ }
+
+ printf("Start Needleman-Wunsch\n");
+
+ for( int i=1; i< max_rows ; i++){ //please define your own sequence.
+ input_itemsets[i*max_cols] = rand() % 10 + 1;
+ }
+ for( int j=1; j< max_cols ; j++){ //please define your own sequence.
+ input_itemsets[j] = rand() % 10 + 1;
+ }
+
+
+ for (int i = 1 ; i < max_cols; i++){
+ for (int j = 1 ; j < max_rows; j++){
+ referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]];
+ }
+ }
+
+ for( int i = 1; i< max_rows ; i++)
+ input_itemsets[i*max_cols] = -i * penalty;
+ for( int j = 1; j< max_cols ; j++)
+ input_itemsets[j] = -j * penalty;
+
+
+ size = max_cols * max_rows;
+ cudaMalloc((void**)& referrence_cuda, sizeof(int)*size);
+ cudaMalloc((void**)& matrix_cuda, sizeof(int)*size);
+
+ cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, cudaMemcpyHostToDevice);
+ cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, cudaMemcpyHostToDevice);
+
+ dim3 dimGrid;
+ dim3 dimBlock(BLOCK_SIZE, 1);
+ int block_width = ( max_cols - 1 )/BLOCK_SIZE;
+
+#ifdef TIMING
+ gettimeofday(&tv_kernel_start, NULL);
+#endif
+
+ printf("Processing top-left matrix\n");
+ //process top-left matrix
+ for( int i = 1 ; i <= block_width ; i++){
+ dimGrid.x = i;
+ dimGrid.y = 1;
+ needle_cuda_shared_1<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
+ ,max_cols, penalty, i, block_width);
+ }
+ printf("Processing bottom-right matrix\n");
+ //process bottom-right matrix
+ for( int i = block_width - 1 ; i >= 1 ; i--){
+ dimGrid.x = i;
+ dimGrid.y = 1;
+ needle_cuda_shared_2<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
+ ,max_cols, penalty, i, block_width);
+ }
+
+#ifdef TIMING
+ gettimeofday(&tv_kernel_end, NULL);
+ tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
+ kernel_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+ cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost);
+
+ //#define TRACEBACK
+#ifdef TRACEBACK
+
+ FILE *fpo = fopen("result.txt","w");
+ fprintf(fpo, "print traceback value GPU:\n");
+
+ for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){
+ int nw, n, w, traceback;
+ if ( i == max_rows - 2 && j == max_rows - 2 )
+ fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element
+ if ( i == 0 && j == 0 )
+ break;
+ if ( i > 0 && j > 0 ){
+ nw = output_itemsets[(i - 1) * max_cols + j - 1];
+ w = output_itemsets[ i * max_cols + j - 1 ];
+ n = output_itemsets[(i - 1) * max_cols + j];
+ }
+ else if ( i == 0 ){
+ nw = n = LIMIT;
+ w = output_itemsets[ i * max_cols + j - 1 ];
+ }
+ else if ( j == 0 ){
+ nw = w = LIMIT;
+ n = output_itemsets[(i - 1) * max_cols + j];
+ }
+ else{
+ }
+
+ //traceback = maximum(nw, w, n);
+ int new_nw, new_w, new_n;
+ new_nw = nw + referrence[i * max_cols + j];
+ new_w = w - penalty;
+ new_n = n - penalty;
+
+ traceback = maximum(new_nw, new_w, new_n);
+ if(traceback == new_nw)
+ traceback = nw;
+ if(traceback == new_w)
+ traceback = w;
+ if(traceback == new_n)
+ traceback = n;
+
+ fprintf(fpo, "%d ", traceback);
+
+ if(traceback == nw )
+ {i--; j--; continue;}
+
+ else if(traceback == w )
+ {j--; continue;}
+
+ else if(traceback == n )
+ {i--; continue;}
+
+ else
+ ;
+ }
+
+ fclose(fpo);
+
+#endif
+
+ cudaFree(referrence_cuda);
+ cudaFree(matrix_cuda);
+
+ free(referrence);
+ free(input_itemsets);
+ free(output_itemsets);
+
+#ifdef TIMING
+ printf("Exec: %f\n", kernel_time);
+#endif
+}
+