From 3de4b495fb176eba9a0eb517a4ce05903cb67acb Mon Sep 17 00:00:00 2001 From: Juan Gomez Luna Date: Wed, 16 Jun 2021 19:46:05 +0200 Subject: PrIM -- first commit --- NW/baselines/cpu/Makefile | 16 ++ NW/baselines/cpu/README | 9 + NW/baselines/cpu/needle.cpp | 382 +++++++++++++++++++++++++++++++++++++++++++ NW/baselines/cpu/run | 1 + NW/baselines/cpu/run_offload | 1 + 5 files changed, 409 insertions(+) create mode 100644 NW/baselines/cpu/Makefile create mode 100644 NW/baselines/cpu/README create mode 100644 NW/baselines/cpu/needle.cpp create mode 100644 NW/baselines/cpu/run create mode 100644 NW/baselines/cpu/run_offload (limited to 'NW/baselines/cpu') diff --git a/NW/baselines/cpu/Makefile b/NW/baselines/cpu/Makefile new file mode 100644 index 0000000..f49dd2f --- /dev/null +++ b/NW/baselines/cpu/Makefile @@ -0,0 +1,16 @@ +# C compiler +CC = g++ +ICC = icc +CC_FLAGS = -g -O3 -fopenmp +OFFLOAD_CC_FLAGS = -offload-option,mic,compiler,"-no-opt-prefetch" + +all: needle needle_offload + +needle: + $(CC) $(CC_FLAGS) needle.cpp -o needle + +needle_offload: + $(ICC) $(CC_FLAGS) $(OFFLOAD_CC_FLAGS) -DOMP_OFFLOAD needle.cpp -o needle_offload + +clean: + rm -f needle needle_offload diff --git a/NW/baselines/cpu/README b/NW/baselines/cpu/README new file mode 100644 index 0000000..f3fd2c2 --- /dev/null +++ b/NW/baselines/cpu/README @@ -0,0 +1,9 @@ +Needleman-Wunsch (NW) + +Compilation instructions + + make + +Execution instructions + + ./needle 46080 10 4 diff --git a/NW/baselines/cpu/needle.cpp b/NW/baselines/cpu/needle.cpp new file mode 100644 index 0000000..0f1e2b6 --- /dev/null +++ b/NW/baselines/cpu/needle.cpp @@ -0,0 +1,382 @@ +#define LIMIT -999 +//#define TRACE +#include +#include +#include +#include +#include +#include +#define OPENMP +//#define NUM_THREAD 4 + +#define BLOCK_SIZE 16 + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +void runTest( int argc, char** argv); + +// Returns the current system time in microseconds +long long get_time() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000000) + tv.tv_usec; + +} + +#ifdef OMP_OFFLOAD +#pragma omp declare target +#endif +int maximum( int a, + int b, + int c){ + + int k; + if( a <= b ) + k = b; + else + k = a; + + if( k <=c ) + return(c); + else + return(k); +} +#ifdef OMP_OFFLOAD +#pragma omp end declare target +#endif + + +int blosum62[24][24] = { + { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, + {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, + {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, + {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, + {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, + {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, + {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, + {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, + {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, + {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, + {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, + {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, + {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, + { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, + { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, + {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, + {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, + { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, + {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, + {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1} +}; + +double gettime() { + struct timeval t; + gettimeofday(&t,NULL); + return t.tv_sec+t.tv_usec*1e-6; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// + int +main( int argc, char** argv) +{ + runTest( argc, argv); + + return EXIT_SUCCESS; +} + +void usage(int argc, char **argv) +{ + fprintf(stderr, "Usage: %s \n", argv[0]); + fprintf(stderr, "\t - x and y dimensions\n"); + fprintf(stderr, "\t - penalty(positive integer)\n"); + fprintf(stderr, "\t - no. of threads\n"); + exit(1); +} + +void nw_optimized(int *input_itemsets, int *output_itemsets, int *referrence, + int max_rows, int max_cols, int penalty) +{ +#ifdef OMP_OFFLOAD + int transfer_size = max_rows * max_cols; +#pragma omp target data map(to: max_cols, penalty, referrence[0:transfer_size]) map(input_itemsets[0:transfer_size]) + { + +#pragma omp target +#endif + for( int blk = 1; blk <= (max_cols-1)/BLOCK_SIZE; blk++ ) + { +#ifdef OPENMP +#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty) +#endif + for( int b_index_x = 0; b_index_x < blk; ++b_index_x) + { + int b_index_y = blk - 1 - b_index_x; + int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64))); + int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64))); + + // Copy referrence to local memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1]; + } + } + + // Copy input_itemsets to local memory + for ( int i = 0; i < BLOCK_SIZE + 1; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE + j]; + } + } + + // Compute + for ( int i = 1; i < BLOCK_SIZE + 1; ++i ) + { + for ( int j = 1; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1], + input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty, + input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty); + } + } + + // Copy results to global memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j + 1]; + } + } + + } + } + + printf("Processing bottom-right matrix\n"); + +#ifdef OMP_OFFLOAD +#pragma omp target +#endif + for ( int blk = 2; blk <= (max_cols-1)/BLOCK_SIZE; blk++ ) + { +#ifdef OPENMP +#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty) +#endif + for( int b_index_x = blk - 1; b_index_x < (max_cols-1)/BLOCK_SIZE; ++b_index_x) + { + int b_index_y = (max_cols-1)/BLOCK_SIZE + blk - 2 - b_index_x; + + int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64))); + int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64))); + + // Copy referrence to local memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1]; + } + } + + // Copy input_itemsets to local memory + for ( int i = 0; i < BLOCK_SIZE + 1; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE + j]; + } + } + + // Compute + for ( int i = 1; i < BLOCK_SIZE + 1; ++i ) + { + for ( int j = 1; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1], + input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty, + input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty); + } + } + + // Copy results to global memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j +1]; + } + } + } + } + +#ifdef OMP_OFFLOAD + } +#endif + +} + +//////////////////////////////////////////////////////////////////////////////// +//! Run a simple test for CUDA +//////////////////////////////////////////////////////////////////////////////// + void +runTest( int argc, char** argv) +{ + int max_rows, max_cols, penalty; + int *input_itemsets, *output_itemsets, *referrence; + //int *matrix_cuda, *matrix_cuda_out, *referrence_cuda; + //int size; + int omp_num_threads; + + + // the lengths of the two sequences should be able to divided by 16. + // And at current stage max_rows needs to equal max_cols + if (argc == 4) + { + max_rows = atoi(argv[1]); + max_cols = atoi(argv[1]); + penalty = atoi(argv[2]); + omp_num_threads = atoi(argv[3]); + } + else{ + usage(argc, argv); + } + + max_rows = max_rows + 1; + max_cols = max_cols + 1; + referrence = (int *)malloc( max_rows * max_cols * sizeof(int) ); + input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + + + if (!input_itemsets) + fprintf(stderr, "error: can not allocate memory"); + + srand ( 7 ); + + for (int i = 0 ; i < max_cols; i++){ + for (int j = 0 ; j < max_rows; j++){ + input_itemsets[i*max_cols+j] = 0; + } + } + + printf("Start Needleman-Wunsch\n"); + + for( int i=1; i< max_rows ; i++){ //please define your own sequence. + input_itemsets[i*max_cols] = rand() % 10 + 1; + } + for( int j=1; j< max_cols ; j++){ //please define your own sequence. + input_itemsets[j] = rand() % 10 + 1; + } + + + for (int i = 1 ; i < max_cols; i++){ + for (int j = 1 ; j < max_rows; j++){ + referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]]; + } + } + + for( int i = 1; i< max_rows ; i++) + input_itemsets[i*max_cols] = -i * penalty; + for( int j = 1; j< max_cols ; j++) + input_itemsets[j] = -j * penalty; + + + + //Compute top-left matrix + printf("Num of threads: %d\n", omp_num_threads); + printf("Processing top-left matrix\n"); + + long long start_time = get_time(); + + nw_optimized( input_itemsets, output_itemsets, referrence, + max_rows, max_cols, penalty ); + + long long end_time = get_time(); + + printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000)); + +#define TRACEBACK +#ifdef TRACEBACK + + FILE *fpo = fopen("result.txt","w"); + fprintf(fpo, "print traceback value GPU:\n"); + + for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){ + int nw, n, w, traceback; + if ( i == max_rows - 2 && j == max_rows - 2 ) + fprintf(fpo, "%d ", input_itemsets[ i * max_cols + j]); //print the first element + if ( i == 0 && j == 0 ) + break; + if ( i > 0 && j > 0 ){ + nw = input_itemsets[(i - 1) * max_cols + j - 1]; + w = input_itemsets[ i * max_cols + j - 1 ]; + n = input_itemsets[(i - 1) * max_cols + j]; + } + else if ( i == 0 ){ + nw = n = LIMIT; + w = input_itemsets[ i * max_cols + j - 1 ]; + } + else if ( j == 0 ){ + nw = w = LIMIT; + n = input_itemsets[(i - 1) * max_cols + j]; + } + else{ + } + + //traceback = maximum(nw, w, n); + int new_nw, new_w, new_n; + new_nw = nw + referrence[i * max_cols + j]; + new_w = w - penalty; + new_n = n - penalty; + + traceback = maximum(new_nw, new_w, new_n); + if(traceback == new_nw) + traceback = nw; + if(traceback == new_w) + traceback = w; + if(traceback == new_n) + traceback = n; + + fprintf(fpo, "%d ", traceback); + + if(traceback == nw ) + {i--; j--; continue;} + + else if(traceback == w ) + {j--; continue;} + + else if(traceback == n ) + {i--; continue;} + + else + ; + } + + fclose(fpo); + +#endif + + free(referrence); + free(input_itemsets); + free(output_itemsets); + +} + + + diff --git a/NW/baselines/cpu/run b/NW/baselines/cpu/run new file mode 100644 index 0000000..8c8088a --- /dev/null +++ b/NW/baselines/cpu/run @@ -0,0 +1 @@ +./needle 2048 10 2 diff --git a/NW/baselines/cpu/run_offload b/NW/baselines/cpu/run_offload new file mode 100644 index 0000000..8c5989a --- /dev/null +++ b/NW/baselines/cpu/run_offload @@ -0,0 +1 @@ +./needle_offload 2048 10 2 -- cgit v1.2.3