summaryrefslogtreecommitdiff
path: root/NW/baselines
diff options
context:
space:
mode:
Diffstat (limited to 'NW/baselines')
-rw-r--r--NW/baselines/cpu/Makefile16
-rw-r--r--NW/baselines/cpu/README9
-rw-r--r--NW/baselines/cpu/needle.cpp382
-rw-r--r--NW/baselines/cpu/run1
-rw-r--r--NW/baselines/cpu/run_offload1
-rw-r--r--NW/baselines/gpu/Makefile28
-rw-r--r--NW/baselines/gpu/Makefile_nvidia50
-rw-r--r--NW/baselines/gpu/README23
-rw-r--r--NW/baselines/gpu/common/common.mk341
-rw-r--r--NW/baselines/gpu/common/make.config40
-rw-r--r--NW/baselines/gpu/needle.cu266
-rw-r--r--NW/baselines/gpu/needle.h11
-rw-r--r--NW/baselines/gpu/needle_kernel.cu188
-rw-r--r--NW/baselines/gpu/run1
-rw-r--r--NW/baselines/gpu/timing.h22
15 files changed, 1379 insertions, 0 deletions
diff --git a/NW/baselines/cpu/Makefile b/NW/baselines/cpu/Makefile
new file mode 100644
index 0000000..f49dd2f
--- /dev/null
+++ b/NW/baselines/cpu/Makefile
@@ -0,0 +1,16 @@
+# C compiler
+CC = g++
+ICC = icc
+CC_FLAGS = -g -O3 -fopenmp
+OFFLOAD_CC_FLAGS = -offload-option,mic,compiler,"-no-opt-prefetch"
+
+all: needle needle_offload
+
+needle:
+ $(CC) $(CC_FLAGS) needle.cpp -o needle
+
+needle_offload:
+ $(ICC) $(CC_FLAGS) $(OFFLOAD_CC_FLAGS) -DOMP_OFFLOAD needle.cpp -o needle_offload
+
+clean:
+ rm -f needle needle_offload
diff --git a/NW/baselines/cpu/README b/NW/baselines/cpu/README
new file mode 100644
index 0000000..f3fd2c2
--- /dev/null
+++ b/NW/baselines/cpu/README
@@ -0,0 +1,9 @@
+Needleman-Wunsch (NW)
+
+Compilation instructions
+
+ make
+
+Execution instructions
+
+ ./needle 46080 10 4
diff --git a/NW/baselines/cpu/needle.cpp b/NW/baselines/cpu/needle.cpp
new file mode 100644
index 0000000..0f1e2b6
--- /dev/null
+++ b/NW/baselines/cpu/needle.cpp
@@ -0,0 +1,382 @@
+#define LIMIT -999
+//#define TRACE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include <omp.h>
+#define OPENMP
+//#define NUM_THREAD 4
+
+#define BLOCK_SIZE 16
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest( int argc, char** argv);
+
+// Returns the current system time in microseconds
+long long get_time()
+{
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ return (tv.tv_sec * 1000000) + tv.tv_usec;
+
+}
+
+#ifdef OMP_OFFLOAD
+#pragma omp declare target
+#endif
+int maximum( int a,
+ int b,
+ int c){
+
+ int k;
+ if( a <= b )
+ k = b;
+ else
+ k = a;
+
+ if( k <=c )
+ return(c);
+ else
+ return(k);
+}
+#ifdef OMP_OFFLOAD
+#pragma omp end declare target
+#endif
+
+
+int blosum62[24][24] = {
+ { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4},
+ {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4},
+ {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4},
+ {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4},
+ { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
+ {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4},
+ {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
+ { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4},
+ {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4},
+ {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4},
+ {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4},
+ {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4},
+ {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4},
+ {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4},
+ {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
+ { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4},
+ { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4},
+ {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4},
+ {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4},
+ { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4},
+ {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4},
+ {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
+ { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4},
+ {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}
+};
+
+double gettime() {
+ struct timeval t;
+ gettimeofday(&t,NULL);
+ return t.tv_sec+t.tv_usec*1e-6;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+ int
+main( int argc, char** argv)
+{
+ runTest( argc, argv);
+
+ return EXIT_SUCCESS;
+}
+
+void usage(int argc, char **argv)
+{
+ fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> <num_threads>\n", argv[0]);
+ fprintf(stderr, "\t<dimension> - x and y dimensions\n");
+ fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
+ fprintf(stderr, "\t<num_threads> - no. of threads\n");
+ exit(1);
+}
+
+void nw_optimized(int *input_itemsets, int *output_itemsets, int *referrence,
+ int max_rows, int max_cols, int penalty)
+{
+#ifdef OMP_OFFLOAD
+ int transfer_size = max_rows * max_cols;
+#pragma omp target data map(to: max_cols, penalty, referrence[0:transfer_size]) map(input_itemsets[0:transfer_size])
+ {
+
+#pragma omp target
+#endif
+ for( int blk = 1; blk <= (max_cols-1)/BLOCK_SIZE; blk++ )
+ {
+#ifdef OPENMP
+#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty)
+#endif
+ for( int b_index_x = 0; b_index_x < blk; ++b_index_x)
+ {
+ int b_index_y = blk - 1 - b_index_x;
+ int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64)));
+ int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64)));
+
+ // Copy referrence to local memory
+ for ( int i = 0; i < BLOCK_SIZE; ++i )
+ {
+#pragma omp simd
+ for ( int j = 0; j < BLOCK_SIZE; ++j)
+ {
+ reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1];
+ }
+ }
+
+ // Copy input_itemsets to local memory
+ for ( int i = 0; i < BLOCK_SIZE + 1; ++i )
+ {
+#pragma omp simd
+ for ( int j = 0; j < BLOCK_SIZE + 1; ++j)
+ {
+ input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE + j];
+ }
+ }
+
+ // Compute
+ for ( int i = 1; i < BLOCK_SIZE + 1; ++i )
+ {
+ for ( int j = 1; j < BLOCK_SIZE + 1; ++j)
+ {
+ input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1],
+ input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty,
+ input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty);
+ }
+ }
+
+ // Copy results to global memory
+ for ( int i = 0; i < BLOCK_SIZE; ++i )
+ {
+#pragma omp simd
+ for ( int j = 0; j < BLOCK_SIZE; ++j)
+ {
+ input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j + 1];
+ }
+ }
+
+ }
+ }
+
+ printf("Processing bottom-right matrix\n");
+
+#ifdef OMP_OFFLOAD
+#pragma omp target
+#endif
+ for ( int blk = 2; blk <= (max_cols-1)/BLOCK_SIZE; blk++ )
+ {
+#ifdef OPENMP
+#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty)
+#endif
+ for( int b_index_x = blk - 1; b_index_x < (max_cols-1)/BLOCK_SIZE; ++b_index_x)
+ {
+ int b_index_y = (max_cols-1)/BLOCK_SIZE + blk - 2 - b_index_x;
+
+ int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64)));
+ int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64)));
+
+ // Copy referrence to local memory
+ for ( int i = 0; i < BLOCK_SIZE; ++i )
+ {
+#pragma omp simd
+ for ( int j = 0; j < BLOCK_SIZE; ++j)
+ {
+ reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1];
+ }
+ }
+
+ // Copy input_itemsets to local memory
+ for ( int i = 0; i < BLOCK_SIZE + 1; ++i )
+ {
+#pragma omp simd
+ for ( int j = 0; j < BLOCK_SIZE + 1; ++j)
+ {
+ input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE + j];
+ }
+ }
+
+ // Compute
+ for ( int i = 1; i < BLOCK_SIZE + 1; ++i )
+ {
+ for ( int j = 1; j < BLOCK_SIZE + 1; ++j)
+ {
+ input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1],
+ input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty,
+ input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty);
+ }
+ }
+
+ // Copy results to global memory
+ for ( int i = 0; i < BLOCK_SIZE; ++i )
+ {
+#pragma omp simd
+ for ( int j = 0; j < BLOCK_SIZE; ++j)
+ {
+ input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j +1];
+ }
+ }
+ }
+ }
+
+#ifdef OMP_OFFLOAD
+ }
+#endif
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run a simple test for CUDA
+////////////////////////////////////////////////////////////////////////////////
+ void
+runTest( int argc, char** argv)
+{
+ int max_rows, max_cols, penalty;
+ int *input_itemsets, *output_itemsets, *referrence;
+ //int *matrix_cuda, *matrix_cuda_out, *referrence_cuda;
+ //int size;
+ int omp_num_threads;
+
+
+ // the lengths of the two sequences should be able to divided by 16.
+ // And at current stage max_rows needs to equal max_cols
+ if (argc == 4)
+ {
+ max_rows = atoi(argv[1]);
+ max_cols = atoi(argv[1]);
+ penalty = atoi(argv[2]);
+ omp_num_threads = atoi(argv[3]);
+ }
+ else{
+ usage(argc, argv);
+ }
+
+ max_rows = max_rows + 1;
+ max_cols = max_cols + 1;
+ referrence = (int *)malloc( max_rows * max_cols * sizeof(int) );
+ input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+ output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+
+
+ if (!input_itemsets)
+ fprintf(stderr, "error: can not allocate memory");
+
+ srand ( 7 );
+
+ for (int i = 0 ; i < max_cols; i++){
+ for (int j = 0 ; j < max_rows; j++){
+ input_itemsets[i*max_cols+j] = 0;
+ }
+ }
+
+ printf("Start Needleman-Wunsch\n");
+
+ for( int i=1; i< max_rows ; i++){ //please define your own sequence.
+ input_itemsets[i*max_cols] = rand() % 10 + 1;
+ }
+ for( int j=1; j< max_cols ; j++){ //please define your own sequence.
+ input_itemsets[j] = rand() % 10 + 1;
+ }
+
+
+ for (int i = 1 ; i < max_cols; i++){
+ for (int j = 1 ; j < max_rows; j++){
+ referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]];
+ }
+ }
+
+ for( int i = 1; i< max_rows ; i++)
+ input_itemsets[i*max_cols] = -i * penalty;
+ for( int j = 1; j< max_cols ; j++)
+ input_itemsets[j] = -j * penalty;
+
+
+
+ //Compute top-left matrix
+ printf("Num of threads: %d\n", omp_num_threads);
+ printf("Processing top-left matrix\n");
+
+ long long start_time = get_time();
+
+ nw_optimized( input_itemsets, output_itemsets, referrence,
+ max_rows, max_cols, penalty );
+
+ long long end_time = get_time();
+
+ printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000));
+
+#define TRACEBACK
+#ifdef TRACEBACK
+
+ FILE *fpo = fopen("result.txt","w");
+ fprintf(fpo, "print traceback value GPU:\n");
+
+ for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){
+ int nw, n, w, traceback;
+ if ( i == max_rows - 2 && j == max_rows - 2 )
+ fprintf(fpo, "%d ", input_itemsets[ i * max_cols + j]); //print the first element
+ if ( i == 0 && j == 0 )
+ break;
+ if ( i > 0 && j > 0 ){
+ nw = input_itemsets[(i - 1) * max_cols + j - 1];
+ w = input_itemsets[ i * max_cols + j - 1 ];
+ n = input_itemsets[(i - 1) * max_cols + j];
+ }
+ else if ( i == 0 ){
+ nw = n = LIMIT;
+ w = input_itemsets[ i * max_cols + j - 1 ];
+ }
+ else if ( j == 0 ){
+ nw = w = LIMIT;
+ n = input_itemsets[(i - 1) * max_cols + j];
+ }
+ else{
+ }
+
+ //traceback = maximum(nw, w, n);
+ int new_nw, new_w, new_n;
+ new_nw = nw + referrence[i * max_cols + j];
+ new_w = w - penalty;
+ new_n = n - penalty;
+
+ traceback = maximum(new_nw, new_w, new_n);
+ if(traceback == new_nw)
+ traceback = nw;
+ if(traceback == new_w)
+ traceback = w;
+ if(traceback == new_n)
+ traceback = n;
+
+ fprintf(fpo, "%d ", traceback);
+
+ if(traceback == nw )
+ {i--; j--; continue;}
+
+ else if(traceback == w )
+ {j--; continue;}
+
+ else if(traceback == n )
+ {i--; continue;}
+
+ else
+ ;
+ }
+
+ fclose(fpo);
+
+#endif
+
+ free(referrence);
+ free(input_itemsets);
+ free(output_itemsets);
+
+}
+
+
+
diff --git a/NW/baselines/cpu/run b/NW/baselines/cpu/run
new file mode 100644
index 0000000..8c8088a
--- /dev/null
+++ b/NW/baselines/cpu/run
@@ -0,0 +1 @@
+./needle 2048 10 2
diff --git a/NW/baselines/cpu/run_offload b/NW/baselines/cpu/run_offload
new file mode 100644
index 0000000..8c5989a
--- /dev/null
+++ b/NW/baselines/cpu/run_offload
@@ -0,0 +1 @@
+./needle_offload 2048 10 2
diff --git a/NW/baselines/gpu/Makefile b/NW/baselines/gpu/Makefile
new file mode 100644
index 0000000..ebf130f
--- /dev/null
+++ b/NW/baselines/gpu/Makefile
@@ -0,0 +1,28 @@
+include ./common/make.config
+
+CC := $(CUDA_DIR)/bin/nvcc
+
+INCLUDE := $(CUDA_DIR)/include
+
+SRC = needle.cu
+
+EXE = needle
+
+release: $(SRC)
+ $(CC) ${KERNEL_DIM} $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) -DTIMING
+
+clang: $(SRC)
+ clang++ $(SRC) -o $(EXE) -I../util --cuda-gpu-arch=sm_20 \
+ -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -DTIMING
+
+enum: $(SRC)
+ $(CC) ${KERNEL_DIM} -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
+
+debug: $(SRC)
+ $(CC) ${KERNEL_DIM} -g $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
+
+debugenum: $(SRC)
+ $(CC) ${KERNEL_DIM} -g -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
+
+clean: $(SRC)
+ rm -f $(EXE) $(EXE).linkinfo result.txt
diff --git a/NW/baselines/gpu/Makefile_nvidia b/NW/baselines/gpu/Makefile_nvidia
new file mode 100644
index 0000000..330e3d1
--- /dev/null
+++ b/NW/baselines/gpu/Makefile_nvidia
@@ -0,0 +1,50 @@
+################################################################################
+#
+# Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
+#
+# NOTICE TO USER:
+#
+# This source code is subject to NVIDIA ownership rights under U.S. and
+# international Copyright laws.
+#
+# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
+# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+# OR PERFORMANCE OF THIS SOURCE CODE.
+#
+# U.S. Government End Users. This source code is a "commercial item" as
+# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
+# "commercial computer software" and "commercial computer software
+# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+# and is provided to the U.S. Government only as a commercial end item.
+# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+# source code with only those rights set forth herein.
+#
+################################################################################
+#
+# Build script for project
+#
+################################################################################
+
+# Add source files here
+EXECUTABLE := needle
+# CUDA source files (compiled with cudacc)
+CUFILES := needle.cu
+# CUDA dependency files
+CU_DEPS := needle_kernel.cu
+# C/C++ source files (compiled with gcc / c++)
+# CCFILES := BlackScholes_gold.cpp
+
+
+
+################################################################################
+# Rules and targets
+
+include ./common/common.mk
diff --git a/NW/baselines/gpu/README b/NW/baselines/gpu/README
new file mode 100644
index 0000000..00d3695
--- /dev/null
+++ b/NW/baselines/gpu/README
@@ -0,0 +1,23 @@
+Needleman-Wunsch (NW)
+
+Compilation instructions
+
+ make
+
+Execution instructions
+
+ ./needle 46080 10
+
+
+Note: This program generate two sequences randomly. Please specify your own sequences for different uses.
+ At the current stage, the program only supports two sequences with the same lengh, which can be divided by 16.
+Usage: needle 32 10
+ 32 //the length of both sequences
+ 10 //penalty value
+
+******Adjustable work group size*****
+RD_WG_SIZE_0 or RD_WG_SIZE_0_0
+
+USAGE:
+make clean
+make KERNEL_DIM="-DRD_WG_SIZE_0=16"
diff --git a/NW/baselines/gpu/common/common.mk b/NW/baselines/gpu/common/common.mk
new file mode 100644
index 0000000..4a5d800
--- /dev/null
+++ b/NW/baselines/gpu/common/common.mk
@@ -0,0 +1,341 @@
+################################################################################
+#
+# Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
+#
+# NOTICE TO USER:
+#
+# This source code is subject to NVIDIA ownership rights under U.S. and
+# international Copyright laws.
+#
+# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
+# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+# OR PERFORMANCE OF THIS SOURCE CODE.
+#
+# U.S. Government End Users. This source code is a "commercial item" as
+# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
+# "commercial computer software" and "commercial computer software
+# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+# and is provided to the U.S. Government only as a commercial end item.
+# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+# source code with only those rights set forth herein.
+#
+################################################################################
+#
+# Common build script
+#
+################################################################################
+
+.SUFFIXES : .cu .cu_dbg_o .c_dbg_o .cpp_dbg_o .cu_rel_o .c_rel_o .cpp_rel_o .cubin
+
+# Add new SM Versions here as devices with new Compute Capability are released
+SM_VERSIONS := sm_10 sm_11 sm_12 sm_13
+
+CUDA_INSTALL_PATH ?= /usr/local/cuda
+
+ifdef cuda-install
+ CUDA_INSTALL_PATH := $(cuda-install)
+endif
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
+# 'linux' is output for Linux system, 'darwin' for OS X
+DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
+
+# Basic directory setup for SDK
+# (override directories only if they are not already defined)
+SRCDIR ?=
+ROOTDIR ?= ..
+ROOTBINDIR ?= $(ROOTDIR)/../bin
+BINDIR ?= $(ROOTBINDIR)/$(OSLOWER)
+ROOTOBJDIR ?= obj
+LIBDIR := $(ROOTDIR)/../lib
+COMMONDIR := $(ROOTDIR)/../common
+
+# Compilers
+NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
+CXX := g++
+CC := gcc
+LINK := g++ -fPIC
+
+# Includes
+INCLUDES += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc
+
+# architecture flag for cubin build
+CUBIN_ARCH_FLAG := -m32
+
+# Warning flags
+CXXWARN_FLAGS := \
+ -W -Wall \
+ -Wimplicit \
+ -Wswitch \
+ -Wformat \
+ -Wchar-subscripts \
+ -Wparentheses \
+ -Wmultichar \
+ -Wtrigraphs \
+ -Wpointer-arith \
+ -Wcast-align \
+ -Wreturn-type \
+ -Wno-unused-function \
+ $(SPACE)
+
+CWARN_FLAGS := $(CXXWARN_FLAGS) \
+ -Wstrict-prototypes \
+ -Wmissing-prototypes \
+ -Wmissing-declarations \
+ -Wnested-externs \
+ -Wmain \
+
+# Compiler-specific flags
+NVCCFLAGS :=
+CXXFLAGS := $(CXXWARN_FLAGS)
+CFLAGS := $(CWARN_FLAGS)
+
+# Common flags
+COMMONFLAGS += $(INCLUDES) -DUNIX
+
+# Debug/release configuration
+ifeq ($(dbg),1)
+ COMMONFLAGS += -g
+ NVCCFLAGS += -D_DEBUG
+ BINSUBDIR := debug
+ LIBSUFFIX := D
+else
+ COMMONFLAGS += -O3
+ BINSUBDIR := release
+ LIBSUFFIX :=
+ NVCCFLAGS += --compiler-options -fno-strict-aliasing
+ CXXFLAGS += -fno-strict-aliasing
+ CFLAGS += -fno-strict-aliasing
+endif
+
+# append optional arch/SM version flags (such as -arch sm_11)
+#NVCCFLAGS += $(SMVERSIONFLAGS)
+
+# architecture flag for cubin build
+CUBIN_ARCH_FLAG := -m32
+
+# detect if 32 bit or 64 bit system
+HP_64 = $(shell uname -m | grep 64)
+
+# OpenGL is used or not (if it is used, then it is necessary to include GLEW)
+ifeq ($(USEGLLIB),1)
+
+ ifneq ($(DARWIN),)
+ OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a
+ else
+ OPENGLLIB := -lGL -lGLU
+
+ ifeq "$(strip $(HP_64))" ""
+ OPENGLLIB += -lGLEW
+ else
+ OPENGLLIB += -lGLEW_x86_64
+ endif
+ endif
+
+ CUBIN_ARCH_FLAG := -m64
+endif
+
+ifeq ($(USEGLUT),1)
+ ifneq ($(DARWIN),)
+ OPENGLLIB += -framework GLUT
+ else
+ OPENGLLIB += -lglut
+ endif
+endif
+
+ifeq ($(USEPARAMGL),1)
+ PARAMGLLIB := -lparamgl$(LIBSUFFIX)
+endif
+
+ifeq ($(USERENDERCHECKGL),1)
+ RENDERCHECKGLLIB := -lrendercheckgl$(LIBSUFFIX)
+endif
+
+ifeq ($(USECUDPP), 1)
+ ifeq "$(strip $(HP_64))" ""
+ CUDPPLIB := -lcudpp
+ else
+ CUDPPLIB := -lcudpp64
+ endif
+
+ CUDPPLIB := $(CUDPPLIB)$(LIBSUFFIX)
+
+ ifeq ($(emu), 1)
+ CUDPPLIB := $(CUDPPLIB)_emu
+ endif
+endif
+
+# Libs
+LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER)
+ifeq ($(USEDRVAPI),1)
+ LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
+else
+ LIB += -lcudart ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
+endif
+
+ifeq ($(USECUFFT),1)
+ ifeq ($(emu),1)
+ LIB += -lcufftemu
+ else
+ LIB += -lcufft
+ endif
+endif
+
+ifeq ($(USECUBLAS),1)
+ ifeq ($(emu),1)
+ LIB += -lcublasemu
+ else
+ LIB += -lcublas
+ endif
+endif
+
+# Lib/exe configuration
+ifneq ($(STATIC_LIB),)
+ TARGETDIR := $(LIBDIR)
+ TARGET := $(subst .a,$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB))
+ LINKLINE = ar qv $(TARGET) $(OBJS)
+else
+ # LIB += -lcutil$(LIBSUFFIX)
+ # Device emulation configuration
+ ifeq ($(emu), 1)
+ NVCCFLAGS += -deviceemu
+ CUDACCFLAGS +=
+ BINSUBDIR := emu$(BINSUBDIR)
+ # consistency, makes developing easier
+ CXXFLAGS += -D__DEVICE_EMULATION__
+ CFLAGS += -D__DEVICE_EMULATION__
+ endif
+ TARGETDIR := $(BINDIR)/$(BINSUBDIR)
+ TARGET := $(TARGETDIR)/$(EXECUTABLE)
+ LINKLINE = $(LINK) -o $(TARGET) $(OBJS) $(LIB)
+endif
+
+# check if verbose
+ifeq ($(verbose), 1)
+ VERBOSE :=
+else
+ VERBOSE := @
+endif
+
+################################################################################
+# Check for input flags and set compiler flags appropriately
+################################################################################
+ifeq ($(fastmath), 1)
+ NVCCFLAGS += -use_fast_math
+endif
+
+ifeq ($(keep), 1)
+ NVCCFLAGS += -keep
+ NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx
+endif
+
+ifdef maxregisters
+ NVCCFLAGS += -maxrregcount $(maxregisters)
+endif
+
+# Add cudacc flags
+NVCCFLAGS += $(CUDACCFLAGS)
+
+# workaround for mac os x cuda 1.1 compiler issues
+ifneq ($(DARWIN),)
+ NVCCFLAGS += --host-compilation=C
+endif
+
+# Add common flags
+NVCCFLAGS += $(COMMONFLAGS)
+CXXFLAGS += $(COMMONFLAGS)
+CFLAGS += $(COMMONFLAGS)
+
+ifeq ($(nvcc_warn_verbose),1)
+ NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS))
+ NVCCFLAGS += --compiler-options -fno-strict-aliasing
+endif
+
+################################################################################
+# Set up object files
+################################################################################
+OBJDIR := $(ROOTOBJDIR)/$(BINSUBDIR)
+OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp_o,$(notdir $(CCFILES)))
+OBJS += $(patsubst %.c,$(OBJDIR)/%.c_o,$(notdir $(CFILES)))
+OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_o,$(notdir $(CUFILES)))
+
+################################################################################
+# Set up cubin files
+################################################################################
+CUBINDIR := $(SRCDIR)data
+CUBINS += $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES)))
+
+################################################################################
+# Rules
+################################################################################
+$(OBJDIR)/%.c_o : $(SRCDIR)%.c $(C_DEPS)
+ $(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $<
+
+$(OBJDIR)/%.cpp_o : $(SRCDIR)%.cpp $(C_DEPS)
+ $(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+$(OBJDIR)/%.cu_o : $(SRCDIR)%.cu $(CU_DEPS)
+ $(VERBOSE)$(NVCC) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<
+
+$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory
+ $(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $<
+
+#
+# The following definition is a template that gets instantiated for each SM
+# version (sm_10, sm_13, etc.) stored in SMVERSIONS. It does 2 things:
+# 1. It adds to OBJS a .cu_sm_XX_o for each .cu file it finds in CUFILES_sm_XX.
+# 2. It generates a rule for building .cu_sm_XX_o files from the corresponding
+# .cu file.
+#
+# The intended use for this is to allow Makefiles that use common.mk to compile
+# files to different Compute Capability targets (aka SM arch version). To do
+# so, in the Makefile, list files for each SM arch separately, like so:
+#
+# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu
+# CUFILES_sm_12 := anothercudakernel_sm12.cu
+#
+define SMVERSION_template
+OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1)_o,$(notdir $(CUFILES_$(1))))
+$(OBJDIR)/%.cu_$(1)_o : $(SRCDIR)%.cu $(CU_DEPS)
+ $(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) -arch $(1)
+endef
+
+# This line invokes the above template for each arch version stored in
+# SM_VERSIONS. The call funtion invokes the template, and the eval
+# function interprets it as make commands.
+$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))
+
+$(TARGET): makedirectories $(OBJS) $(CUBINS) Makefile
+ $(VERBOSE)$(LINKLINE)
+
+cubindirectory:
+ $(VERBOSE)mkdir -p $(CUBINDIR)
+
+makedirectories:
+ $(VERBOSE)mkdir -p $(LIBDIR)
+ $(VERBOSE)mkdir -p $(OBJDIR)
+ $(VERBOSE)mkdir -p $(TARGETDIR)
+
+
+tidy :
+ $(VERBOSE)find . | egrep "#" | xargs rm -f
+ $(VERBOSE)find . | egrep "\~" | xargs rm -f
+
+clean : tidy
+ $(VERBOSE)rm -f $(OBJS)
+ $(VERBOSE)rm -f $(CUBINS)
+ $(VERBOSE)rm -f $(TARGET)
+ $(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)
+
+clobber : clean
+ $(VERBOSE)rm -rf $(ROOTOBJDIR)
diff --git a/NW/baselines/gpu/common/make.config b/NW/baselines/gpu/common/make.config
new file mode 100644
index 0000000..38c3157
--- /dev/null
+++ b/NW/baselines/gpu/common/make.config
@@ -0,0 +1,40 @@
+# CUDA toolkit installation path
+CUDA_DIR = /usr/local/cuda
+
+# CUDA toolkit libraries
+CUDA_LIB_DIR := $(CUDA_DIR)/lib
+ifeq ($(shell uname -m), x86_64)
+ ifeq ($(shell if test -d $(CUDA_DIR)/lib64; then echo T; else echo F; fi), T)
+ CUDA_LIB_DIR := $(CUDA_DIR)/lib64
+ endif
+endif
+
+# CUDA SDK installation path
+SDK_DIR = /usr/local/cuda/samples/
+
+# OPENCL
+
+# NVIDIA_DIR
+NV_OPENCL_DIR =/usr/local/cuda
+NV_OPENCL_INC = $(NV_OPENCL_DIR)/include
+NV_OPENCL_LIB = $(NV_OPENCL_DIR)/lib64
+
+# INTEL_DIR
+INTEL_OPENCL_DIR = /opt/intel/opencl
+INTEL_OPENCL_INC = $(INTEL_OPENCL_DIR)/include
+INTEL_OPENCL_LIB = $(INTEL_OPENCL_DIR)
+
+# AMD_DIR
+# OPENCL_DIR = /usr/local/cuda
+# OPENCL_INC = $(OPENCL_DIR)/include/
+# OPENCL_LIB = $(OPENCL_DIR)/lib/x86_64/ -lOpenCL
+#ifeq ($(shell uname -m), x86_64)
+# ifeq ($(shell if test -d $(OPENCL_DIR)/lib/x86_64/; then echo T; else echo F; fi), T)
+# OPENCL_LIB = $(OPENCL_DIR)/lib/x86_64/
+# endif
+#endif
+
+# DEFAULT OCL
+OPENCL_DIR = $(NV_OPENCL_DIR)
+OPENCL_INC = $(NV_OPENCL_INC)
+OPENCL_LIB = $(NV_OPENCL_LIB)
diff --git a/NW/baselines/gpu/needle.cu b/NW/baselines/gpu/needle.cu
new file mode 100644
index 0000000..697a9e7
--- /dev/null
+++ b/NW/baselines/gpu/needle.cu
@@ -0,0 +1,266 @@
+#define LIMIT -999
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "needle.h"
+#include <cuda.h>
+#include <sys/time.h>
+
+// includes, kernels
+#include "needle_kernel.cu"
+
+#ifdef TIMING
+#include "timing.h"
+
+struct timeval tv;
+struct timeval tv_total_start, tv_total_end;
+struct timeval tv_h2d_start, tv_h2d_end;
+struct timeval tv_d2h_start, tv_d2h_end;
+struct timeval tv_kernel_start, tv_kernel_end;
+struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
+struct timeval tv_close_start, tv_close_end;
+float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
+ d2h_time = 0, close_time = 0, total_time = 0;
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest( int argc, char** argv);
+
+
+int blosum62[24][24] = {
+ { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4},
+ {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4},
+ {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4},
+ {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4},
+ { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
+ {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4},
+ {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
+ { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4},
+ {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4},
+ {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4},
+ {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4},
+ {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4},
+ {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4},
+ {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4},
+ {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
+ { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4},
+ { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4},
+ {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4},
+ {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4},
+ { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4},
+ {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4},
+ {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
+ { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4},
+ {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}
+};
+
+double gettime() {
+ struct timeval t;
+ gettimeofday(&t,NULL);
+ return t.tv_sec+t.tv_usec*1e-6;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+ int
+main( int argc, char** argv)
+{
+
+ printf("WG size of kernel = %d \n", BLOCK_SIZE);
+
+ runTest( argc, argv);
+
+ return EXIT_SUCCESS;
+}
+
+void usage(int argc, char **argv)
+{
+ fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
+ fprintf(stderr, "\t<dimension> - x and y dimensions\n");
+ fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
+ exit(1);
+}
+
+void runTest( int argc, char** argv)
+{
+ int max_rows, max_cols, penalty;
+ int *input_itemsets, *output_itemsets, *referrence;
+ int *matrix_cuda, *referrence_cuda;
+ int size;
+
+
+ // the lengths of the two sequences should be able to divided by 16.
+ // And at current stage max_rows needs to equal max_cols
+ if (argc == 3)
+ {
+ max_rows = atoi(argv[1]);
+ max_cols = atoi(argv[1]);
+ penalty = atoi(argv[2]);
+ }
+ else{
+ usage(argc, argv);
+ }
+
+ if(atoi(argv[1])%16!=0){
+ fprintf(stderr,"The dimension values must be a multiple of 16\n");
+ exit(1);
+ }
+
+
+ max_rows = max_rows + 1;
+ max_cols = max_cols + 1;
+ referrence = (int *)malloc( max_rows * max_cols * sizeof(int) );
+ input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+ output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+
+
+ if (!input_itemsets)
+ fprintf(stderr, "error: can not allocate memory");
+
+ srand ( 7 );
+
+
+ for (int i = 0 ; i < max_cols; i++){
+ for (int j = 0 ; j < max_rows; j++){
+ input_itemsets[i*max_cols+j] = 0;
+ }
+ }
+
+ printf("Start Needleman-Wunsch\n");
+
+ for( int i=1; i< max_rows ; i++){ //please define your own sequence.
+ input_itemsets[i*max_cols] = rand() % 10 + 1;
+ }
+ for( int j=1; j< max_cols ; j++){ //please define your own sequence.
+ input_itemsets[j] = rand() % 10 + 1;
+ }
+
+
+ for (int i = 1 ; i < max_cols; i++){
+ for (int j = 1 ; j < max_rows; j++){
+ referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]];
+ }
+ }
+
+ for( int i = 1; i< max_rows ; i++)
+ input_itemsets[i*max_cols] = -i * penalty;
+ for( int j = 1; j< max_cols ; j++)
+ input_itemsets[j] = -j * penalty;
+
+
+ size = max_cols * max_rows;
+ cudaMalloc((void**)& referrence_cuda, sizeof(int)*size);
+ cudaMalloc((void**)& matrix_cuda, sizeof(int)*size);
+
+ cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, cudaMemcpyHostToDevice);
+ cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, cudaMemcpyHostToDevice);
+
+ dim3 dimGrid;
+ dim3 dimBlock(BLOCK_SIZE, 1);
+ int block_width = ( max_cols - 1 )/BLOCK_SIZE;
+
+#ifdef TIMING
+ gettimeofday(&tv_kernel_start, NULL);
+#endif
+
+ printf("Processing top-left matrix\n");
+ //process top-left matrix
+ for( int i = 1 ; i <= block_width ; i++){
+ dimGrid.x = i;
+ dimGrid.y = 1;
+ needle_cuda_shared_1<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
+ ,max_cols, penalty, i, block_width);
+ }
+ printf("Processing bottom-right matrix\n");
+ //process bottom-right matrix
+ for( int i = block_width - 1 ; i >= 1 ; i--){
+ dimGrid.x = i;
+ dimGrid.y = 1;
+ needle_cuda_shared_2<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
+ ,max_cols, penalty, i, block_width);
+ }
+
+#ifdef TIMING
+ gettimeofday(&tv_kernel_end, NULL);
+ tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
+ kernel_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+ cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost);
+
+ //#define TRACEBACK
+#ifdef TRACEBACK
+
+ FILE *fpo = fopen("result.txt","w");
+ fprintf(fpo, "print traceback value GPU:\n");
+
+ for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){
+ int nw, n, w, traceback;
+ if ( i == max_rows - 2 && j == max_rows - 2 )
+ fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element
+ if ( i == 0 && j == 0 )
+ break;
+ if ( i > 0 && j > 0 ){
+ nw = output_itemsets[(i - 1) * max_cols + j - 1];
+ w = output_itemsets[ i * max_cols + j - 1 ];
+ n = output_itemsets[(i - 1) * max_cols + j];
+ }
+ else if ( i == 0 ){
+ nw = n = LIMIT;
+ w = output_itemsets[ i * max_cols + j - 1 ];
+ }
+ else if ( j == 0 ){
+ nw = w = LIMIT;
+ n = output_itemsets[(i - 1) * max_cols + j];
+ }
+ else{
+ }
+
+ //traceback = maximum(nw, w, n);
+ int new_nw, new_w, new_n;
+ new_nw = nw + referrence[i * max_cols + j];
+ new_w = w - penalty;
+ new_n = n - penalty;
+
+ traceback = maximum(new_nw, new_w, new_n);
+ if(traceback == new_nw)
+ traceback = nw;
+ if(traceback == new_w)
+ traceback = w;
+ if(traceback == new_n)
+ traceback = n;
+
+ fprintf(fpo, "%d ", traceback);
+
+ if(traceback == nw )
+ {i--; j--; continue;}
+
+ else if(traceback == w )
+ {j--; continue;}
+
+ else if(traceback == n )
+ {i--; continue;}
+
+ else
+ ;
+ }
+
+ fclose(fpo);
+
+#endif
+
+ cudaFree(referrence_cuda);
+ cudaFree(matrix_cuda);
+
+ free(referrence);
+ free(input_itemsets);
+ free(output_itemsets);
+
+#ifdef TIMING
+ printf("Exec: %f\n", kernel_time);
+#endif
+}
+
diff --git a/NW/baselines/gpu/needle.h b/NW/baselines/gpu/needle.h
new file mode 100644
index 0000000..6ffef52
--- /dev/null
+++ b/NW/baselines/gpu/needle.h
@@ -0,0 +1,11 @@
+#ifdef RD_WG_SIZE_0_0
+ #define BLOCK_SIZE RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+ #define BLOCK_SIZE RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+ #define BLOCK_SIZE RD_WG_SIZE
+#else
+ #define BLOCK_SIZE 16
+#endif
+//#define TRACE
+
diff --git a/NW/baselines/gpu/needle_kernel.cu b/NW/baselines/gpu/needle_kernel.cu
new file mode 100644
index 0000000..72f8501
--- /dev/null
+++ b/NW/baselines/gpu/needle_kernel.cu
@@ -0,0 +1,188 @@
+#include "needle.h"
+#include <stdio.h>
+
+
+#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
+
+__device__ __host__ int
+maximum( int a,
+ int b,
+ int c){
+
+ int k;
+ if( a <= b )
+ k = b;
+ else
+ k = a;
+
+ if( k <=c )
+ return(c);
+ else
+ return(k);
+
+}
+
+__global__ void
+needle_cuda_shared_1( int* referrence,
+ int* matrix_cuda,
+ int cols,
+ int penalty,
+ int i,
+ int block_width)
+{
+ int bx = blockIdx.x;
+ int tx = threadIdx.x;
+
+ int b_index_x = bx;
+ int b_index_y = i - 1 - bx;
+
+ int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 );
+ int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 );
+ int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols );
+ int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;
+
+ __shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1];
+ __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];
+
+ if (tx == 0)
+ temp[tx][0] = matrix_cuda[index_nw];
+
+
+ for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
+ ref[ty][tx] = referrence[index + cols * ty];
+
+ __syncthreads();
+
+ temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
+
+ __syncthreads();
+
+ temp[0][tx + 1] = matrix_cuda[index_n];
+
+ __syncthreads();
+
+
+ for( int m = 0 ; m < BLOCK_SIZE ; m++){
+
+ if ( tx <= m ){
+
+ int t_index_x = tx + 1;
+ int t_index_y = m - tx + 1;
+
+ temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
+ temp[t_index_y][t_index_x-1] - penalty,
+ temp[t_index_y-1][t_index_x] - penalty);
+
+
+
+ }
+
+ __syncthreads();
+
+ }
+
+ for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){
+
+ if ( tx <= m){
+
+ int t_index_x = tx + BLOCK_SIZE - m ;
+ int t_index_y = BLOCK_SIZE - tx;
+
+ temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
+ temp[t_index_y][t_index_x-1] - penalty,
+ temp[t_index_y-1][t_index_x] - penalty);
+
+ }
+
+ __syncthreads();
+ }
+
+ for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
+ matrix_cuda[index + ty * cols] = temp[ty+1][tx+1];
+
+}
+
+
+__global__ void
+needle_cuda_shared_2( int* referrence,
+ int* matrix_cuda,
+
+ int cols,
+ int penalty,
+ int i,
+ int block_width)
+{
+
+ int bx = blockIdx.x;
+ int tx = threadIdx.x;
+
+ int b_index_x = bx + block_width - i ;
+ int b_index_y = block_width - bx -1;
+
+ int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 );
+ int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 );
+ int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols );
+ int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;
+
+ __shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1];
+ __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];
+
+ for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
+ ref[ty][tx] = referrence[index + cols * ty];
+
+ __syncthreads();
+
+ if (tx == 0)
+ temp[tx][0] = matrix_cuda[index_nw];
+
+
+ temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
+
+ __syncthreads();
+
+ temp[0][tx + 1] = matrix_cuda[index_n];
+
+ __syncthreads();
+
+
+ for( int m = 0 ; m < BLOCK_SIZE ; m++){
+
+ if ( tx <= m ){
+
+ int t_index_x = tx + 1;
+ int t_index_y = m - tx + 1;
+
+ temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
+ temp[t_index_y][t_index_x-1] - penalty,
+ temp[t_index_y-1][t_index_x] - penalty);
+
+ }
+
+ __syncthreads();
+
+ }
+
+
+ for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){
+
+ if ( tx <= m){
+
+ int t_index_x = tx + BLOCK_SIZE - m ;
+ int t_index_y = BLOCK_SIZE - tx;
+
+ temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
+ temp[t_index_y][t_index_x-1] - penalty,
+ temp[t_index_y-1][t_index_x] - penalty);
+
+
+ }
+
+ __syncthreads();
+ }
+
+
+ for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
+ matrix_cuda[index + ty * cols] = temp[ty+1][tx+1];
+
+}
+
diff --git a/NW/baselines/gpu/run b/NW/baselines/gpu/run
new file mode 100644
index 0000000..11e9559
--- /dev/null
+++ b/NW/baselines/gpu/run
@@ -0,0 +1 @@
+./needle 2048 10
diff --git a/NW/baselines/gpu/timing.h b/NW/baselines/gpu/timing.h
new file mode 100644
index 0000000..6ef8813
--- /dev/null
+++ b/NW/baselines/gpu/timing.h
@@ -0,0 +1,22 @@
+#ifndef __TIMING_H__
+#define __TIMING_H__
+
+#include <sys/time.h>
+
+void time_measure_start(struct timeval *tv);
+void time_measure_end(struct timeval *tv);
+
+/* tvsub: ret = x - y. */
+static inline void tvsub(struct timeval *x,
+ struct timeval *y,
+ struct timeval *ret)
+{
+ ret->tv_sec = x->tv_sec - y->tv_sec;
+ ret->tv_usec = x->tv_usec - y->tv_usec;
+ if (ret->tv_usec < 0) {
+ ret->tv_sec--;
+ ret->tv_usec += 1000000;
+ }
+}
+
+#endif \ No newline at end of file