diff options
Diffstat (limited to 'NW/baselines')
-rw-r--r-- | NW/baselines/cpu/Makefile | 16 | ||||
-rw-r--r-- | NW/baselines/cpu/README | 9 | ||||
-rw-r--r-- | NW/baselines/cpu/needle.cpp | 382 | ||||
-rw-r--r-- | NW/baselines/cpu/run | 1 | ||||
-rw-r--r-- | NW/baselines/cpu/run_offload | 1 | ||||
-rw-r--r-- | NW/baselines/gpu/Makefile | 28 | ||||
-rw-r--r-- | NW/baselines/gpu/Makefile_nvidia | 50 | ||||
-rw-r--r-- | NW/baselines/gpu/README | 23 | ||||
-rw-r--r-- | NW/baselines/gpu/common/common.mk | 341 | ||||
-rw-r--r-- | NW/baselines/gpu/common/make.config | 40 | ||||
-rw-r--r-- | NW/baselines/gpu/needle.cu | 266 | ||||
-rw-r--r-- | NW/baselines/gpu/needle.h | 11 | ||||
-rw-r--r-- | NW/baselines/gpu/needle_kernel.cu | 188 | ||||
-rw-r--r-- | NW/baselines/gpu/run | 1 | ||||
-rw-r--r-- | NW/baselines/gpu/timing.h | 22 |
15 files changed, 1379 insertions, 0 deletions
diff --git a/NW/baselines/cpu/Makefile b/NW/baselines/cpu/Makefile new file mode 100644 index 0000000..f49dd2f --- /dev/null +++ b/NW/baselines/cpu/Makefile @@ -0,0 +1,16 @@ +# C compiler +CC = g++ +ICC = icc +CC_FLAGS = -g -O3 -fopenmp +OFFLOAD_CC_FLAGS = -offload-option,mic,compiler,"-no-opt-prefetch" + +all: needle needle_offload + +needle: + $(CC) $(CC_FLAGS) needle.cpp -o needle + +needle_offload: + $(ICC) $(CC_FLAGS) $(OFFLOAD_CC_FLAGS) -DOMP_OFFLOAD needle.cpp -o needle_offload + +clean: + rm -f needle needle_offload diff --git a/NW/baselines/cpu/README b/NW/baselines/cpu/README new file mode 100644 index 0000000..f3fd2c2 --- /dev/null +++ b/NW/baselines/cpu/README @@ -0,0 +1,9 @@ +Needleman-Wunsch (NW) + +Compilation instructions + + make + +Execution instructions + + ./needle 46080 10 4 diff --git a/NW/baselines/cpu/needle.cpp b/NW/baselines/cpu/needle.cpp new file mode 100644 index 0000000..0f1e2b6 --- /dev/null +++ b/NW/baselines/cpu/needle.cpp @@ -0,0 +1,382 @@ +#define LIMIT -999 +//#define TRACE +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include <sys/time.h> +#include <omp.h> +#define OPENMP +//#define NUM_THREAD 4 + +#define BLOCK_SIZE 16 + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +void runTest( int argc, char** argv); + +// Returns the current system time in microseconds +long long get_time() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000000) + tv.tv_usec; + +} + +#ifdef OMP_OFFLOAD +#pragma omp declare target +#endif +int maximum( int a, + int b, + int c){ + + int k; + if( a <= b ) + k = b; + else + k = a; + + if( k <=c ) + return(c); + else + return(k); +} +#ifdef OMP_OFFLOAD +#pragma omp end declare target +#endif + + +int blosum62[24][24] = { + { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, + {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, + {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, + {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, + {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, + {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, + {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, + {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, + {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, + {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, + {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, + {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, + {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, + { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, + { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, + {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, + {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, + { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, + {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, + {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1} +}; + +double gettime() { + struct timeval t; + gettimeofday(&t,NULL); + return t.tv_sec+t.tv_usec*1e-6; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// + int +main( int argc, char** argv) +{ + runTest( argc, argv); + + return EXIT_SUCCESS; +} + +void usage(int argc, char **argv) +{ + fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> <num_threads>\n", argv[0]); + fprintf(stderr, "\t<dimension> - x and y dimensions\n"); + fprintf(stderr, "\t<penalty> - penalty(positive integer)\n"); + fprintf(stderr, "\t<num_threads> - no. of threads\n"); + exit(1); +} + +void nw_optimized(int *input_itemsets, int *output_itemsets, int *referrence, + int max_rows, int max_cols, int penalty) +{ +#ifdef OMP_OFFLOAD + int transfer_size = max_rows * max_cols; +#pragma omp target data map(to: max_cols, penalty, referrence[0:transfer_size]) map(input_itemsets[0:transfer_size]) + { + +#pragma omp target +#endif + for( int blk = 1; blk <= (max_cols-1)/BLOCK_SIZE; blk++ ) + { +#ifdef OPENMP +#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty) +#endif + for( int b_index_x = 0; b_index_x < blk; ++b_index_x) + { + int b_index_y = blk - 1 - b_index_x; + int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64))); + int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64))); + + // Copy referrence to local memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1]; + } + } + + // Copy input_itemsets to local memory + for ( int i = 0; i < BLOCK_SIZE + 1; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE + j]; + } + } + + // Compute + for ( int i = 1; i < BLOCK_SIZE + 1; ++i ) + { + for ( int j = 1; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1], + input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty, + input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty); + } + } + + // Copy results to global memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j + 1]; + } + } + + } + } + + printf("Processing bottom-right matrix\n"); + +#ifdef OMP_OFFLOAD +#pragma omp target +#endif + for ( int blk = 2; blk <= (max_cols-1)/BLOCK_SIZE; blk++ ) + { +#ifdef OPENMP +#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty) +#endif + for( int b_index_x = blk - 1; b_index_x < (max_cols-1)/BLOCK_SIZE; ++b_index_x) + { + int b_index_y = (max_cols-1)/BLOCK_SIZE + blk - 2 - b_index_x; + + int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64))); + int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64))); + + // Copy referrence to local memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1]; + } + } + + // Copy input_itemsets to local memory + for ( int i = 0; i < BLOCK_SIZE + 1; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE + j]; + } + } + + // Compute + for ( int i = 1; i < BLOCK_SIZE + 1; ++i ) + { + for ( int j = 1; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1], + input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty, + input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty); + } + } + + // Copy results to global memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j +1]; + } + } + } + } + +#ifdef OMP_OFFLOAD + } +#endif + +} + +//////////////////////////////////////////////////////////////////////////////// +//! Run a simple test for CUDA +//////////////////////////////////////////////////////////////////////////////// + void +runTest( int argc, char** argv) +{ + int max_rows, max_cols, penalty; + int *input_itemsets, *output_itemsets, *referrence; + //int *matrix_cuda, *matrix_cuda_out, *referrence_cuda; + //int size; + int omp_num_threads; + + + // the lengths of the two sequences should be able to divided by 16. + // And at current stage max_rows needs to equal max_cols + if (argc == 4) + { + max_rows = atoi(argv[1]); + max_cols = atoi(argv[1]); + penalty = atoi(argv[2]); + omp_num_threads = atoi(argv[3]); + } + else{ + usage(argc, argv); + } + + max_rows = max_rows + 1; + max_cols = max_cols + 1; + referrence = (int *)malloc( max_rows * max_cols * sizeof(int) ); + input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + + + if (!input_itemsets) + fprintf(stderr, "error: can not allocate memory"); + + srand ( 7 ); + + for (int i = 0 ; i < max_cols; i++){ + for (int j = 0 ; j < max_rows; j++){ + input_itemsets[i*max_cols+j] = 0; + } + } + + printf("Start Needleman-Wunsch\n"); + + for( int i=1; i< max_rows ; i++){ //please define your own sequence. + input_itemsets[i*max_cols] = rand() % 10 + 1; + } + for( int j=1; j< max_cols ; j++){ //please define your own sequence. + input_itemsets[j] = rand() % 10 + 1; + } + + + for (int i = 1 ; i < max_cols; i++){ + for (int j = 1 ; j < max_rows; j++){ + referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]]; + } + } + + for( int i = 1; i< max_rows ; i++) + input_itemsets[i*max_cols] = -i * penalty; + for( int j = 1; j< max_cols ; j++) + input_itemsets[j] = -j * penalty; + + + + //Compute top-left matrix + printf("Num of threads: %d\n", omp_num_threads); + printf("Processing top-left matrix\n"); + + long long start_time = get_time(); + + nw_optimized( input_itemsets, output_itemsets, referrence, + max_rows, max_cols, penalty ); + + long long end_time = get_time(); + + printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000)); + +#define TRACEBACK +#ifdef TRACEBACK + + FILE *fpo = fopen("result.txt","w"); + fprintf(fpo, "print traceback value GPU:\n"); + + for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){ + int nw, n, w, traceback; + if ( i == max_rows - 2 && j == max_rows - 2 ) + fprintf(fpo, "%d ", input_itemsets[ i * max_cols + j]); //print the first element + if ( i == 0 && j == 0 ) + break; + if ( i > 0 && j > 0 ){ + nw = input_itemsets[(i - 1) * max_cols + j - 1]; + w = input_itemsets[ i * max_cols + j - 1 ]; + n = input_itemsets[(i - 1) * max_cols + j]; + } + else if ( i == 0 ){ + nw = n = LIMIT; + w = input_itemsets[ i * max_cols + j - 1 ]; + } + else if ( j == 0 ){ + nw = w = LIMIT; + n = input_itemsets[(i - 1) * max_cols + j]; + } + else{ + } + + //traceback = maximum(nw, w, n); + int new_nw, new_w, new_n; + new_nw = nw + referrence[i * max_cols + j]; + new_w = w - penalty; + new_n = n - penalty; + + traceback = maximum(new_nw, new_w, new_n); + if(traceback == new_nw) + traceback = nw; + if(traceback == new_w) + traceback = w; + if(traceback == new_n) + traceback = n; + + fprintf(fpo, "%d ", traceback); + + if(traceback == nw ) + {i--; j--; continue;} + + else if(traceback == w ) + {j--; continue;} + + else if(traceback == n ) + {i--; continue;} + + else + ; + } + + fclose(fpo); + +#endif + + free(referrence); + free(input_itemsets); + free(output_itemsets); + +} + + + diff --git a/NW/baselines/cpu/run b/NW/baselines/cpu/run new file mode 100644 index 0000000..8c8088a --- /dev/null +++ b/NW/baselines/cpu/run @@ -0,0 +1 @@ +./needle 2048 10 2 diff --git a/NW/baselines/cpu/run_offload b/NW/baselines/cpu/run_offload new file mode 100644 index 0000000..8c5989a --- /dev/null +++ b/NW/baselines/cpu/run_offload @@ -0,0 +1 @@ +./needle_offload 2048 10 2 diff --git a/NW/baselines/gpu/Makefile b/NW/baselines/gpu/Makefile new file mode 100644 index 0000000..ebf130f --- /dev/null +++ b/NW/baselines/gpu/Makefile @@ -0,0 +1,28 @@ +include ./common/make.config + +CC := $(CUDA_DIR)/bin/nvcc + +INCLUDE := $(CUDA_DIR)/include + +SRC = needle.cu + +EXE = needle + +release: $(SRC) + $(CC) ${KERNEL_DIM} $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) -DTIMING + +clang: $(SRC) + clang++ $(SRC) -o $(EXE) -I../util --cuda-gpu-arch=sm_20 \ + -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -DTIMING + +enum: $(SRC) + $(CC) ${KERNEL_DIM} -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) + +debug: $(SRC) + $(CC) ${KERNEL_DIM} -g $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) + +debugenum: $(SRC) + $(CC) ${KERNEL_DIM} -g -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) + +clean: $(SRC) + rm -f $(EXE) $(EXE).linkinfo result.txt diff --git a/NW/baselines/gpu/Makefile_nvidia b/NW/baselines/gpu/Makefile_nvidia new file mode 100644 index 0000000..330e3d1 --- /dev/null +++ b/NW/baselines/gpu/Makefile_nvidia @@ -0,0 +1,50 @@ +################################################################################ +# +# Copyright 1993-2006 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Build script for project +# +################################################################################ + +# Add source files here +EXECUTABLE := needle +# CUDA source files (compiled with cudacc) +CUFILES := needle.cu +# CUDA dependency files +CU_DEPS := needle_kernel.cu +# C/C++ source files (compiled with gcc / c++) +# CCFILES := BlackScholes_gold.cpp + + + +################################################################################ +# Rules and targets + +include ./common/common.mk diff --git a/NW/baselines/gpu/README b/NW/baselines/gpu/README new file mode 100644 index 0000000..00d3695 --- /dev/null +++ b/NW/baselines/gpu/README @@ -0,0 +1,23 @@ +Needleman-Wunsch (NW) + +Compilation instructions + + make + +Execution instructions + + ./needle 46080 10 + + +Note: This program generate two sequences randomly. Please specify your own sequences for different uses. + At the current stage, the program only supports two sequences with the same lengh, which can be divided by 16. +Usage: needle 32 10 + 32 //the length of both sequences + 10 //penalty value + +******Adjustable work group size***** +RD_WG_SIZE_0 or RD_WG_SIZE_0_0 + +USAGE: +make clean +make KERNEL_DIM="-DRD_WG_SIZE_0=16" diff --git a/NW/baselines/gpu/common/common.mk b/NW/baselines/gpu/common/common.mk new file mode 100644 index 0000000..4a5d800 --- /dev/null +++ b/NW/baselines/gpu/common/common.mk @@ -0,0 +1,341 @@ +################################################################################ +# +# Copyright 1993-2006 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Common build script +# +################################################################################ + +.SUFFIXES : .cu .cu_dbg_o .c_dbg_o .cpp_dbg_o .cu_rel_o .c_rel_o .cpp_rel_o .cubin + +# Add new SM Versions here as devices with new Compute Capability are released +SM_VERSIONS := sm_10 sm_11 sm_12 sm_13 + +CUDA_INSTALL_PATH ?= /usr/local/cuda + +ifdef cuda-install + CUDA_INSTALL_PATH := $(cuda-install) +endif + +# detect OS +OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) +OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) +# 'linux' is output for Linux system, 'darwin' for OS X +DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) + +# Basic directory setup for SDK +# (override directories only if they are not already defined) +SRCDIR ?= +ROOTDIR ?= .. +ROOTBINDIR ?= $(ROOTDIR)/../bin +BINDIR ?= $(ROOTBINDIR)/$(OSLOWER) +ROOTOBJDIR ?= obj +LIBDIR := $(ROOTDIR)/../lib +COMMONDIR := $(ROOTDIR)/../common + +# Compilers +NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc +CXX := g++ +CC := gcc +LINK := g++ -fPIC + +# Includes +INCLUDES += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc + +# architecture flag for cubin build +CUBIN_ARCH_FLAG := -m32 + +# Warning flags +CXXWARN_FLAGS := \ + -W -Wall \ + -Wimplicit \ + -Wswitch \ + -Wformat \ + -Wchar-subscripts \ + -Wparentheses \ + -Wmultichar \ + -Wtrigraphs \ + -Wpointer-arith \ + -Wcast-align \ + -Wreturn-type \ + -Wno-unused-function \ + $(SPACE) + +CWARN_FLAGS := $(CXXWARN_FLAGS) \ + -Wstrict-prototypes \ + -Wmissing-prototypes \ + -Wmissing-declarations \ + -Wnested-externs \ + -Wmain \ + +# Compiler-specific flags +NVCCFLAGS := +CXXFLAGS := $(CXXWARN_FLAGS) +CFLAGS := $(CWARN_FLAGS) + +# Common flags +COMMONFLAGS += $(INCLUDES) -DUNIX + +# Debug/release configuration +ifeq ($(dbg),1) + COMMONFLAGS += -g + NVCCFLAGS += -D_DEBUG + BINSUBDIR := debug + LIBSUFFIX := D +else + COMMONFLAGS += -O3 + BINSUBDIR := release + LIBSUFFIX := + NVCCFLAGS += --compiler-options -fno-strict-aliasing + CXXFLAGS += -fno-strict-aliasing + CFLAGS += -fno-strict-aliasing +endif + +# append optional arch/SM version flags (such as -arch sm_11) +#NVCCFLAGS += $(SMVERSIONFLAGS) + +# architecture flag for cubin build +CUBIN_ARCH_FLAG := -m32 + +# detect if 32 bit or 64 bit system +HP_64 = $(shell uname -m | grep 64) + +# OpenGL is used or not (if it is used, then it is necessary to include GLEW) +ifeq ($(USEGLLIB),1) + + ifneq ($(DARWIN),) + OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a + else + OPENGLLIB := -lGL -lGLU + + ifeq "$(strip $(HP_64))" "" + OPENGLLIB += -lGLEW + else + OPENGLLIB += -lGLEW_x86_64 + endif + endif + + CUBIN_ARCH_FLAG := -m64 +endif + +ifeq ($(USEGLUT),1) + ifneq ($(DARWIN),) + OPENGLLIB += -framework GLUT + else + OPENGLLIB += -lglut + endif +endif + +ifeq ($(USEPARAMGL),1) + PARAMGLLIB := -lparamgl$(LIBSUFFIX) +endif + +ifeq ($(USERENDERCHECKGL),1) + RENDERCHECKGLLIB := -lrendercheckgl$(LIBSUFFIX) +endif + +ifeq ($(USECUDPP), 1) + ifeq "$(strip $(HP_64))" "" + CUDPPLIB := -lcudpp + else + CUDPPLIB := -lcudpp64 + endif + + CUDPPLIB := $(CUDPPLIB)$(LIBSUFFIX) + + ifeq ($(emu), 1) + CUDPPLIB := $(CUDPPLIB)_emu + endif +endif + +# Libs +LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) +ifeq ($(USEDRVAPI),1) + LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} +else + LIB += -lcudart ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} +endif + +ifeq ($(USECUFFT),1) + ifeq ($(emu),1) + LIB += -lcufftemu + else + LIB += -lcufft + endif +endif + +ifeq ($(USECUBLAS),1) + ifeq ($(emu),1) + LIB += -lcublasemu + else + LIB += -lcublas + endif +endif + +# Lib/exe configuration +ifneq ($(STATIC_LIB),) + TARGETDIR := $(LIBDIR) + TARGET := $(subst .a,$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB)) + LINKLINE = ar qv $(TARGET) $(OBJS) +else + # LIB += -lcutil$(LIBSUFFIX) + # Device emulation configuration + ifeq ($(emu), 1) + NVCCFLAGS += -deviceemu + CUDACCFLAGS += + BINSUBDIR := emu$(BINSUBDIR) + # consistency, makes developing easier + CXXFLAGS += -D__DEVICE_EMULATION__ + CFLAGS += -D__DEVICE_EMULATION__ + endif + TARGETDIR := $(BINDIR)/$(BINSUBDIR) + TARGET := $(TARGETDIR)/$(EXECUTABLE) + LINKLINE = $(LINK) -o $(TARGET) $(OBJS) $(LIB) +endif + +# check if verbose +ifeq ($(verbose), 1) + VERBOSE := +else + VERBOSE := @ +endif + +################################################################################ +# Check for input flags and set compiler flags appropriately +################################################################################ +ifeq ($(fastmath), 1) + NVCCFLAGS += -use_fast_math +endif + +ifeq ($(keep), 1) + NVCCFLAGS += -keep + NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx +endif + +ifdef maxregisters + NVCCFLAGS += -maxrregcount $(maxregisters) +endif + +# Add cudacc flags +NVCCFLAGS += $(CUDACCFLAGS) + +# workaround for mac os x cuda 1.1 compiler issues +ifneq ($(DARWIN),) + NVCCFLAGS += --host-compilation=C +endif + +# Add common flags +NVCCFLAGS += $(COMMONFLAGS) +CXXFLAGS += $(COMMONFLAGS) +CFLAGS += $(COMMONFLAGS) + +ifeq ($(nvcc_warn_verbose),1) + NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS)) + NVCCFLAGS += --compiler-options -fno-strict-aliasing +endif + +################################################################################ +# Set up object files +################################################################################ +OBJDIR := $(ROOTOBJDIR)/$(BINSUBDIR) +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp_o,$(notdir $(CCFILES))) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c_o,$(notdir $(CFILES))) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_o,$(notdir $(CUFILES))) + +################################################################################ +# Set up cubin files +################################################################################ +CUBINDIR := $(SRCDIR)data +CUBINS += $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES))) + +################################################################################ +# Rules +################################################################################ +$(OBJDIR)/%.c_o : $(SRCDIR)%.c $(C_DEPS) + $(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $< + +$(OBJDIR)/%.cpp_o : $(SRCDIR)%.cpp $(C_DEPS) + $(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $< + +$(OBJDIR)/%.cu_o : $(SRCDIR)%.cu $(CU_DEPS) + $(VERBOSE)$(NVCC) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $< + +$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory + $(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $< + +# +# The following definition is a template that gets instantiated for each SM +# version (sm_10, sm_13, etc.) stored in SMVERSIONS. It does 2 things: +# 1. It adds to OBJS a .cu_sm_XX_o for each .cu file it finds in CUFILES_sm_XX. +# 2. It generates a rule for building .cu_sm_XX_o files from the corresponding +# .cu file. +# +# The intended use for this is to allow Makefiles that use common.mk to compile +# files to different Compute Capability targets (aka SM arch version). To do +# so, in the Makefile, list files for each SM arch separately, like so: +# +# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu +# CUFILES_sm_12 := anothercudakernel_sm12.cu +# +define SMVERSION_template +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1)_o,$(notdir $(CUFILES_$(1)))) +$(OBJDIR)/%.cu_$(1)_o : $(SRCDIR)%.cu $(CU_DEPS) + $(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) -arch $(1) +endef + +# This line invokes the above template for each arch version stored in +# SM_VERSIONS. The call funtion invokes the template, and the eval +# function interprets it as make commands. +$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver)))) + +$(TARGET): makedirectories $(OBJS) $(CUBINS) Makefile + $(VERBOSE)$(LINKLINE) + +cubindirectory: + $(VERBOSE)mkdir -p $(CUBINDIR) + +makedirectories: + $(VERBOSE)mkdir -p $(LIBDIR) + $(VERBOSE)mkdir -p $(OBJDIR) + $(VERBOSE)mkdir -p $(TARGETDIR) + + +tidy : + $(VERBOSE)find . | egrep "#" | xargs rm -f + $(VERBOSE)find . | egrep "\~" | xargs rm -f + +clean : tidy + $(VERBOSE)rm -f $(OBJS) + $(VERBOSE)rm -f $(CUBINS) + $(VERBOSE)rm -f $(TARGET) + $(VERBOSE)rm -f $(NVCC_KEEP_CLEAN) + +clobber : clean + $(VERBOSE)rm -rf $(ROOTOBJDIR) diff --git a/NW/baselines/gpu/common/make.config b/NW/baselines/gpu/common/make.config new file mode 100644 index 0000000..38c3157 --- /dev/null +++ b/NW/baselines/gpu/common/make.config @@ -0,0 +1,40 @@ +# CUDA toolkit installation path +CUDA_DIR = /usr/local/cuda + +# CUDA toolkit libraries +CUDA_LIB_DIR := $(CUDA_DIR)/lib +ifeq ($(shell uname -m), x86_64) + ifeq ($(shell if test -d $(CUDA_DIR)/lib64; then echo T; else echo F; fi), T) + CUDA_LIB_DIR := $(CUDA_DIR)/lib64 + endif +endif + +# CUDA SDK installation path +SDK_DIR = /usr/local/cuda/samples/ + +# OPENCL + +# NVIDIA_DIR +NV_OPENCL_DIR =/usr/local/cuda +NV_OPENCL_INC = $(NV_OPENCL_DIR)/include +NV_OPENCL_LIB = $(NV_OPENCL_DIR)/lib64 + +# INTEL_DIR +INTEL_OPENCL_DIR = /opt/intel/opencl +INTEL_OPENCL_INC = $(INTEL_OPENCL_DIR)/include +INTEL_OPENCL_LIB = $(INTEL_OPENCL_DIR) + +# AMD_DIR +# OPENCL_DIR = /usr/local/cuda +# OPENCL_INC = $(OPENCL_DIR)/include/ +# OPENCL_LIB = $(OPENCL_DIR)/lib/x86_64/ -lOpenCL +#ifeq ($(shell uname -m), x86_64) +# ifeq ($(shell if test -d $(OPENCL_DIR)/lib/x86_64/; then echo T; else echo F; fi), T) +# OPENCL_LIB = $(OPENCL_DIR)/lib/x86_64/ +# endif +#endif + +# DEFAULT OCL +OPENCL_DIR = $(NV_OPENCL_DIR) +OPENCL_INC = $(NV_OPENCL_INC) +OPENCL_LIB = $(NV_OPENCL_LIB) diff --git a/NW/baselines/gpu/needle.cu b/NW/baselines/gpu/needle.cu new file mode 100644 index 0000000..697a9e7 --- /dev/null +++ b/NW/baselines/gpu/needle.cu @@ -0,0 +1,266 @@ +#define LIMIT -999 +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include "needle.h" +#include <cuda.h> +#include <sys/time.h> + +// includes, kernels +#include "needle_kernel.cu" + +#ifdef TIMING +#include "timing.h" + +struct timeval tv; +struct timeval tv_total_start, tv_total_end; +struct timeval tv_h2d_start, tv_h2d_end; +struct timeval tv_d2h_start, tv_d2h_end; +struct timeval tv_kernel_start, tv_kernel_end; +struct timeval tv_mem_alloc_start, tv_mem_alloc_end; +struct timeval tv_close_start, tv_close_end; +float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0, + d2h_time = 0, close_time = 0, total_time = 0; +#endif + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +void runTest( int argc, char** argv); + + +int blosum62[24][24] = { + { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, + {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, + {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, + {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, + {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, + {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, + {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, + {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, + {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, + {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, + {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, + {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, + {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, + { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, + { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, + {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, + {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, + { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, + {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, + {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1} +}; + +double gettime() { + struct timeval t; + gettimeofday(&t,NULL); + return t.tv_sec+t.tv_usec*1e-6; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// + int +main( int argc, char** argv) +{ + + printf("WG size of kernel = %d \n", BLOCK_SIZE); + + runTest( argc, argv); + + return EXIT_SUCCESS; +} + +void usage(int argc, char **argv) +{ + fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]); + fprintf(stderr, "\t<dimension> - x and y dimensions\n"); + fprintf(stderr, "\t<penalty> - penalty(positive integer)\n"); + exit(1); +} + +void runTest( int argc, char** argv) +{ + int max_rows, max_cols, penalty; + int *input_itemsets, *output_itemsets, *referrence; + int *matrix_cuda, *referrence_cuda; + int size; + + + // the lengths of the two sequences should be able to divided by 16. + // And at current stage max_rows needs to equal max_cols + if (argc == 3) + { + max_rows = atoi(argv[1]); + max_cols = atoi(argv[1]); + penalty = atoi(argv[2]); + } + else{ + usage(argc, argv); + } + + if(atoi(argv[1])%16!=0){ + fprintf(stderr,"The dimension values must be a multiple of 16\n"); + exit(1); + } + + + max_rows = max_rows + 1; + max_cols = max_cols + 1; + referrence = (int *)malloc( max_rows * max_cols * sizeof(int) ); + input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + + + if (!input_itemsets) + fprintf(stderr, "error: can not allocate memory"); + + srand ( 7 ); + + + for (int i = 0 ; i < max_cols; i++){ + for (int j = 0 ; j < max_rows; j++){ + input_itemsets[i*max_cols+j] = 0; + } + } + + printf("Start Needleman-Wunsch\n"); + + for( int i=1; i< max_rows ; i++){ //please define your own sequence. + input_itemsets[i*max_cols] = rand() % 10 + 1; + } + for( int j=1; j< max_cols ; j++){ //please define your own sequence. + input_itemsets[j] = rand() % 10 + 1; + } + + + for (int i = 1 ; i < max_cols; i++){ + for (int j = 1 ; j < max_rows; j++){ + referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]]; + } + } + + for( int i = 1; i< max_rows ; i++) + input_itemsets[i*max_cols] = -i * penalty; + for( int j = 1; j< max_cols ; j++) + input_itemsets[j] = -j * penalty; + + + size = max_cols * max_rows; + cudaMalloc((void**)& referrence_cuda, sizeof(int)*size); + cudaMalloc((void**)& matrix_cuda, sizeof(int)*size); + + cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, cudaMemcpyHostToDevice); + cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, cudaMemcpyHostToDevice); + + dim3 dimGrid; + dim3 dimBlock(BLOCK_SIZE, 1); + int block_width = ( max_cols - 1 )/BLOCK_SIZE; + +#ifdef TIMING + gettimeofday(&tv_kernel_start, NULL); +#endif + + printf("Processing top-left matrix\n"); + //process top-left matrix + for( int i = 1 ; i <= block_width ; i++){ + dimGrid.x = i; + dimGrid.y = 1; + needle_cuda_shared_1<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda + ,max_cols, penalty, i, block_width); + } + printf("Processing bottom-right matrix\n"); + //process bottom-right matrix + for( int i = block_width - 1 ; i >= 1 ; i--){ + dimGrid.x = i; + dimGrid.y = 1; + needle_cuda_shared_2<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda + ,max_cols, penalty, i, block_width); + } + +#ifdef TIMING + gettimeofday(&tv_kernel_end, NULL); + tvsub(&tv_kernel_end, &tv_kernel_start, &tv); + kernel_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; +#endif + + cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost); + + //#define TRACEBACK +#ifdef TRACEBACK + + FILE *fpo = fopen("result.txt","w"); + fprintf(fpo, "print traceback value GPU:\n"); + + for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){ + int nw, n, w, traceback; + if ( i == max_rows - 2 && j == max_rows - 2 ) + fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element + if ( i == 0 && j == 0 ) + break; + if ( i > 0 && j > 0 ){ + nw = output_itemsets[(i - 1) * max_cols + j - 1]; + w = output_itemsets[ i * max_cols + j - 1 ]; + n = output_itemsets[(i - 1) * max_cols + j]; + } + else if ( i == 0 ){ + nw = n = LIMIT; + w = output_itemsets[ i * max_cols + j - 1 ]; + } + else if ( j == 0 ){ + nw = w = LIMIT; + n = output_itemsets[(i - 1) * max_cols + j]; + } + else{ + } + + //traceback = maximum(nw, w, n); + int new_nw, new_w, new_n; + new_nw = nw + referrence[i * max_cols + j]; + new_w = w - penalty; + new_n = n - penalty; + + traceback = maximum(new_nw, new_w, new_n); + if(traceback == new_nw) + traceback = nw; + if(traceback == new_w) + traceback = w; + if(traceback == new_n) + traceback = n; + + fprintf(fpo, "%d ", traceback); + + if(traceback == nw ) + {i--; j--; continue;} + + else if(traceback == w ) + {j--; continue;} + + else if(traceback == n ) + {i--; continue;} + + else + ; + } + + fclose(fpo); + +#endif + + cudaFree(referrence_cuda); + cudaFree(matrix_cuda); + + free(referrence); + free(input_itemsets); + free(output_itemsets); + +#ifdef TIMING + printf("Exec: %f\n", kernel_time); +#endif +} + diff --git a/NW/baselines/gpu/needle.h b/NW/baselines/gpu/needle.h new file mode 100644 index 0000000..6ffef52 --- /dev/null +++ b/NW/baselines/gpu/needle.h @@ -0,0 +1,11 @@ +#ifdef RD_WG_SIZE_0_0 + #define BLOCK_SIZE RD_WG_SIZE_0_0 +#elif defined(RD_WG_SIZE_0) + #define BLOCK_SIZE RD_WG_SIZE_0 +#elif defined(RD_WG_SIZE) + #define BLOCK_SIZE RD_WG_SIZE +#else + #define BLOCK_SIZE 16 +#endif +//#define TRACE + diff --git a/NW/baselines/gpu/needle_kernel.cu b/NW/baselines/gpu/needle_kernel.cu new file mode 100644 index 0000000..72f8501 --- /dev/null +++ b/NW/baselines/gpu/needle_kernel.cu @@ -0,0 +1,188 @@ +#include "needle.h" +#include <stdio.h> + + +#define SDATA( index) CUT_BANK_CHECKER(sdata, index) + +__device__ __host__ int +maximum( int a, + int b, + int c){ + + int k; + if( a <= b ) + k = b; + else + k = a; + + if( k <=c ) + return(c); + else + return(k); + +} + +__global__ void +needle_cuda_shared_1( int* referrence, + int* matrix_cuda, + int cols, + int penalty, + int i, + int block_width) +{ + int bx = blockIdx.x; + int tx = threadIdx.x; + + int b_index_x = bx; + int b_index_y = i - 1 - bx; + + int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 ); + int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 ); + int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols ); + int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x; + + __shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1]; + __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE]; + + if (tx == 0) + temp[tx][0] = matrix_cuda[index_nw]; + + + for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++) + ref[ty][tx] = referrence[index + cols * ty]; + + __syncthreads(); + + temp[tx + 1][0] = matrix_cuda[index_w + cols * tx]; + + __syncthreads(); + + temp[0][tx + 1] = matrix_cuda[index_n]; + + __syncthreads(); + + + for( int m = 0 ; m < BLOCK_SIZE ; m++){ + + if ( tx <= m ){ + + int t_index_x = tx + 1; + int t_index_y = m - tx + 1; + + temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1], + temp[t_index_y][t_index_x-1] - penalty, + temp[t_index_y-1][t_index_x] - penalty); + + + + } + + __syncthreads(); + + } + + for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){ + + if ( tx <= m){ + + int t_index_x = tx + BLOCK_SIZE - m ; + int t_index_y = BLOCK_SIZE - tx; + + temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1], + temp[t_index_y][t_index_x-1] - penalty, + temp[t_index_y-1][t_index_x] - penalty); + + } + + __syncthreads(); + } + + for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++) + matrix_cuda[index + ty * cols] = temp[ty+1][tx+1]; + +} + + +__global__ void +needle_cuda_shared_2( int* referrence, + int* matrix_cuda, + + int cols, + int penalty, + int i, + int block_width) +{ + + int bx = blockIdx.x; + int tx = threadIdx.x; + + int b_index_x = bx + block_width - i ; + int b_index_y = block_width - bx -1; + + int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 ); + int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 ); + int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols ); + int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x; + + __shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1]; + __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE]; + + for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++) + ref[ty][tx] = referrence[index + cols * ty]; + + __syncthreads(); + + if (tx == 0) + temp[tx][0] = matrix_cuda[index_nw]; + + + temp[tx + 1][0] = matrix_cuda[index_w + cols * tx]; + + __syncthreads(); + + temp[0][tx + 1] = matrix_cuda[index_n]; + + __syncthreads(); + + + for( int m = 0 ; m < BLOCK_SIZE ; m++){ + + if ( tx <= m ){ + + int t_index_x = tx + 1; + int t_index_y = m - tx + 1; + + temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1], + temp[t_index_y][t_index_x-1] - penalty, + temp[t_index_y-1][t_index_x] - penalty); + + } + + __syncthreads(); + + } + + + for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){ + + if ( tx <= m){ + + int t_index_x = tx + BLOCK_SIZE - m ; + int t_index_y = BLOCK_SIZE - tx; + + temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1], + temp[t_index_y][t_index_x-1] - penalty, + temp[t_index_y-1][t_index_x] - penalty); + + + } + + __syncthreads(); + } + + + for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++) + matrix_cuda[index + ty * cols] = temp[ty+1][tx+1]; + +} + diff --git a/NW/baselines/gpu/run b/NW/baselines/gpu/run new file mode 100644 index 0000000..11e9559 --- /dev/null +++ b/NW/baselines/gpu/run @@ -0,0 +1 @@ +./needle 2048 10 diff --git a/NW/baselines/gpu/timing.h b/NW/baselines/gpu/timing.h new file mode 100644 index 0000000..6ef8813 --- /dev/null +++ b/NW/baselines/gpu/timing.h @@ -0,0 +1,22 @@ +#ifndef __TIMING_H__ +#define __TIMING_H__ + +#include <sys/time.h> + +void time_measure_start(struct timeval *tv); +void time_measure_end(struct timeval *tv); + +/* tvsub: ret = x - y. */ +static inline void tvsub(struct timeval *x, + struct timeval *y, + struct timeval *ret) +{ + ret->tv_sec = x->tv_sec - y->tv_sec; + ret->tv_usec = x->tv_usec - y->tv_usec; + if (ret->tv_usec < 0) { + ret->tv_sec--; + ret->tv_usec += 1000000; + } +} + +#endif
\ No newline at end of file |