diff options
author | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
---|---|---|
committer | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
commit | 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch) | |
tree | fc6776a94549d2d4039898f183dbbeb2ce013ba9 /NW | |
parent | ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff) |
PrIM -- first commit
Diffstat (limited to 'NW')
-rw-r--r-- | NW/Makefile | 46 | ||||
-rw-r--r-- | NW/baselines/cpu/Makefile | 16 | ||||
-rw-r--r-- | NW/baselines/cpu/README | 9 | ||||
-rw-r--r-- | NW/baselines/cpu/needle.cpp | 382 | ||||
-rw-r--r-- | NW/baselines/cpu/run | 1 | ||||
-rw-r--r-- | NW/baselines/cpu/run_offload | 1 | ||||
-rw-r--r-- | NW/baselines/gpu/Makefile | 28 | ||||
-rw-r--r-- | NW/baselines/gpu/Makefile_nvidia | 50 | ||||
-rw-r--r-- | NW/baselines/gpu/README | 23 | ||||
-rw-r--r-- | NW/baselines/gpu/common/common.mk | 341 | ||||
-rw-r--r-- | NW/baselines/gpu/common/make.config | 40 | ||||
-rw-r--r-- | NW/baselines/gpu/needle.cu | 266 | ||||
-rw-r--r-- | NW/baselines/gpu/needle.h | 11 | ||||
-rw-r--r-- | NW/baselines/gpu/needle_kernel.cu | 188 | ||||
-rw-r--r-- | NW/baselines/gpu/run | 1 | ||||
-rw-r--r-- | NW/baselines/gpu/timing.h | 22 | ||||
-rw-r--r-- | NW/dpu/task.c | 185 | ||||
-rw-r--r-- | NW/host/app.c | 879 | ||||
-rwxr-xr-x | NW/support/common.h | 76 | ||||
-rw-r--r-- | NW/support/params.h | 56 | ||||
-rwxr-xr-x | NW/support/timer.h | 59 |
21 files changed, 2680 insertions, 0 deletions
diff --git a/NW/Makefile b/NW/Makefile new file mode 100644 index 0000000..68f495a --- /dev/null +++ b/NW/Makefile @@ -0,0 +1,46 @@ +DPU_DIR := dpu +HOST_DIR := host +BUILDDIR ?= bin +NR_TASKLETS ?= 13 +BL ?= 1024 +BL_IN ?= 4 +NR_DPUS ?= 1 +ENERGY ?= 0 + +define conf_filename + ${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3).conf +endef +CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL}) + +HOST_TARGET := ${BUILDDIR}/nw_host +DPU_TARGET := ${BUILDDIR}/nw_dpu + +COMMON_INCLUDES := support +HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c) +DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c) + +.PHONY: all clean test + +__dirs := $(shell mkdir -p ${BUILDDIR}) + +COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} +HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DENERGY=${ENERGY} +DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -DBL_IN=${BL_IN} + +all: ${HOST_TARGET} ${DPU_TARGET} + +${CONF}: + $(RM) $(call conf_filename,*,*) + touch ${CONF} + +${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF} + $(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + +${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF} + dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} + +clean: + $(RM) -r $(BUILDDIR) + +test: all + ./${HOST_TARGET} diff --git a/NW/baselines/cpu/Makefile b/NW/baselines/cpu/Makefile new file mode 100644 index 0000000..f49dd2f --- /dev/null +++ b/NW/baselines/cpu/Makefile @@ -0,0 +1,16 @@ +# C compiler +CC = g++ +ICC = icc +CC_FLAGS = -g -O3 -fopenmp +OFFLOAD_CC_FLAGS = -offload-option,mic,compiler,"-no-opt-prefetch" + +all: needle needle_offload + +needle: + $(CC) $(CC_FLAGS) needle.cpp -o needle + +needle_offload: + $(ICC) $(CC_FLAGS) $(OFFLOAD_CC_FLAGS) -DOMP_OFFLOAD needle.cpp -o needle_offload + +clean: + rm -f needle needle_offload diff --git a/NW/baselines/cpu/README b/NW/baselines/cpu/README new file mode 100644 index 0000000..f3fd2c2 --- /dev/null +++ b/NW/baselines/cpu/README @@ -0,0 +1,9 @@ +Needleman-Wunsch (NW) + +Compilation instructions + + make + +Execution instructions + + ./needle 46080 10 4 diff --git a/NW/baselines/cpu/needle.cpp b/NW/baselines/cpu/needle.cpp new file mode 100644 index 0000000..0f1e2b6 --- /dev/null +++ b/NW/baselines/cpu/needle.cpp @@ -0,0 +1,382 @@ +#define LIMIT -999 +//#define TRACE +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include <sys/time.h> +#include <omp.h> +#define OPENMP +//#define NUM_THREAD 4 + +#define BLOCK_SIZE 16 + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +void runTest( int argc, char** argv); + +// Returns the current system time in microseconds +long long get_time() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000000) + tv.tv_usec; + +} + +#ifdef OMP_OFFLOAD +#pragma omp declare target +#endif +int maximum( int a, + int b, + int c){ + + int k; + if( a <= b ) + k = b; + else + k = a; + + if( k <=c ) + return(c); + else + return(k); +} +#ifdef OMP_OFFLOAD +#pragma omp end declare target +#endif + + +int blosum62[24][24] = { + { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, + {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, + {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, + {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, + {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, + {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, + {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, + {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, + {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, + {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, + {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, + {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, + {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, + { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, + { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, + {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, + {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, + { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, + {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, + {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1} +}; + +double gettime() { + struct timeval t; + gettimeofday(&t,NULL); + return t.tv_sec+t.tv_usec*1e-6; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// + int +main( int argc, char** argv) +{ + runTest( argc, argv); + + return EXIT_SUCCESS; +} + +void usage(int argc, char **argv) +{ + fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> <num_threads>\n", argv[0]); + fprintf(stderr, "\t<dimension> - x and y dimensions\n"); + fprintf(stderr, "\t<penalty> - penalty(positive integer)\n"); + fprintf(stderr, "\t<num_threads> - no. of threads\n"); + exit(1); +} + +void nw_optimized(int *input_itemsets, int *output_itemsets, int *referrence, + int max_rows, int max_cols, int penalty) +{ +#ifdef OMP_OFFLOAD + int transfer_size = max_rows * max_cols; +#pragma omp target data map(to: max_cols, penalty, referrence[0:transfer_size]) map(input_itemsets[0:transfer_size]) + { + +#pragma omp target +#endif + for( int blk = 1; blk <= (max_cols-1)/BLOCK_SIZE; blk++ ) + { +#ifdef OPENMP +#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty) +#endif + for( int b_index_x = 0; b_index_x < blk; ++b_index_x) + { + int b_index_y = blk - 1 - b_index_x; + int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64))); + int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64))); + + // Copy referrence to local memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1]; + } + } + + // Copy input_itemsets to local memory + for ( int i = 0; i < BLOCK_SIZE + 1; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE + j]; + } + } + + // Compute + for ( int i = 1; i < BLOCK_SIZE + 1; ++i ) + { + for ( int j = 1; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1], + input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty, + input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty); + } + } + + // Copy results to global memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j + 1]; + } + } + + } + } + + printf("Processing bottom-right matrix\n"); + +#ifdef OMP_OFFLOAD +#pragma omp target +#endif + for ( int blk = 2; blk <= (max_cols-1)/BLOCK_SIZE; blk++ ) + { +#ifdef OPENMP +#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty) +#endif + for( int b_index_x = blk - 1; b_index_x < (max_cols-1)/BLOCK_SIZE; ++b_index_x) + { + int b_index_y = (max_cols-1)/BLOCK_SIZE + blk - 2 - b_index_x; + + int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64))); + int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64))); + + // Copy referrence to local memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1]; + } + } + + // Copy input_itemsets to local memory + for ( int i = 0; i < BLOCK_SIZE + 1; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE + j]; + } + } + + // Compute + for ( int i = 1; i < BLOCK_SIZE + 1; ++i ) + { + for ( int j = 1; j < BLOCK_SIZE + 1; ++j) + { + input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1], + input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty, + input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty); + } + } + + // Copy results to global memory + for ( int i = 0; i < BLOCK_SIZE; ++i ) + { +#pragma omp simd + for ( int j = 0; j < BLOCK_SIZE; ++j) + { + input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE + j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j +1]; + } + } + } + } + +#ifdef OMP_OFFLOAD + } +#endif + +} + +//////////////////////////////////////////////////////////////////////////////// +//! Run a simple test for CUDA +//////////////////////////////////////////////////////////////////////////////// + void +runTest( int argc, char** argv) +{ + int max_rows, max_cols, penalty; + int *input_itemsets, *output_itemsets, *referrence; + //int *matrix_cuda, *matrix_cuda_out, *referrence_cuda; + //int size; + int omp_num_threads; + + + // the lengths of the two sequences should be able to divided by 16. + // And at current stage max_rows needs to equal max_cols + if (argc == 4) + { + max_rows = atoi(argv[1]); + max_cols = atoi(argv[1]); + penalty = atoi(argv[2]); + omp_num_threads = atoi(argv[3]); + } + else{ + usage(argc, argv); + } + + max_rows = max_rows + 1; + max_cols = max_cols + 1; + referrence = (int *)malloc( max_rows * max_cols * sizeof(int) ); + input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + + + if (!input_itemsets) + fprintf(stderr, "error: can not allocate memory"); + + srand ( 7 ); + + for (int i = 0 ; i < max_cols; i++){ + for (int j = 0 ; j < max_rows; j++){ + input_itemsets[i*max_cols+j] = 0; + } + } + + printf("Start Needleman-Wunsch\n"); + + for( int i=1; i< max_rows ; i++){ //please define your own sequence. + input_itemsets[i*max_cols] = rand() % 10 + 1; + } + for( int j=1; j< max_cols ; j++){ //please define your own sequence. + input_itemsets[j] = rand() % 10 + 1; + } + + + for (int i = 1 ; i < max_cols; i++){ + for (int j = 1 ; j < max_rows; j++){ + referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]]; + } + } + + for( int i = 1; i< max_rows ; i++) + input_itemsets[i*max_cols] = -i * penalty; + for( int j = 1; j< max_cols ; j++) + input_itemsets[j] = -j * penalty; + + + + //Compute top-left matrix + printf("Num of threads: %d\n", omp_num_threads); + printf("Processing top-left matrix\n"); + + long long start_time = get_time(); + + nw_optimized( input_itemsets, output_itemsets, referrence, + max_rows, max_cols, penalty ); + + long long end_time = get_time(); + + printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000)); + +#define TRACEBACK +#ifdef TRACEBACK + + FILE *fpo = fopen("result.txt","w"); + fprintf(fpo, "print traceback value GPU:\n"); + + for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){ + int nw, n, w, traceback; + if ( i == max_rows - 2 && j == max_rows - 2 ) + fprintf(fpo, "%d ", input_itemsets[ i * max_cols + j]); //print the first element + if ( i == 0 && j == 0 ) + break; + if ( i > 0 && j > 0 ){ + nw = input_itemsets[(i - 1) * max_cols + j - 1]; + w = input_itemsets[ i * max_cols + j - 1 ]; + n = input_itemsets[(i - 1) * max_cols + j]; + } + else if ( i == 0 ){ + nw = n = LIMIT; + w = input_itemsets[ i * max_cols + j - 1 ]; + } + else if ( j == 0 ){ + nw = w = LIMIT; + n = input_itemsets[(i - 1) * max_cols + j]; + } + else{ + } + + //traceback = maximum(nw, w, n); + int new_nw, new_w, new_n; + new_nw = nw + referrence[i * max_cols + j]; + new_w = w - penalty; + new_n = n - penalty; + + traceback = maximum(new_nw, new_w, new_n); + if(traceback == new_nw) + traceback = nw; + if(traceback == new_w) + traceback = w; + if(traceback == new_n) + traceback = n; + + fprintf(fpo, "%d ", traceback); + + if(traceback == nw ) + {i--; j--; continue;} + + else if(traceback == w ) + {j--; continue;} + + else if(traceback == n ) + {i--; continue;} + + else + ; + } + + fclose(fpo); + +#endif + + free(referrence); + free(input_itemsets); + free(output_itemsets); + +} + + + diff --git a/NW/baselines/cpu/run b/NW/baselines/cpu/run new file mode 100644 index 0000000..8c8088a --- /dev/null +++ b/NW/baselines/cpu/run @@ -0,0 +1 @@ +./needle 2048 10 2 diff --git a/NW/baselines/cpu/run_offload b/NW/baselines/cpu/run_offload new file mode 100644 index 0000000..8c5989a --- /dev/null +++ b/NW/baselines/cpu/run_offload @@ -0,0 +1 @@ +./needle_offload 2048 10 2 diff --git a/NW/baselines/gpu/Makefile b/NW/baselines/gpu/Makefile new file mode 100644 index 0000000..ebf130f --- /dev/null +++ b/NW/baselines/gpu/Makefile @@ -0,0 +1,28 @@ +include ./common/make.config + +CC := $(CUDA_DIR)/bin/nvcc + +INCLUDE := $(CUDA_DIR)/include + +SRC = needle.cu + +EXE = needle + +release: $(SRC) + $(CC) ${KERNEL_DIM} $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) -DTIMING + +clang: $(SRC) + clang++ $(SRC) -o $(EXE) -I../util --cuda-gpu-arch=sm_20 \ + -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -DTIMING + +enum: $(SRC) + $(CC) ${KERNEL_DIM} -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) + +debug: $(SRC) + $(CC) ${KERNEL_DIM} -g $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) + +debugenum: $(SRC) + $(CC) ${KERNEL_DIM} -g -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) + +clean: $(SRC) + rm -f $(EXE) $(EXE).linkinfo result.txt diff --git a/NW/baselines/gpu/Makefile_nvidia b/NW/baselines/gpu/Makefile_nvidia new file mode 100644 index 0000000..330e3d1 --- /dev/null +++ b/NW/baselines/gpu/Makefile_nvidia @@ -0,0 +1,50 @@ +################################################################################ +# +# Copyright 1993-2006 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Build script for project +# +################################################################################ + +# Add source files here +EXECUTABLE := needle +# CUDA source files (compiled with cudacc) +CUFILES := needle.cu +# CUDA dependency files +CU_DEPS := needle_kernel.cu +# C/C++ source files (compiled with gcc / c++) +# CCFILES := BlackScholes_gold.cpp + + + +################################################################################ +# Rules and targets + +include ./common/common.mk diff --git a/NW/baselines/gpu/README b/NW/baselines/gpu/README new file mode 100644 index 0000000..00d3695 --- /dev/null +++ b/NW/baselines/gpu/README @@ -0,0 +1,23 @@ +Needleman-Wunsch (NW) + +Compilation instructions + + make + +Execution instructions + + ./needle 46080 10 + + +Note: This program generate two sequences randomly. Please specify your own sequences for different uses. + At the current stage, the program only supports two sequences with the same lengh, which can be divided by 16. +Usage: needle 32 10 + 32 //the length of both sequences + 10 //penalty value + +******Adjustable work group size***** +RD_WG_SIZE_0 or RD_WG_SIZE_0_0 + +USAGE: +make clean +make KERNEL_DIM="-DRD_WG_SIZE_0=16" diff --git a/NW/baselines/gpu/common/common.mk b/NW/baselines/gpu/common/common.mk new file mode 100644 index 0000000..4a5d800 --- /dev/null +++ b/NW/baselines/gpu/common/common.mk @@ -0,0 +1,341 @@ +################################################################################ +# +# Copyright 1993-2006 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Common build script +# +################################################################################ + +.SUFFIXES : .cu .cu_dbg_o .c_dbg_o .cpp_dbg_o .cu_rel_o .c_rel_o .cpp_rel_o .cubin + +# Add new SM Versions here as devices with new Compute Capability are released +SM_VERSIONS := sm_10 sm_11 sm_12 sm_13 + +CUDA_INSTALL_PATH ?= /usr/local/cuda + +ifdef cuda-install + CUDA_INSTALL_PATH := $(cuda-install) +endif + +# detect OS +OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) +OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) +# 'linux' is output for Linux system, 'darwin' for OS X +DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) + +# Basic directory setup for SDK +# (override directories only if they are not already defined) +SRCDIR ?= +ROOTDIR ?= .. +ROOTBINDIR ?= $(ROOTDIR)/../bin +BINDIR ?= $(ROOTBINDIR)/$(OSLOWER) +ROOTOBJDIR ?= obj +LIBDIR := $(ROOTDIR)/../lib +COMMONDIR := $(ROOTDIR)/../common + +# Compilers +NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc +CXX := g++ +CC := gcc +LINK := g++ -fPIC + +# Includes +INCLUDES += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc + +# architecture flag for cubin build +CUBIN_ARCH_FLAG := -m32 + +# Warning flags +CXXWARN_FLAGS := \ + -W -Wall \ + -Wimplicit \ + -Wswitch \ + -Wformat \ + -Wchar-subscripts \ + -Wparentheses \ + -Wmultichar \ + -Wtrigraphs \ + -Wpointer-arith \ + -Wcast-align \ + -Wreturn-type \ + -Wno-unused-function \ + $(SPACE) + +CWARN_FLAGS := $(CXXWARN_FLAGS) \ + -Wstrict-prototypes \ + -Wmissing-prototypes \ + -Wmissing-declarations \ + -Wnested-externs \ + -Wmain \ + +# Compiler-specific flags +NVCCFLAGS := +CXXFLAGS := $(CXXWARN_FLAGS) +CFLAGS := $(CWARN_FLAGS) + +# Common flags +COMMONFLAGS += $(INCLUDES) -DUNIX + +# Debug/release configuration +ifeq ($(dbg),1) + COMMONFLAGS += -g + NVCCFLAGS += -D_DEBUG + BINSUBDIR := debug + LIBSUFFIX := D +else + COMMONFLAGS += -O3 + BINSUBDIR := release + LIBSUFFIX := + NVCCFLAGS += --compiler-options -fno-strict-aliasing + CXXFLAGS += -fno-strict-aliasing + CFLAGS += -fno-strict-aliasing +endif + +# append optional arch/SM version flags (such as -arch sm_11) +#NVCCFLAGS += $(SMVERSIONFLAGS) + +# architecture flag for cubin build +CUBIN_ARCH_FLAG := -m32 + +# detect if 32 bit or 64 bit system +HP_64 = $(shell uname -m | grep 64) + +# OpenGL is used or not (if it is used, then it is necessary to include GLEW) +ifeq ($(USEGLLIB),1) + + ifneq ($(DARWIN),) + OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a + else + OPENGLLIB := -lGL -lGLU + + ifeq "$(strip $(HP_64))" "" + OPENGLLIB += -lGLEW + else + OPENGLLIB += -lGLEW_x86_64 + endif + endif + + CUBIN_ARCH_FLAG := -m64 +endif + +ifeq ($(USEGLUT),1) + ifneq ($(DARWIN),) + OPENGLLIB += -framework GLUT + else + OPENGLLIB += -lglut + endif +endif + +ifeq ($(USEPARAMGL),1) + PARAMGLLIB := -lparamgl$(LIBSUFFIX) +endif + +ifeq ($(USERENDERCHECKGL),1) + RENDERCHECKGLLIB := -lrendercheckgl$(LIBSUFFIX) +endif + +ifeq ($(USECUDPP), 1) + ifeq "$(strip $(HP_64))" "" + CUDPPLIB := -lcudpp + else + CUDPPLIB := -lcudpp64 + endif + + CUDPPLIB := $(CUDPPLIB)$(LIBSUFFIX) + + ifeq ($(emu), 1) + CUDPPLIB := $(CUDPPLIB)_emu + endif +endif + +# Libs +LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) +ifeq ($(USEDRVAPI),1) + LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} +else + LIB += -lcudart ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} +endif + +ifeq ($(USECUFFT),1) + ifeq ($(emu),1) + LIB += -lcufftemu + else + LIB += -lcufft + endif +endif + +ifeq ($(USECUBLAS),1) + ifeq ($(emu),1) + LIB += -lcublasemu + else + LIB += -lcublas + endif +endif + +# Lib/exe configuration +ifneq ($(STATIC_LIB),) + TARGETDIR := $(LIBDIR) + TARGET := $(subst .a,$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB)) + LINKLINE = ar qv $(TARGET) $(OBJS) +else + # LIB += -lcutil$(LIBSUFFIX) + # Device emulation configuration + ifeq ($(emu), 1) + NVCCFLAGS += -deviceemu + CUDACCFLAGS += + BINSUBDIR := emu$(BINSUBDIR) + # consistency, makes developing easier + CXXFLAGS += -D__DEVICE_EMULATION__ + CFLAGS += -D__DEVICE_EMULATION__ + endif + TARGETDIR := $(BINDIR)/$(BINSUBDIR) + TARGET := $(TARGETDIR)/$(EXECUTABLE) + LINKLINE = $(LINK) -o $(TARGET) $(OBJS) $(LIB) +endif + +# check if verbose +ifeq ($(verbose), 1) + VERBOSE := +else + VERBOSE := @ +endif + +################################################################################ +# Check for input flags and set compiler flags appropriately +################################################################################ +ifeq ($(fastmath), 1) + NVCCFLAGS += -use_fast_math +endif + +ifeq ($(keep), 1) + NVCCFLAGS += -keep + NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx +endif + +ifdef maxregisters + NVCCFLAGS += -maxrregcount $(maxregisters) +endif + +# Add cudacc flags +NVCCFLAGS += $(CUDACCFLAGS) + +# workaround for mac os x cuda 1.1 compiler issues +ifneq ($(DARWIN),) + NVCCFLAGS += --host-compilation=C +endif + +# Add common flags +NVCCFLAGS += $(COMMONFLAGS) +CXXFLAGS += $(COMMONFLAGS) +CFLAGS += $(COMMONFLAGS) + +ifeq ($(nvcc_warn_verbose),1) + NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS)) + NVCCFLAGS += --compiler-options -fno-strict-aliasing +endif + +################################################################################ +# Set up object files +################################################################################ +OBJDIR := $(ROOTOBJDIR)/$(BINSUBDIR) +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp_o,$(notdir $(CCFILES))) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c_o,$(notdir $(CFILES))) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_o,$(notdir $(CUFILES))) + +################################################################################ +# Set up cubin files +################################################################################ +CUBINDIR := $(SRCDIR)data +CUBINS += $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES))) + +################################################################################ +# Rules +################################################################################ +$(OBJDIR)/%.c_o : $(SRCDIR)%.c $(C_DEPS) + $(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $< + +$(OBJDIR)/%.cpp_o : $(SRCDIR)%.cpp $(C_DEPS) + $(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $< + +$(OBJDIR)/%.cu_o : $(SRCDIR)%.cu $(CU_DEPS) + $(VERBOSE)$(NVCC) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $< + +$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory + $(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $< + +# +# The following definition is a template that gets instantiated for each SM +# version (sm_10, sm_13, etc.) stored in SMVERSIONS. It does 2 things: +# 1. It adds to OBJS a .cu_sm_XX_o for each .cu file it finds in CUFILES_sm_XX. +# 2. It generates a rule for building .cu_sm_XX_o files from the corresponding +# .cu file. +# +# The intended use for this is to allow Makefiles that use common.mk to compile +# files to different Compute Capability targets (aka SM arch version). To do +# so, in the Makefile, list files for each SM arch separately, like so: +# +# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu +# CUFILES_sm_12 := anothercudakernel_sm12.cu +# +define SMVERSION_template +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1)_o,$(notdir $(CUFILES_$(1)))) +$(OBJDIR)/%.cu_$(1)_o : $(SRCDIR)%.cu $(CU_DEPS) + $(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) -arch $(1) +endef + +# This line invokes the above template for each arch version stored in +# SM_VERSIONS. The call funtion invokes the template, and the eval +# function interprets it as make commands. +$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver)))) + +$(TARGET): makedirectories $(OBJS) $(CUBINS) Makefile + $(VERBOSE)$(LINKLINE) + +cubindirectory: + $(VERBOSE)mkdir -p $(CUBINDIR) + +makedirectories: + $(VERBOSE)mkdir -p $(LIBDIR) + $(VERBOSE)mkdir -p $(OBJDIR) + $(VERBOSE)mkdir -p $(TARGETDIR) + + +tidy : + $(VERBOSE)find . | egrep "#" | xargs rm -f + $(VERBOSE)find . | egrep "\~" | xargs rm -f + +clean : tidy + $(VERBOSE)rm -f $(OBJS) + $(VERBOSE)rm -f $(CUBINS) + $(VERBOSE)rm -f $(TARGET) + $(VERBOSE)rm -f $(NVCC_KEEP_CLEAN) + +clobber : clean + $(VERBOSE)rm -rf $(ROOTOBJDIR) diff --git a/NW/baselines/gpu/common/make.config b/NW/baselines/gpu/common/make.config new file mode 100644 index 0000000..38c3157 --- /dev/null +++ b/NW/baselines/gpu/common/make.config @@ -0,0 +1,40 @@ +# CUDA toolkit installation path +CUDA_DIR = /usr/local/cuda + +# CUDA toolkit libraries +CUDA_LIB_DIR := $(CUDA_DIR)/lib +ifeq ($(shell uname -m), x86_64) + ifeq ($(shell if test -d $(CUDA_DIR)/lib64; then echo T; else echo F; fi), T) + CUDA_LIB_DIR := $(CUDA_DIR)/lib64 + endif +endif + +# CUDA SDK installation path +SDK_DIR = /usr/local/cuda/samples/ + +# OPENCL + +# NVIDIA_DIR +NV_OPENCL_DIR =/usr/local/cuda +NV_OPENCL_INC = $(NV_OPENCL_DIR)/include +NV_OPENCL_LIB = $(NV_OPENCL_DIR)/lib64 + +# INTEL_DIR +INTEL_OPENCL_DIR = /opt/intel/opencl +INTEL_OPENCL_INC = $(INTEL_OPENCL_DIR)/include +INTEL_OPENCL_LIB = $(INTEL_OPENCL_DIR) + +# AMD_DIR +# OPENCL_DIR = /usr/local/cuda +# OPENCL_INC = $(OPENCL_DIR)/include/ +# OPENCL_LIB = $(OPENCL_DIR)/lib/x86_64/ -lOpenCL +#ifeq ($(shell uname -m), x86_64) +# ifeq ($(shell if test -d $(OPENCL_DIR)/lib/x86_64/; then echo T; else echo F; fi), T) +# OPENCL_LIB = $(OPENCL_DIR)/lib/x86_64/ +# endif +#endif + +# DEFAULT OCL +OPENCL_DIR = $(NV_OPENCL_DIR) +OPENCL_INC = $(NV_OPENCL_INC) +OPENCL_LIB = $(NV_OPENCL_LIB) diff --git a/NW/baselines/gpu/needle.cu b/NW/baselines/gpu/needle.cu new file mode 100644 index 0000000..697a9e7 --- /dev/null +++ b/NW/baselines/gpu/needle.cu @@ -0,0 +1,266 @@ +#define LIMIT -999 +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include "needle.h" +#include <cuda.h> +#include <sys/time.h> + +// includes, kernels +#include "needle_kernel.cu" + +#ifdef TIMING +#include "timing.h" + +struct timeval tv; +struct timeval tv_total_start, tv_total_end; +struct timeval tv_h2d_start, tv_h2d_end; +struct timeval tv_d2h_start, tv_d2h_end; +struct timeval tv_kernel_start, tv_kernel_end; +struct timeval tv_mem_alloc_start, tv_mem_alloc_end; +struct timeval tv_close_start, tv_close_end; +float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0, + d2h_time = 0, close_time = 0, total_time = 0; +#endif + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +void runTest( int argc, char** argv); + + +int blosum62[24][24] = { + { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, + {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, + {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, + {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, + {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, + {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, + {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, + {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, + {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, + {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, + {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, + {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, + {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, + { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, + { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, + {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, + {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, + { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, + {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, + {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1} +}; + +double gettime() { + struct timeval t; + gettimeofday(&t,NULL); + return t.tv_sec+t.tv_usec*1e-6; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// + int +main( int argc, char** argv) +{ + + printf("WG size of kernel = %d \n", BLOCK_SIZE); + + runTest( argc, argv); + + return EXIT_SUCCESS; +} + +void usage(int argc, char **argv) +{ + fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]); + fprintf(stderr, "\t<dimension> - x and y dimensions\n"); + fprintf(stderr, "\t<penalty> - penalty(positive integer)\n"); + exit(1); +} + +void runTest( int argc, char** argv) +{ + int max_rows, max_cols, penalty; + int *input_itemsets, *output_itemsets, *referrence; + int *matrix_cuda, *referrence_cuda; + int size; + + + // the lengths of the two sequences should be able to divided by 16. + // And at current stage max_rows needs to equal max_cols + if (argc == 3) + { + max_rows = atoi(argv[1]); + max_cols = atoi(argv[1]); + penalty = atoi(argv[2]); + } + else{ + usage(argc, argv); + } + + if(atoi(argv[1])%16!=0){ + fprintf(stderr,"The dimension values must be a multiple of 16\n"); + exit(1); + } + + + max_rows = max_rows + 1; + max_cols = max_cols + 1; + referrence = (int *)malloc( max_rows * max_cols * sizeof(int) ); + input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); + + + if (!input_itemsets) + fprintf(stderr, "error: can not allocate memory"); + + srand ( 7 ); + + + for (int i = 0 ; i < max_cols; i++){ + for (int j = 0 ; j < max_rows; j++){ + input_itemsets[i*max_cols+j] = 0; + } + } + + printf("Start Needleman-Wunsch\n"); + + for( int i=1; i< max_rows ; i++){ //please define your own sequence. + input_itemsets[i*max_cols] = rand() % 10 + 1; + } + for( int j=1; j< max_cols ; j++){ //please define your own sequence. + input_itemsets[j] = rand() % 10 + 1; + } + + + for (int i = 1 ; i < max_cols; i++){ + for (int j = 1 ; j < max_rows; j++){ + referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]]; + } + } + + for( int i = 1; i< max_rows ; i++) + input_itemsets[i*max_cols] = -i * penalty; + for( int j = 1; j< max_cols ; j++) + input_itemsets[j] = -j * penalty; + + + size = max_cols * max_rows; + cudaMalloc((void**)& referrence_cuda, sizeof(int)*size); + cudaMalloc((void**)& matrix_cuda, sizeof(int)*size); + + cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, cudaMemcpyHostToDevice); + cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, cudaMemcpyHostToDevice); + + dim3 dimGrid; + dim3 dimBlock(BLOCK_SIZE, 1); + int block_width = ( max_cols - 1 )/BLOCK_SIZE; + +#ifdef TIMING + gettimeofday(&tv_kernel_start, NULL); +#endif + + printf("Processing top-left matrix\n"); + //process top-left matrix + for( int i = 1 ; i <= block_width ; i++){ + dimGrid.x = i; + dimGrid.y = 1; + needle_cuda_shared_1<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda + ,max_cols, penalty, i, block_width); + } + printf("Processing bottom-right matrix\n"); + //process bottom-right matrix + for( int i = block_width - 1 ; i >= 1 ; i--){ + dimGrid.x = i; + dimGrid.y = 1; + needle_cuda_shared_2<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda + ,max_cols, penalty, i, block_width); + } + +#ifdef TIMING + gettimeofday(&tv_kernel_end, NULL); + tvsub(&tv_kernel_end, &tv_kernel_start, &tv); + kernel_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; +#endif + + cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost); + + //#define TRACEBACK +#ifdef TRACEBACK + + FILE *fpo = fopen("result.txt","w"); + fprintf(fpo, "print traceback value GPU:\n"); + + for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){ + int nw, n, w, traceback; + if ( i == max_rows - 2 && j == max_rows - 2 ) + fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element + if ( i == 0 && j == 0 ) + break; + if ( i > 0 && j > 0 ){ + nw = output_itemsets[(i - 1) * max_cols + j - 1]; + w = output_itemsets[ i * max_cols + j - 1 ]; + n = output_itemsets[(i - 1) * max_cols + j]; + } + else if ( i == 0 ){ + nw = n = LIMIT; + w = output_itemsets[ i * max_cols + j - 1 ]; + } + else if ( j == 0 ){ + nw = w = LIMIT; + n = output_itemsets[(i - 1) * max_cols + j]; + } + else{ + } + + //traceback = maximum(nw, w, n); + int new_nw, new_w, new_n; + new_nw = nw + referrence[i * max_cols + j]; + new_w = w - penalty; + new_n = n - penalty; + + traceback = maximum(new_nw, new_w, new_n); + if(traceback == new_nw) + traceback = nw; + if(traceback == new_w) + traceback = w; + if(traceback == new_n) + traceback = n; + + fprintf(fpo, "%d ", traceback); + + if(traceback == nw ) + {i--; j--; continue;} + + else if(traceback == w ) + {j--; continue;} + + else if(traceback == n ) + {i--; continue;} + + else + ; + } + + fclose(fpo); + +#endif + + cudaFree(referrence_cuda); + cudaFree(matrix_cuda); + + free(referrence); + free(input_itemsets); + free(output_itemsets); + +#ifdef TIMING + printf("Exec: %f\n", kernel_time); +#endif +} + diff --git a/NW/baselines/gpu/needle.h b/NW/baselines/gpu/needle.h new file mode 100644 index 0000000..6ffef52 --- /dev/null +++ b/NW/baselines/gpu/needle.h @@ -0,0 +1,11 @@ +#ifdef RD_WG_SIZE_0_0 + #define BLOCK_SIZE RD_WG_SIZE_0_0 +#elif defined(RD_WG_SIZE_0) + #define BLOCK_SIZE RD_WG_SIZE_0 +#elif defined(RD_WG_SIZE) + #define BLOCK_SIZE RD_WG_SIZE +#else + #define BLOCK_SIZE 16 +#endif +//#define TRACE + diff --git a/NW/baselines/gpu/needle_kernel.cu b/NW/baselines/gpu/needle_kernel.cu new file mode 100644 index 0000000..72f8501 --- /dev/null +++ b/NW/baselines/gpu/needle_kernel.cu @@ -0,0 +1,188 @@ +#include "needle.h" +#include <stdio.h> + + +#define SDATA( index) CUT_BANK_CHECKER(sdata, index) + +__device__ __host__ int +maximum( int a, + int b, + int c){ + + int k; + if( a <= b ) + k = b; + else + k = a; + + if( k <=c ) + return(c); + else + return(k); + +} + +__global__ void +needle_cuda_shared_1( int* referrence, + int* matrix_cuda, + int cols, + int penalty, + int i, + int block_width) +{ + int bx = blockIdx.x; + int tx = threadIdx.x; + + int b_index_x = bx; + int b_index_y = i - 1 - bx; + + int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 ); + int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 ); + int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols ); + int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x; + + __shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1]; + __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE]; + + if (tx == 0) + temp[tx][0] = matrix_cuda[index_nw]; + + + for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++) + ref[ty][tx] = referrence[index + cols * ty]; + + __syncthreads(); + + temp[tx + 1][0] = matrix_cuda[index_w + cols * tx]; + + __syncthreads(); + + temp[0][tx + 1] = matrix_cuda[index_n]; + + __syncthreads(); + + + for( int m = 0 ; m < BLOCK_SIZE ; m++){ + + if ( tx <= m ){ + + int t_index_x = tx + 1; + int t_index_y = m - tx + 1; + + temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1], + temp[t_index_y][t_index_x-1] - penalty, + temp[t_index_y-1][t_index_x] - penalty); + + + + } + + __syncthreads(); + + } + + for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){ + + if ( tx <= m){ + + int t_index_x = tx + BLOCK_SIZE - m ; + int t_index_y = BLOCK_SIZE - tx; + + temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1], + temp[t_index_y][t_index_x-1] - penalty, + temp[t_index_y-1][t_index_x] - penalty); + + } + + __syncthreads(); + } + + for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++) + matrix_cuda[index + ty * cols] = temp[ty+1][tx+1]; + +} + + +__global__ void +needle_cuda_shared_2( int* referrence, + int* matrix_cuda, + + int cols, + int penalty, + int i, + int block_width) +{ + + int bx = blockIdx.x; + int tx = threadIdx.x; + + int b_index_x = bx + block_width - i ; + int b_index_y = block_width - bx -1; + + int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 ); + int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 ); + int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols ); + int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x; + + __shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1]; + __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE]; + + for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++) + ref[ty][tx] = referrence[index + cols * ty]; + + __syncthreads(); + + if (tx == 0) + temp[tx][0] = matrix_cuda[index_nw]; + + + temp[tx + 1][0] = matrix_cuda[index_w + cols * tx]; + + __syncthreads(); + + temp[0][tx + 1] = matrix_cuda[index_n]; + + __syncthreads(); + + + for( int m = 0 ; m < BLOCK_SIZE ; m++){ + + if ( tx <= m ){ + + int t_index_x = tx + 1; + int t_index_y = m - tx + 1; + + temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1], + temp[t_index_y][t_index_x-1] - penalty, + temp[t_index_y-1][t_index_x] - penalty); + + } + + __syncthreads(); + + } + + + for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){ + + if ( tx <= m){ + + int t_index_x = tx + BLOCK_SIZE - m ; + int t_index_y = BLOCK_SIZE - tx; + + temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1], + temp[t_index_y][t_index_x-1] - penalty, + temp[t_index_y-1][t_index_x] - penalty); + + + } + + __syncthreads(); + } + + + for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++) + matrix_cuda[index + ty * cols] = temp[ty+1][tx+1]; + +} + diff --git a/NW/baselines/gpu/run b/NW/baselines/gpu/run new file mode 100644 index 0000000..11e9559 --- /dev/null +++ b/NW/baselines/gpu/run @@ -0,0 +1 @@ +./needle 2048 10 diff --git a/NW/baselines/gpu/timing.h b/NW/baselines/gpu/timing.h new file mode 100644 index 0000000..6ef8813 --- /dev/null +++ b/NW/baselines/gpu/timing.h @@ -0,0 +1,22 @@ +#ifndef __TIMING_H__ +#define __TIMING_H__ + +#include <sys/time.h> + +void time_measure_start(struct timeval *tv); +void time_measure_end(struct timeval *tv); + +/* tvsub: ret = x - y. */ +static inline void tvsub(struct timeval *x, + struct timeval *y, + struct timeval *ret) +{ + ret->tv_sec = x->tv_sec - y->tv_sec; + ret->tv_usec = x->tv_usec - y->tv_usec; + if (ret->tv_usec < 0) { + ret->tv_sec--; + ret->tv_usec += 1000000; + } +} + +#endif
\ No newline at end of file diff --git a/NW/dpu/task.c b/NW/dpu/task.c new file mode 100644 index 0000000..c022f70 --- /dev/null +++ b/NW/dpu/task.c @@ -0,0 +1,185 @@ +/** +* Needleman-Wunsch with multiple tasklets +* +*/ +#include <stdint.h> +#include <stdio.h> +#include <defs.h> +#include <mram.h> +#include <alloc.h> +#include <perfcounter.h> +#include <barrier.h> + +#include "../support/common.h" + +__host dpu_arguments_t DPU_INPUT_ARGUMENTS; + +// Barrier +BARRIER_INIT(my_barrier, NR_TASKLETS); + +// main +int main() { + unsigned int tasklet_id = me(); + if (tasklet_id == 0){ // Initialize once the cycle counter + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + uint32_t nblocks = DPU_INPUT_ARGUMENTS.nblocks; + uint32_t active_blocks = DPU_INPUT_ARGUMENTS.active_blocks; + uint32_t penalty = DPU_INPUT_ARGUMENTS.penalty; +#if PRINT + printf("tasklet_id = %d, nblocks = %d \n", tasklet_id, nblocks); +#endif + + uint32_t mram_base_addr_input_itemsets = (uint32_t) (DPU_MRAM_HEAP_POINTER); + uint32_t mram_base_addr_ref = (uint32_t) (DPU_MRAM_HEAP_POINTER + nblocks * (BL+1) * (BL+2) * sizeof(int32_t)); + if (nblocks != active_blocks) + mram_base_addr_ref = (uint32_t) (DPU_MRAM_HEAP_POINTER + active_blocks * (BL+1) * (BL+2) * sizeof(int32_t)); + + int32_t *cache_input = mem_alloc((BL_IN+1) * (BL_IN+2) * sizeof(int32_t)); + int32_t *cache_ref = mem_alloc(BL_IN * BL_IN * sizeof(int32_t)); + uint32_t REP = BL/BL_IN; + uint32_t chunks; + uint32_t mod; + uint32_t start; + uint32_t addr_input; + uint32_t addr_ref; + uint32_t cache_input_offset; + + for (uint32_t bl = 0; bl < nblocks; bl++) { + + // Top-left computation + for(uint32_t blk = 0; blk <= REP; blk++) { + + // Partition chunks/subblocks of the diagonal to tasklets + chunks = blk / NR_TASKLETS; + mod = blk % NR_TASKLETS; + if (tasklet_id < mod) + chunks++; + if (mod > 0) { + if(tasklet_id < mod) + start = tasklet_id * chunks; + else + start = mod * (chunks + 1) + (tasklet_id - mod) * chunks; + } else + start = tasklet_id * chunks; + + // Compute all assigned chunks + for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) { + int t_index_x = start + bl_indx; + int t_index_y = blk - 1 - t_index_x; + + // Move input from MRAM to WRAM + addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = (BL_IN+2); + mram_read((__mram_ptr void const *) addr_input, (void *) cache_input, (BL_IN+2) * sizeof(int32_t)); + addr_input += ((BL+2) * sizeof(int32_t)); + for (int i = 1; i < BL_IN + 1; i++) { + mram_read((__mram_ptr void const *) addr_input, (void *) (cache_input + cache_input_offset), (2) * sizeof(int32_t)); + cache_input_offset += (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + } + + addr_ref = mram_base_addr_ref + (t_index_x * BL * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = 0; + for (int i = 0; i < BL_IN; i++) { + mram_read((__mram_ptr void const *) addr_ref, (void *) (cache_ref + cache_input_offset), (BL_IN) * sizeof(int32_t)); + cache_input_offset += BL_IN; + addr_ref += (BL * sizeof(int32_t)); + } + + // Computation + for (uint32_t i = 1; i < BL_IN + 1; i++) { + for (uint32_t j = 1; j < BL_IN + 1; j++) { + cache_input[i*(BL_IN+2) + j] = maximum(cache_input[(i-1)*(BL_IN+2) + j - 1] + cache_ref[(i-1)*BL_IN + j-1], + cache_input[i*(BL_IN+2) + j - 1] - penalty, + cache_input[(i-1)*(BL_IN+2) + j] - penalty); + } + } + + // Move output from WRAM to MRAM + addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + for (int i = 1; i < BL_IN + 1; i++) { + mram_write((cache_input + cache_input_offset), (__mram_ptr void *) addr_input, (BL_IN+2) * sizeof(int32_t)); + cache_input_offset += (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + } + + } + + barrier_wait(&my_barrier); + } + + // Bottom-right computation + for(uint32_t blk = 2; blk <= REP; blk++) { + // Partition chunks/subblocks of the diagonal to tasklets + chunks = (REP - blk + 1) / NR_TASKLETS; + mod = (REP - blk + 1) % NR_TASKLETS; + if (tasklet_id < mod) + chunks++; + if (mod > 0){ + if(tasklet_id < mod) + start = tasklet_id * chunks; + else + start = mod * (chunks + 1) + (tasklet_id - mod) * chunks; + } else + start = tasklet_id * chunks; + + // Compute all assigned chunks + for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) { + int t_index_x = blk - 1 + start + bl_indx; + int t_index_y = REP + blk - 2 - t_index_x; + + // Move input from MRAM to WRAM + addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = (BL_IN+2); + mram_read((__mram_ptr void const *) addr_input, (void *) cache_input, (BL_IN+2) * sizeof(int32_t)); + addr_input += ((BL+2) * sizeof(int32_t)); + for (int i = 1; i < BL_IN + 1; i++) { + mram_read((__mram_ptr void const *) addr_input, (void *) (cache_input + cache_input_offset), (2) * sizeof(int32_t)); + cache_input_offset += (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + } + + addr_ref = mram_base_addr_ref + (t_index_x * BL * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = 0; + for (int i = 0; i < BL_IN; i++) { + mram_read((__mram_ptr void const *) addr_ref, (void *) (cache_ref + cache_input_offset), (BL_IN) * sizeof(int32_t)); + cache_input_offset += BL_IN; + addr_ref += (BL * sizeof(int32_t)); + } + + + // Computation + for (int i = 1; i < BL_IN + 1; i++) { + for (int j = 1; j < BL_IN + 1; j++) { + cache_input[i*(BL_IN+2) + j] = maximum(cache_input[(i-1)*(BL_IN+2) + j - 1] + cache_ref[(i-1)*BL_IN + j-1], + cache_input[i*(BL_IN+2) + j - 1] - penalty, + cache_input[(i-1)*(BL_IN+2) + j] - penalty); + } + } + + // Move output from WRAM to MRAM + addr_input = mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t)); + cache_input_offset = (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + for (int i = 1; i < BL_IN + 1; i++) { + mram_write(cache_input + cache_input_offset, (__mram_ptr void *) addr_input, (BL_IN+2) * sizeof(int32_t)); + cache_input_offset += (BL_IN+2); + addr_input += ((BL+2) * sizeof(int32_t)); + } + + } + + barrier_wait(&my_barrier); + + } + + mram_base_addr_input_itemsets += ((BL+1) * (BL+2) * sizeof(int32_t)); + mram_base_addr_ref += (BL * BL * sizeof(int32_t)); + } + return 0; +} diff --git a/NW/host/app.c b/NW/host/app.c new file mode 100644 index 0000000..0e899ec --- /dev/null +++ b/NW/host/app.c @@ -0,0 +1,879 @@ +/** +* app.c +* NW Host Application Source File +* +*/ +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <dpu.h> +#include <dpu_log.h> +#include <unistd.h> +#include <getopt.h> +#include <assert.h> + +#include "../support/common.h" +#include "../support/timer.h" +#include "../support/params.h" + +#if ENERGY +#include <dpu_probe.h> +#endif + +// Define the DPU Binary path as DPU_BINARY here +#ifndef DPU_BINARY +#define DPU_BINARY "./bin/nw_dpu" +#endif + +// Traceback in the host +#if PRINT_FILE +static void traceback(int* traceback_output, char *file, int32_t *input_itemsets, int32_t *reference, unsigned int max_rows, unsigned int max_cols, unsigned int penalty) { + FILE *fpo = fopen(file, "w"); // Use to print to an output file +#else +static void traceback(int* traceback_output, int32_t *input_itemsets, int32_t *reference, unsigned int max_rows, unsigned int max_cols, unsigned int penalty) { +#endif + + int k = 0; + for (int i = max_rows - 2, j = max_rows - 2; i>=0 && j>=0;) { + int nw = 0, n = 0, w = 0, traceback = 0; +#if PRINT_FILE + if ( i == (int)max_rows - 2 && j == (int)max_rows - 2 ) + fprintf(fpo, "%d ", input_itemsets[ i * max_cols + j]); //print the first element +#endif + + if (i == 0 && j == 0) + break; + if (i > 0 && j > 0) { + nw = input_itemsets[(i - 1) * max_cols + j - 1]; + w = input_itemsets[i * max_cols + j - 1]; + n = input_itemsets[(i - 1) * max_cols + j]; + } else if (i == 0) { + nw = n = LIMIT; + w = input_itemsets[ i * max_cols + j - 1 ]; + } else if (j == 0) { + nw = w = LIMIT; + n = input_itemsets[(i - 1) * max_cols + j]; + } else { + ; + } + + int new_nw, new_w, new_n; + new_nw = nw + reference[i * max_cols + j]; + new_w = w - penalty; + new_n = n - penalty; + + traceback = maximum(new_nw, new_w, new_n); + if (traceback == new_nw) + traceback = nw; + if (traceback == new_w) + traceback = w; + if (traceback == new_n) + traceback = n; + +#if PRINT_FILE + fprintf(fpo, "%d ", traceback); +#endif + traceback_output[k++] = traceback; + + if (traceback == nw) { + i--; + j--; + continue; + } else if (traceback == w) { + j--; + continue; + } else if (traceback == n) { + i--; + continue; + } else { + ; + } + } + + return; +} + +// Compute output in the host +static void nw_host(int32_t *input_itemsets, int32_t *reference, uint64_t max_cols, unsigned int penalty) { + + int32_t *input_itemsets_l = (int32_t *) malloc((BL + 1) * (BL + 1) * sizeof(int32_t)); + int32_t *reference_l = (int32_t *) malloc((BL * BL) * sizeof(int32_t)); + + + // top-left + for (uint64_t blk = 1; blk <= (max_cols-1)/BL; blk++) { + for (uint64_t b_index_x = 0; b_index_x < blk; b_index_x++) { + uint64_t b_index_y = blk - 1 - b_index_x; + + for (uint64_t i = 0; i < BL; i++){ + for (uint64_t j = 0; j < BL; j++) { + reference_l[i*BL + j] = reference[(max_cols-1) * (b_index_y*BL + i) + b_index_x*BL + j]; + } + } + + for (uint64_t i = 0; i < BL + 1; i++){ + for (uint64_t j = 0; j < BL + 1; j++) { + input_itemsets_l[i*(BL + 1) + j] = input_itemsets[max_cols*(b_index_y*BL + i) + b_index_x*BL + j]; + } + } + + // Computation + for (uint64_t i = 1; i < BL + 1; i++) { + for (uint64_t j = 1; j < BL + 1; j++) { + input_itemsets_l[i*(BL + 1) + j] = maximum(input_itemsets_l[(i-1)*(BL+1) + j - 1] + reference_l[(i-1)*BL + j - 1], + input_itemsets_l[i*(BL+1) + j - 1] - penalty, + input_itemsets_l[(i-1)*(BL+1) + j] - penalty); + } + } + + for (uint64_t i = 0; i < BL; i++) { + for (uint64_t j = 0; j < BL; j++) { + input_itemsets[max_cols*(b_index_y*BL + i + 1) + b_index_x*BL + j + 1] = input_itemsets_l[(i+1)*(BL+1) + j + 1]; + } + } + + } + + } + + // bottom-right + for (uint64_t blk = 2; blk <= (max_cols-1)/BL; blk++) { + for (uint64_t b_index_x = blk - 1; b_index_x < (max_cols-1)/BL; b_index_x++) { + uint64_t b_index_y = (max_cols-1)/BL + blk - 2 - b_index_x; + + for (uint64_t i = 0; i < BL; i++){ + for (uint64_t j = 0; j < BL; j++) { + reference_l[i*BL + j] = reference[(max_cols-1)*(b_index_y*BL + i) + b_index_x*BL + j]; + } + } + + for (uint64_t i = 0; i < BL + 1; i++){ + for (uint64_t j = 0; j < BL + 1; j++) { + input_itemsets_l[i*(BL + 1) + j] = input_itemsets[max_cols*(b_index_y*BL + i) + b_index_x*BL + j]; + } + } + + // Computation + for (uint64_t i = 1; i < BL + 1; i++) { + for (uint64_t j = 1; j < BL + 1; j++) { + input_itemsets_l[i*(BL + 1) + j] = maximum(input_itemsets_l[(i-1)*(BL+1) + j - 1] + reference_l[(i-1)*BL + j - 1], + input_itemsets_l[i*(BL+1) + j - 1] - penalty, + input_itemsets_l[(i-1)*(BL+1) + j] - penalty); + } + } + + for (uint64_t i = 0; i < BL; i++) { + for (uint64_t j = 0; j < BL; j++) { + input_itemsets[max_cols*(b_index_y*BL + i + 1) + b_index_x*BL + j + 1] = input_itemsets_l[(i+1)*(BL+1) + j + 1]; + } + } + + } + + } + + + free(input_itemsets_l); + free(reference_l); + return; +} + +// Main of the Host Application +int main(int argc, char **argv) { + + struct Params p = input_params(argc, argv); + struct dpu_set_t dpu_set, dpu; + uint32_t nr_of_dpus, max_dpus; + +#if ENERGY + struct dpu_probe_t probe; + DPU_ASSERT(dpu_probe_init("energy probe", &probe)); +#endif + + // Allocate DPUs and load binary + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + printf("Allocated %d DPU(s)\n", nr_of_dpus); + printf("Allocated %d TASKLET(s) per DPU\n", NR_TASKLETS); +#if DYNAMIC + max_dpus = nr_of_dpus; +#endif + + uint64_t max_rows = p.max_rows + 1; + uint64_t max_cols = p.max_rows + 1; + unsigned int penalty = p.penalty; + int32_t *reference = (int32_t *) malloc(max_rows * max_cols * sizeof(int32_t)); + int32_t *input_itemsets_host = (int32_t *) malloc(max_rows * max_cols * sizeof(int32_t)); + int32_t *input_itemsets = (int32_t *) malloc((max_rows+1) * (max_cols+1) * sizeof(int32_t)); + dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t)); + printf("Max size %d\n", p.max_rows); + + // Traceback output + int32_t* traceback_output = (int32_t *) malloc((max_rows + max_cols) * sizeof(int32_t)); + int32_t* traceback_output_host = (int32_t *) malloc((max_rows + max_cols) * sizeof(int32_t)); + memset(traceback_output, 0, (max_rows + max_cols) * sizeof(int32_t)); + memset(traceback_output_host, 0, (max_rows + max_cols) * sizeof(int32_t)); + + // This array is used for dummy/stale CPU-DPU transfers + int32_t *dummy = (int32_t *) malloc(nr_of_dpus * (BL+2) * sizeof(int32_t)); + unsigned int blocks_per_dpu; + unsigned int mram_offset = 0; + + // Timer + Timer timer; + Timer long_diagonal_timer; +#if ENERGY + double tacc_energy, tacc_time, tavg_time; + double tavg_energy=0; +#endif + + for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + + // Initializing inputs are needed at each iteration + // Initialize input itemsets + for(unsigned int i = 0; i < max_rows; i++) { + for (unsigned int j = 0; j < max_cols; j++) { + input_itemsets_host[i * max_cols + j] = 0; + } + } + + for(unsigned int i = 0; i <= max_rows; i++) { + for (unsigned int j = 0; j <= max_cols; j++) { + input_itemsets[i * (max_cols+1) + j] = 0; + } + } + + // Define random sequences + srand(7); + for (unsigned int i = 1; i < max_rows; i++) { + input_itemsets_host[i * max_cols] = rand() % 10 + 1; + } + + for (unsigned int j = 1; j < max_cols; j++) { + input_itemsets_host[j] = rand() % 10 + 1; + } + + for (unsigned int i = 0; i < max_rows-1; i++) { + for (unsigned int j = 0; j < max_cols-1; j++) { + reference[i * (max_cols-1) + j] = blosum62[input_itemsets[(i+1) * max_cols]][input_itemsets[j+1]]; + } + } + + for (unsigned int i = 1; i < max_rows; i++) { + input_itemsets_host[i * max_cols] = -i * penalty; + input_itemsets[i * (max_cols+1)] = -i * penalty; + } + + for (unsigned int j = 1; j < max_cols; j++) { + input_itemsets_host[j] = -j * penalty; + input_itemsets[j] = -j * penalty; + } + + if (rep >= p.n_warmup) + start(&timer, 0, rep - p.n_warmup); + // Computation on host CPU + nw_host(input_itemsets_host, reference, max_cols, penalty); + + // Print host output +#if PRINT_FILE + if (rep >= p.n_warmup) { + char *host_file = "./bin/host_output.txt"; + traceback(traceback_output_host, host_file, input_itemsets_host, reference, max_rows, max_cols, penalty); + } +#endif + if (rep >= p.n_warmup) + stop(&timer, 0); + + // Top-left computation on DPUs + for (unsigned int blk = 1; blk <= (max_cols-1)/BL; blk++) { +#if DYNAMIC + // If nr_of_blocks are lower than max_dpus, + // set nr_of_dpus to be equal with nr_of_blocks + unsigned nr_of_blocks = blk; + if (nr_of_blocks < max_dpus) { + DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_alloc(nr_of_blocks, NULL, &dpu_set)); + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + } else if (nr_of_dpus == max_dpus) { + ; + } else { + DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_alloc(max_dpus, NULL, &dpu_set)); + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + } +#if PRINT + printf("Allocated %d DPU(s) for %d (%d) blocks\n", nr_of_dpus, nr_of_blocks, blk); +#endif +#endif + + // Copy data to DPUs + unsigned int i=0; + DPU_FOREACH(dpu_set, dpu, i) { + unsigned int blocks_per_dpu = blk / nr_of_dpus; + unsigned int active_blocks_per_dpu = blk / nr_of_dpus; + unsigned int rest_blocks = blk % nr_of_dpus; + if(i < rest_blocks) + blocks_per_dpu++; + + if(rest_blocks != 0) + active_blocks_per_dpu++; + + // Copy input arguments to dpu + input_args[i].nblocks = blocks_per_dpu; + input_args[i].active_blocks = active_blocks_per_dpu; + input_args[i].penalty = penalty; + DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT)); + + // Copy itemsets to DPUs + blocks_per_dpu = blk / nr_of_dpus; + if (blk % nr_of_dpus != 0) + blocks_per_dpu++; + mram_offset = 0; + + + if (rep >= p.n_warmup) { + if ((max_cols-1)/BL == 1) + start(&timer, 2, rep - p.n_warmup + blk - 1); + else + start(&timer, 1, rep - p.n_warmup + blk - 1); + + // Timer for longest diagonal + if (blk == ((max_cols-1)/BL)) { + if ((max_cols-1)/BL == 1) + start(&long_diagonal_timer, 2, rep - p.n_warmup); + else + start(&long_diagonal_timer, 1, rep - p.n_warmup); + } + } + +#if PRINT + uint64_t total_dpu_memory = 0; + total_dpu_memory = (uint64_t) blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t) + (uint64_t) blocks_per_dpu * BL * BL * sizeof(int32_t); + printf("Total memory allocated in each DPU %u bytes\n", total_dpu_memory); +#endif + for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) { + for (unsigned int bl = 0; bl < BL + 1; bl++) { + + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + unsigned int chunks = blk / nr_of_dpus; + unsigned int prev_block_index = 0; + unsigned int rest_blocks = blk % nr_of_dpus; + if (rest_blocks > 0) { + if (i >= rest_blocks) { + prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks; + } else { + prev_block_index = i * (chunks + 1); + } + } else { + prev_block_index = i * blocks_per_dpu; + } + + uint64_t input_itemsets_offset = 0; + int32_t *dpu_pointer; + if (i + bl_indx * nr_of_dpus >= blk) { + dpu_pointer = dummy; + input_itemsets_offset = 0; + } else { + uint64_t b_index_x = prev_block_index + bl_indx; + uint64_t b_index_y = blk - 1 - b_index_x; + dpu_pointer = input_itemsets; + input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1); + } + + DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset)); + } + + if (bl == 0) + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT)); + else + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, 2 * sizeof(int32_t), DPU_XFER_DEFAULT)); + mram_offset += ((BL+2) * sizeof(int32_t)); + + } + } + if (rep >= p.n_warmup) { + if ((max_cols-1)/BL == 1) + stop(&timer, 2); + else + stop(&timer, 1); + // Timer for longest diagonal + if (blk == ((max_cols-1)/BL)) { + if ((max_cols-1)/BL == 1) + stop(&long_diagonal_timer, 2); + else + stop(&long_diagonal_timer, 1); + } + } + + + if (rep >= p.n_warmup) { + start(&timer, 2, rep - p.n_warmup + blk - 1); + // Timer for longest diagonal + if (blk == ((max_cols-1)/BL)) { + start(&long_diagonal_timer, 2, rep - p.n_warmup); + } + } + // Copy reference to DPUs + mram_offset = blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t); + for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) { + for (unsigned int bl = 0; bl < BL; bl++) { + + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + unsigned int chunks = blk / nr_of_dpus; + unsigned int prev_block_index = 0; + unsigned int rest_blocks = blk % nr_of_dpus; + if (rest_blocks > 0) { + if (i >= rest_blocks) { + prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks; + } else { + prev_block_index = i * (chunks + 1); + } + } else { + prev_block_index = i * blocks_per_dpu; + } + + uint64_t reference_offset = 0; + int32_t *dpu_pointer; + if (i + bl_indx * nr_of_dpus >= blk) { + dpu_pointer = dummy; + reference_offset = 0; + } else { + uint64_t b_index_x = prev_block_index + bl_indx; + uint64_t b_index_y = blk - 1 - b_index_x; + dpu_pointer = reference; + reference_offset = b_index_y * (max_cols - 1) * BL + b_index_x * BL + bl * (max_cols - 1); + } + + DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + reference_offset)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, BL * sizeof(int32_t), DPU_XFER_DEFAULT)); + mram_offset += BL * sizeof(int32_t); + + } + } + if (rep >= p.n_warmup) { + stop(&timer, 2); + if (blk == ((max_cols-1)/BL)) { + stop(&long_diagonal_timer, 2); + } + } + +#if ENERGY + if (rep >= p.n_warmup) { + DPU_ASSERT(dpu_probe_start(&probe)); + } +#endif + if (rep >= p.n_warmup) { + start(&timer, 3, rep - p.n_warmup + blk - 1); + // Timer for longest diagonal + if (blk == ((max_cols-1)/BL)) { + start(&long_diagonal_timer, 3, rep - p.n_warmup); + } + } + // Launch kernel on DPUs + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + if (rep >= p.n_warmup) { + stop(&timer, 3); + // Timer for longest diagonal + if (blk == ((max_cols-1)/BL)) { + stop(&long_diagonal_timer, 3); + } + } +#if ENERGY + if (rep >= p.n_warmup) { + DPU_ASSERT(dpu_probe_stop(&probe)); + } +#endif + +#if ENERGY + double acc_energy, avg_energy, acc_time, avg_time; + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy)); + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy)); + DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time)); + DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time)); + tavg_energy += avg_energy; +#endif + +#if PRINT + // Display DPU Logs + DPU_FOREACH(dpu_set, dpu) { + DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout)); + } +#endif + + if (rep >= p.n_warmup) { + start(&timer, 4, rep - p.n_warmup + blk - 1); + // Timer for longest diagonal + if (blk == ((max_cols-1)/BL)) { + start(&long_diagonal_timer, 4, rep - p.n_warmup); + } + } + // Retrieve results + // Copy output result to Host CPU + mram_offset = 0; + for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) { + for (unsigned int bl = 0; bl < BL + 1; bl++) { + + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + unsigned int chunks = blk / nr_of_dpus; + unsigned int prev_block_index = 0; + unsigned int rest_blocks = blk % nr_of_dpus; + if (rest_blocks > 0) { + if (i >= rest_blocks) { + prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks; + } else { + prev_block_index = i * (chunks + 1); + } + } else { + prev_block_index = i * blocks_per_dpu; + } + + uint64_t input_itemsets_offset = 0; + int32_t *dpu_pointer; + if (i + bl_indx * nr_of_dpus >= blk) { + dpu_pointer = dummy; + input_itemsets_offset = 0; + } else { + uint64_t b_index_x = prev_block_index + bl_indx; + uint64_t b_index_y = blk - 1 - b_index_x; + dpu_pointer = input_itemsets; + input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1); + } + + if (bl == 0) // Skip the first row of the block + continue; + DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset)); + + } + if (bl == 0) { + mram_offset += (BL+2) * sizeof(int32_t); + continue; + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT)); + mram_offset += (BL+2) * sizeof(int32_t); + + } + } + if (rep >= p.n_warmup) { + stop(&timer, 4); + // Timer for longest diagonal + if (blk == ((max_cols-1)/BL)) { + stop(&long_diagonal_timer, 4); + } + } + } + + + // Bottom-right computation on DPUs + for (unsigned int blk = 2; blk <= (max_cols-1)/BL; blk++) { +#if DYNAMIC + // If nr_of_blocks are lower than max_dpus, + // set nr_of_dpus to be equal with nr_of_blocks + unsigned nr_of_blocks = (((max_cols-1)/BL) - blk + 1); + if (nr_of_blocks < max_dpus) { + DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_alloc(nr_of_blocks, NULL, &dpu_set)); + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + } else if (nr_of_dpus == max_dpus) { + ; + } else { + DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_alloc(max_dpus, NULL, &dpu_set)); + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + } +#if PRINT + printf("Allocated %d DPU(s) for %d (%d) blocks\n", nr_of_dpus, nr_of_blocks, (((max_cols-1)/BL) - blk + 1)); +#endif +#endif + + // Copy data to DPUs + unsigned int i=0; + DPU_FOREACH(dpu_set, dpu, i) { + unsigned int blocks_per_dpu = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus; + unsigned int active_blocks_per_dpu = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus; + unsigned int rest_blocks = (((max_cols-1)/BL) - blk + 1) % nr_of_dpus; + if(i < rest_blocks) + blocks_per_dpu++; + + if(rest_blocks != 0) + active_blocks_per_dpu++; + + // Copy input arguments to dpu + input_args[i].nblocks = blocks_per_dpu; + input_args[i].active_blocks = active_blocks_per_dpu; + input_args[i].penalty = penalty; + DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i)); + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT)); + + if (rep >= p.n_warmup) + start(&timer, 1, rep - p.n_warmup + blk - 1); + // Copy itemsets to DPUs + unsigned int blocks_per_dpu = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus; + if ((((max_cols-1)/BL) - blk + 1) % nr_of_dpus != 0) + blocks_per_dpu++; +#if PRINT + uint64_t total_dpu_memory = 0; + total_dpu_memory = (uint64_t) blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t) + (uint64_t) blocks_per_dpu * BL * BL * sizeof(int32_t); + printf("Total memory allocated in each DPU %u bytes\n", total_dpu_memory); +#endif + unsigned int mram_offset = 0; + for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) { + for (unsigned int bl = 0; bl < BL + 1; bl++) { + + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + unsigned int chunks = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus; + unsigned int prev_block_index = 0; + unsigned int rest_blocks = (((max_cols-1)/BL) - blk + 1) % nr_of_dpus; + if (rest_blocks > 0) { + if (i >= rest_blocks) { + prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks; + } else { + prev_block_index = i * (chunks + 1); + } + } else { + prev_block_index = i * blocks_per_dpu; + } + + uint64_t input_itemsets_offset = 0; + int32_t *dpu_pointer; + if (i + bl_indx * nr_of_dpus >= (((max_cols-1)/BL) - blk + 1)) { + dpu_pointer = dummy; + input_itemsets_offset = 0; + } else { + uint64_t b_index_x = blk - 1 + prev_block_index + bl_indx; + uint64_t b_index_y = (max_cols-1)/BL + blk - 2 - b_index_x; + dpu_pointer = input_itemsets; + input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1); + } + + DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset)); + } + + if (bl == 0) + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT)); + else + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, 2 * sizeof(int32_t), DPU_XFER_DEFAULT)); + mram_offset += (BL+2) * sizeof(int32_t); + + } + } + if (rep >= p.n_warmup) + stop(&timer, 1); + + + if (rep >= p.n_warmup) + start(&timer, 2, rep - p.n_warmup + blk - 1); + // Copy reference to DPUs + mram_offset = blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t); + for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) { + for (unsigned int bl = 0; bl < BL; bl++) { + + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + unsigned int chunks = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus; + unsigned int prev_block_index = 0; + unsigned int rest_blocks = (((max_cols-1)/BL) - blk + 1) % nr_of_dpus; + if (rest_blocks > 0) { + if (i >= rest_blocks) { + prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks; + } else { + prev_block_index = i * (chunks + 1); + } + } else { + prev_block_index = i * blocks_per_dpu; + } + + uint64_t reference_offset = 0; + int32_t *dpu_pointer; + if (i + bl_indx * nr_of_dpus >= (((max_cols-1)/BL) - blk + 1)) { + dpu_pointer = dummy; + reference_offset = 0; + } else { + uint64_t b_index_x = blk - 1 + prev_block_index + bl_indx; + uint64_t b_index_y = (max_cols-1)/BL + blk - 2 - b_index_x; + dpu_pointer = reference; + reference_offset = b_index_y * (max_cols - 1) * BL + b_index_x * BL + bl * (max_cols - 1); + } + + DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + reference_offset)); + } + + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, BL * sizeof(int32_t), DPU_XFER_DEFAULT)); + mram_offset += BL * sizeof(int32_t); + + } + } + if (rep >= p.n_warmup) + stop(&timer, 2); + +#if ENERGY + if (rep >= p.n_warmup) { + DPU_ASSERT(dpu_probe_start(&probe)); + } +#endif + if (rep >= p.n_warmup) + start(&timer, 3, rep - p.n_warmup + blk - 1); // Do not re-initialize the counter + // Launch kernel on DPUs + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + if (rep >= p.n_warmup) + stop(&timer, 3); +#if ENERGY + if (rep >= p.n_warmup) { + DPU_ASSERT(dpu_probe_stop(&probe)); + } +#endif + +#if ENERGY + double acc_energy, avg_energy, acc_time, avg_time; + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy)); + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy)); + DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time)); + DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time)); + tavg_energy += avg_energy; +#endif + +#if PRINT + // Display DPU Logs + DPU_FOREACH(dpu_set, dpu) { + DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout)); + } +#endif + + + if (rep >= p.n_warmup) + start(&timer, 4, rep - p.n_warmup + blk - 1); + // Retrieve results + // Copy output result to Host CPU + mram_offset = 0; + for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) { + for (unsigned int bl = 0; bl < BL + 1; bl++) { + + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + unsigned int chunks = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus; + unsigned int prev_block_index = 0; + unsigned int rest_blocks = (((max_cols-1)/BL) - blk + 1) % nr_of_dpus; + if (rest_blocks > 0) { + if (i >= rest_blocks) { + prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks; + } else { + prev_block_index = i * (chunks + 1); + } + } else { + prev_block_index = i * blocks_per_dpu; + } + + uint64_t input_itemsets_offset = 0; + int32_t *dpu_pointer; + if (i + bl_indx * nr_of_dpus >= (((max_cols-1)/BL) - blk + 1)) { + dpu_pointer = dummy; + input_itemsets_offset = 0; + } else { + uint64_t b_index_x = blk - 1 + prev_block_index + bl_indx; + uint64_t b_index_y = (max_cols-1)/BL + blk - 2 - b_index_x; + dpu_pointer = input_itemsets; + input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1); + } + + if (bl == 0) // Skip the first row of the block + continue; + DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset)); + + } + + if (bl == 0) { + mram_offset += (BL+2) * sizeof(int32_t); + continue; + } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT)); + mram_offset += (BL+2) * sizeof(int32_t); + + } + } + if (rep >= p.n_warmup) + stop(&timer, 4); + + + } + + // Traceback step + if (rep >= p.n_warmup) + start(&timer, 1, 1); +#if PRINT_FILE + char *dpu_file = "./bin/dpu_output.txt"; + traceback(traceback_output, dpu_file, input_itemsets, reference, max_rows+1, max_cols+1, penalty); +#else + traceback(traceback_output, input_itemsets, reference, max_rows+1, max_cols+1, penalty); +#endif + if (rep >= p.n_warmup) + stop(&timer, 1); + + } + + // Print timing results + printf("CPU version "); + print(&timer, 0, p.n_reps); + printf("CPU-DPU "); + print(&timer, 2, p.n_reps); + printf("DPU Kernel "); + print(&timer, 3, p.n_reps); + printf("Inter-DPU "); + print(&timer, 1, p.n_reps); + printf("DPU-CPU "); + print(&timer, 4, p.n_reps); + printf("\n"); + printf("Longest Diagonal CPU-DPU "); + print(&long_diagonal_timer, 2, p.n_reps); + printf("Longest Diagonal DPU Kernel "); + print(&long_diagonal_timer, 3, p.n_reps); + printf("Longest Diagonal Inter-DPU "); + print(&long_diagonal_timer, 1, p.n_reps); + printf("Longest Diagonal DPU-CPU "); + print(&long_diagonal_timer, 4, p.n_reps); + printf("\n"); + +#if ENERGY + printf("DPU Energy (J): %f \t ", tavg_energy / p.n_reps); +#endif + + // Check output + bool status = true; + for (uint64_t i = 1; i < max_rows; i++) { + for (uint64_t j = 1; j < max_cols; j++) { + if (input_itemsets_host[i*max_cols + j] != input_itemsets[i*(max_cols+1) + j]) { + status = false; +#if PRINT + printf("%ld (%ld, %ld): %d %d\n", i*max_cols + j, i, j, input_itemsets_host[i*max_cols + j], input_itemsets[i*(max_cols+1) + j]); +#endif + } + } + } + + if (status) { + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); + } else { + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); + } + + free(input_itemsets_host); + free(input_itemsets); + free(reference); + free(traceback_output); + free(traceback_output_host); + DPU_ASSERT(dpu_free(dpu_set)); + return status ? 0 : -1; + return 0; +} diff --git a/NW/support/common.h b/NW/support/common.h new file mode 100755 index 0000000..69069e7 --- /dev/null +++ b/NW/support/common.h @@ -0,0 +1,76 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +// Structures used by both the host and the dpu to communicate information +typedef struct { + uint32_t nblocks; + uint32_t active_blocks; + uint32_t penalty; + uint32_t dummy; +} dpu_arguments_t; + +#ifndef BL +#define BL 16 +#endif + +// Data type +#define T int32_t + +// MAX +int32_t maximum(int32_t a, int32_t b, int32_t c) { + + int32_t k; + if (a <= b) + k = b; + else + k = a; + + if (k <= c) + return c; + else + return k; + +} + +#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_RESET "\x1b[0m" + +#define LIMIT -999 + +int blosum62[24][24] = { + { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, + {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, + {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, + {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, + {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, + {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, + {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, + {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, + {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, + {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, + {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, + {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, + {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, + { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, + { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, + {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, + {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, + { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, + {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, + {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, + { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, + {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1} +}; + +#define DYNAMIC 1 +#define PRINT 0 +#define PRINT_FILE 0 +#ifndef ENERGY +#define ENERGY 0 +#endif +#endif diff --git a/NW/support/params.h b/NW/support/params.h new file mode 100644 index 0000000..8874248 --- /dev/null +++ b/NW/support/params.h @@ -0,0 +1,56 @@ +#ifndef _PARAMS_H_ +#define _PARAMS_H_ + +#include "common.h" + +typedef struct Params { + unsigned int max_rows; + unsigned int penalty; + unsigned int n_warmup; + unsigned int n_reps; +} Params; + +static void usage() { + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n" + "\nBenchmark-specific options:" + "\n -n <N> size of sequence: length of the sequence" + "\n -p <P> penalty: a positive integer" + "\n"); +} + +struct Params input_params(int argc, char **argv) { + struct Params p; + p.n_warmup = 1; + p.n_reps = 3; + p.max_rows = 256; + p.penalty = 1; + + int opt; + while((opt = getopt(argc, argv, "hw:e:n:p:")) >= 0) { + switch(opt) { + case 'h': + usage(); + exit(0); + break; + case 'w': p.n_warmup = atoi(optarg); break; + case 'e': p.n_reps = atoi(optarg); break; + case 'n': p.max_rows = atoi(optarg); break; + case 'p': p.penalty = atoi(optarg); break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(NR_DPUS > 0 && "Invalid # of dpus!"); + + return p; +} +#endif diff --git a/NW/support/timer.h b/NW/support/timer.h new file mode 100755 index 0000000..efaefcd --- /dev/null +++ b/NW/support/timer.h @@ -0,0 +1,59 @@ +/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer{
+
+ struct timeval startTime[5];
+ struct timeval stopTime[5];
+ double time[5];
+
+}Timer;
+
+void start(Timer *timer, int i, int rep) {
+ if(rep == 0) {
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i) {
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+}
+
+void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
|