PrIM -- first commit

author: Juan Gomez Luna <juan.gomez@safari.ethz.ch> 2021-06-16 19:46:05 +0200
committer: Juan Gomez Luna <juan.gomez@safari.ethz.ch> 2021-06-16 19:46:05 +0200
commit: 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
tree: fc6776a94549d2d4039898f183dbbeb2ce013ba9 /NW
parent: ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)
21 files changed, 2680 insertions, 0 deletions
diff --git a/NW/Makefile b/NW/Makefile
new file mode 100644
index 0000000..68f495a
--- /dev/null
+++ b/NW/Makefile
@@ -0,0 +1,46 @@
+DPU_DIR := dpu
+HOST_DIR := host
+BUILDDIR ?= bin
+NR_TASKLETS ?= 13
+BL ?= 1024 
+BL_IN ?= 4 
+NR_DPUS ?= 1 
+ENERGY ?= 0
+
+define conf_filename
+	${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3).conf
+endef
+CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL})
+
+HOST_TARGET := ${BUILDDIR}/nw_host
+DPU_TARGET := ${BUILDDIR}/nw_dpu
+
+COMMON_INCLUDES := support
+HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
+DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)
+
+.PHONY: all clean test
+
+__dirs := $(shell mkdir -p ${BUILDDIR})
+
+COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
+HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DENERGY=${ENERGY}
+DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -DBL_IN=${BL_IN}
+
+all: ${HOST_TARGET} ${DPU_TARGET}
+
+${CONF}:
+	$(RM) $(call conf_filename,*,*)
+	touch ${CONF}
+
+${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
+	$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
+
+${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
+	dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
+
+clean:
+	$(RM) -r $(BUILDDIR)
+
+test: all
+	./${HOST_TARGET}
diff --git a/NW/baselines/cpu/Makefile b/NW/baselines/cpu/Makefile
new file mode 100644
index 0000000..f49dd2f
--- /dev/null
+++ b/NW/baselines/cpu/Makefile
@@ -0,0 +1,16 @@
+# C compiler
+CC = g++
+ICC = icc
+CC_FLAGS = -g -O3 -fopenmp
+OFFLOAD_CC_FLAGS = -offload-option,mic,compiler,"-no-opt-prefetch"
+
+all: needle needle_offload
+
+needle: 
+	$(CC) $(CC_FLAGS) needle.cpp -o needle 
+
+needle_offload:
+	$(ICC) $(CC_FLAGS) $(OFFLOAD_CC_FLAGS) -DOMP_OFFLOAD needle.cpp -o needle_offload
+
+clean:
+	rm -f needle needle_offload
diff --git a/NW/baselines/cpu/README b/NW/baselines/cpu/README
new file mode 100644
index 0000000..f3fd2c2
--- /dev/null
+++ b/NW/baselines/cpu/README
@@ -0,0 +1,9 @@
+Needleman-Wunsch (NW)
+
+Compilation instructions
+
+    make
+
+Execution instructions
+
+    ./needle 46080 10 4
diff --git a/NW/baselines/cpu/needle.cpp b/NW/baselines/cpu/needle.cpp
new file mode 100644
index 0000000..0f1e2b6
--- /dev/null
+++ b/NW/baselines/cpu/needle.cpp
@@ -0,0 +1,382 @@
+#define LIMIT -999
+//#define TRACE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include <omp.h>
+#define OPENMP
+//#define NUM_THREAD 4
+
+#define BLOCK_SIZE 16
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest( int argc, char** argv);
+
+// Returns the current system time in microseconds 
+long long get_time()
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (tv.tv_sec * 1000000) + tv.tv_usec;
+
+}
+
+#ifdef OMP_OFFLOAD
+#pragma omp declare target
+#endif
+int maximum( int a,
+        int b,
+        int c){
+
+    int k;
+    if( a <= b )
+        k = b;
+    else 
+        k = a;
+
+    if( k <=c )
+        return(c);
+    else
+        return(k);
+}
+#ifdef OMP_OFFLOAD
+#pragma omp end declare target
+#endif
+
+
+int blosum62[24][24] = {
+    { 4, -1, -2, -2,  0, -1, -1,  0, -2, -1, -1, -1, -1, -2, -1,  1,  0, -3, -2,  0, -2, -1,  0, -4},
+    {-1,  5,  0, -2, -3,  1,  0, -2,  0, -3, -2,  2, -1, -3, -2, -1, -1, -3, -2, -3, -1,  0, -1, -4},
+    {-2,  0,  6,  1, -3,  0,  0,  0,  1, -3, -3,  0, -2, -3, -2,  1,  0, -4, -2, -3,  3,  0, -1, -4},
+    {-2, -2,  1,  6, -3,  0,  2, -1, -1, -3, -4, -1, -3, -3, -1,  0, -1, -4, -3, -3,  4,  1, -1, -4},
+    { 0, -3, -3, -3,  9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
+    {-1,  1,  0,  0, -3,  5,  2, -2,  0, -3, -2,  1,  0, -3, -1,  0, -1, -2, -1, -2,  0,  3, -1, -4},
+    {-1,  0,  0,  2, -4,  2,  5, -2,  0, -3, -3,  1, -2, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4},
+    { 0, -2,  0, -1, -3, -2, -2,  6, -2, -4, -4, -2, -3, -3, -2,  0, -2, -2, -3, -3, -1, -2, -1, -4},
+    {-2,  0,  1, -1, -3,  0,  0, -2,  8, -3, -3, -1, -2, -1, -2, -1, -2, -2,  2, -3,  0,  0, -1, -4},
+    {-1, -3, -3, -3, -1, -3, -3, -4, -3,  4,  2, -3,  1,  0, -3, -2, -1, -3, -1,  3, -3, -3, -1, -4},
+    {-1, -2, -3, -4, -1, -2, -3, -4, -3,  2,  4, -2,  2,  0, -3, -2, -1, -2, -1,  1, -4, -3, -1, -4},
+    {-1,  2,  0, -1, -3,  1,  1, -2, -1, -3, -2,  5, -1, -3, -1,  0, -1, -3, -2, -2,  0,  1, -1, -4},
+    {-1, -1, -2, -3, -1,  0, -2, -3, -2,  1,  2, -1,  5,  0, -2, -1, -1, -1, -1,  1, -3, -1, -1, -4},
+    {-2, -3, -3, -3, -2, -3, -3, -3, -1,  0,  0, -3,  0,  6, -4, -2, -2,  1,  3, -1, -3, -3, -1, -4},
+    {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4,  7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
+    { 1, -1,  1,  0, -1,  0,  0,  0, -1, -2, -2,  0, -1, -2, -1,  4,  1, -3, -2, -2,  0,  0,  0, -4},
+    { 0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1,  1,  5, -2, -2,  0, -1, -1,  0, -4},
+    {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1,  1, -4, -3, -2, 11,  2, -3, -4, -3, -2, -4},
+    {-2, -2, -2, -3, -2, -1, -2, -3,  2, -1, -1, -2, -1,  3, -3, -2, -2,  2,  7, -1, -3, -2, -1, -4},
+    { 0, -3, -3, -3, -1, -2, -2, -3, -3,  3,  1, -2,  1, -1, -2, -2,  0, -3, -1,  4, -3, -2, -1, -4},
+    {-2, -1,  3,  4, -3,  0,  1, -1,  0, -3, -4,  0, -3, -3, -2,  0, -1, -4, -3, -3,  4,  1, -1, -4},
+    {-1,  0,  0,  1, -3,  3,  4, -2,  0, -3, -3,  1, -1, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4},
+    { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2,  0,  0, -2, -1, -1, -1, -1, -1, -4},
+    {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,  1}
+};
+
+double gettime() {
+    struct timeval t;
+    gettimeofday(&t,NULL);
+    return t.tv_sec+t.tv_usec*1e-6;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+    int
+main( int argc, char** argv) 
+{
+    runTest( argc, argv);
+
+    return EXIT_SUCCESS;
+}
+
+void usage(int argc, char **argv)
+{
+    fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> <num_threads>\n", argv[0]);
+    fprintf(stderr, "\t<dimension>      - x and y dimensions\n");
+    fprintf(stderr, "\t<penalty>        - penalty(positive integer)\n");
+    fprintf(stderr, "\t<num_threads>    - no. of threads\n");
+    exit(1);
+}
+
+void nw_optimized(int *input_itemsets, int *output_itemsets, int *referrence,
+        int max_rows, int max_cols, int penalty)
+{
+#ifdef OMP_OFFLOAD
+    int transfer_size = max_rows * max_cols;
+#pragma omp target data map(to: max_cols, penalty, referrence[0:transfer_size]) map(input_itemsets[0:transfer_size])
+    {
+
+#pragma omp target 
+#endif
+        for( int blk = 1; blk <= (max_cols-1)/BLOCK_SIZE; blk++ )
+        {
+#ifdef OPENMP
+#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty)
+#endif
+            for( int b_index_x = 0; b_index_x < blk; ++b_index_x)
+            {
+                int b_index_y = blk - 1 - b_index_x;
+                int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64)));
+                int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64)));
+
+                // Copy referrence to local memory
+                for ( int i = 0; i < BLOCK_SIZE; ++i )
+                {
+#pragma omp simd
+                    for ( int j = 0; j < BLOCK_SIZE; ++j)
+                    {
+                        reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE +  j + 1];
+                    }
+                }
+
+                // Copy input_itemsets to local memory
+                for ( int i = 0; i < BLOCK_SIZE + 1; ++i )
+                {
+#pragma omp simd
+                    for ( int j = 0; j < BLOCK_SIZE + 1; ++j)
+                    {
+                        input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE +  j];
+                    }
+                }
+
+                // Compute
+                for ( int i = 1; i < BLOCK_SIZE + 1; ++i )
+                {
+                    for ( int j = 1; j < BLOCK_SIZE + 1; ++j)
+                    {
+                        input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1],
+                                input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty,
+                                input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty);
+                    }
+                }
+
+                // Copy results to global memory
+                for ( int i = 0; i < BLOCK_SIZE; ++i )
+                {
+#pragma omp simd
+                    for ( int j = 0; j < BLOCK_SIZE; ++j)
+                    {
+                        input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE +  j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j + 1];
+                    }
+                }
+
+            }
+        }    
+
+        printf("Processing bottom-right matrix\n");
+
+#ifdef OMP_OFFLOAD
+#pragma omp target
+#endif
+        for ( int blk = 2; blk <= (max_cols-1)/BLOCK_SIZE; blk++ )
+        {
+#ifdef OPENMP
+#pragma omp parallel for schedule(static) shared(input_itemsets, referrence) firstprivate(blk, max_rows, max_cols, penalty)
+#endif
+            for( int b_index_x = blk - 1; b_index_x < (max_cols-1)/BLOCK_SIZE; ++b_index_x)
+            {
+                int b_index_y = (max_cols-1)/BLOCK_SIZE + blk - 2 - b_index_x;
+
+                int input_itemsets_l[(BLOCK_SIZE + 1) *(BLOCK_SIZE+1)] __attribute__ ((aligned (64)));
+                int reference_l[BLOCK_SIZE * BLOCK_SIZE] __attribute__ ((aligned (64)));
+
+                // Copy referrence to local memory
+                for ( int i = 0; i < BLOCK_SIZE; ++i )
+                {
+#pragma omp simd
+                    for ( int j = 0; j < BLOCK_SIZE; ++j)
+                    {
+                        reference_l[i*BLOCK_SIZE + j] = referrence[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE +  j + 1];
+                    }
+                }
+
+                // Copy input_itemsets to local memory
+                for ( int i = 0; i < BLOCK_SIZE + 1; ++i )
+                {
+#pragma omp simd
+                    for ( int j = 0; j < BLOCK_SIZE + 1; ++j)
+                    {
+                        input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i) + b_index_x*BLOCK_SIZE +  j];
+                    }
+                }
+
+                // Compute
+                for ( int i = 1; i < BLOCK_SIZE + 1; ++i )
+                {
+                    for ( int j = 1; j < BLOCK_SIZE + 1; ++j)
+                    {
+                        input_itemsets_l[i*(BLOCK_SIZE + 1) + j] = maximum( input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j - 1] + reference_l[(i - 1)*BLOCK_SIZE + j - 1],
+                                input_itemsets_l[i*(BLOCK_SIZE + 1) + j - 1] - penalty,
+                                input_itemsets_l[(i - 1)*(BLOCK_SIZE + 1) + j] - penalty);
+                    }
+                }
+
+                // Copy results to global memory
+                for ( int i = 0; i < BLOCK_SIZE; ++i )
+                {
+#pragma omp simd
+                    for ( int j = 0; j < BLOCK_SIZE; ++j)
+                    {
+                        input_itemsets[max_cols*(b_index_y*BLOCK_SIZE + i + 1) + b_index_x*BLOCK_SIZE +  j + 1] = input_itemsets_l[(i + 1)*(BLOCK_SIZE+1) + j +1];
+                    }
+                }
+            }
+        }
+
+#ifdef OMP_OFFLOAD
+    }
+#endif
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run a simple test for CUDA
+////////////////////////////////////////////////////////////////////////////////
+    void
+runTest( int argc, char** argv) 
+{
+    int max_rows, max_cols, penalty;
+    int *input_itemsets, *output_itemsets, *referrence;
+    //int *matrix_cuda, *matrix_cuda_out, *referrence_cuda;
+    //int size;
+    int omp_num_threads;
+
+
+    // the lengths of the two sequences should be able to divided by 16.
+    // And at current stage  max_rows needs to equal max_cols
+    if (argc == 4)
+    {
+        max_rows = atoi(argv[1]);
+        max_cols = atoi(argv[1]);
+        penalty = atoi(argv[2]);
+        omp_num_threads = atoi(argv[3]);
+    }
+    else{
+        usage(argc, argv);
+    }
+
+    max_rows = max_rows + 1;
+    max_cols = max_cols + 1;
+    referrence = (int *)malloc( max_rows * max_cols * sizeof(int) );
+    input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+    output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+
+
+    if (!input_itemsets)
+        fprintf(stderr, "error: can not allocate memory");
+
+    srand ( 7 );
+
+    for (int i = 0 ; i < max_cols; i++){
+        for (int j = 0 ; j < max_rows; j++){
+            input_itemsets[i*max_cols+j] = 0;
+        }
+    }
+
+    printf("Start Needleman-Wunsch\n");
+
+    for( int i=1; i< max_rows ; i++){    //please define your own sequence. 
+        input_itemsets[i*max_cols] = rand() % 10 + 1;
+    }
+    for( int j=1; j< max_cols ; j++){    //please define your own sequence.
+        input_itemsets[j] = rand() % 10 + 1;
+    }
+
+
+    for (int i = 1 ; i < max_cols; i++){
+        for (int j = 1 ; j < max_rows; j++){
+            referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]];
+        }
+    }
+
+    for( int i = 1; i< max_rows ; i++)
+        input_itemsets[i*max_cols] = -i * penalty;
+    for( int j = 1; j< max_cols ; j++)
+        input_itemsets[j] = -j * penalty;
+
+
+
+    //Compute top-left matrix 
+    printf("Num of threads: %d\n", omp_num_threads);
+    printf("Processing top-left matrix\n");
+
+    long long start_time = get_time();
+
+    nw_optimized( input_itemsets, output_itemsets, referrence,
+            max_rows, max_cols, penalty );
+
+    long long end_time = get_time();
+
+    printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000));
+
+#define TRACEBACK
+#ifdef TRACEBACK
+
+    FILE *fpo = fopen("result.txt","w");
+    fprintf(fpo, "print traceback value GPU:\n");
+
+    for (int i = max_rows - 2,  j = max_rows - 2; i>=0, j>=0;){
+        int nw, n, w, traceback;
+        if ( i == max_rows - 2 && j == max_rows - 2 )
+            fprintf(fpo, "%d ", input_itemsets[ i * max_cols + j]); //print the first element
+        if ( i == 0 && j == 0 )
+            break;
+        if ( i > 0 && j > 0 ){
+            nw = input_itemsets[(i - 1) * max_cols + j - 1];
+            w  = input_itemsets[ i * max_cols + j - 1 ];
+            n  = input_itemsets[(i - 1) * max_cols + j];
+        }
+        else if ( i == 0 ){
+            nw = n = LIMIT;
+            w  = input_itemsets[ i * max_cols + j - 1 ];
+        }
+        else if ( j == 0 ){
+            nw = w = LIMIT;
+            n  = input_itemsets[(i - 1) * max_cols + j];
+        }
+        else{
+        }
+
+        //traceback = maximum(nw, w, n);
+        int new_nw, new_w, new_n;
+        new_nw = nw + referrence[i * max_cols + j];
+        new_w = w - penalty;
+        new_n = n - penalty;
+
+        traceback = maximum(new_nw, new_w, new_n);
+        if(traceback == new_nw)
+            traceback = nw;
+        if(traceback == new_w)
+            traceback = w;
+        if(traceback == new_n)
+            traceback = n;
+
+        fprintf(fpo, "%d ", traceback);
+
+        if(traceback == nw )
+        {i--; j--; continue;}
+
+        else if(traceback == w )
+        {j--; continue;}
+
+        else if(traceback == n )
+        {i--; continue;}
+
+        else
+            ;
+    }
+
+    fclose(fpo);
+
+#endif
+
+    free(referrence);
+    free(input_itemsets);
+    free(output_itemsets);
+
+}
+
+
+
diff --git a/NW/baselines/cpu/run b/NW/baselines/cpu/run
new file mode 100644
index 0000000..8c8088a
--- /dev/null
+++ b/NW/baselines/cpu/run
@@ -0,0 +1 @@
+./needle 2048 10 2 
diff --git a/NW/baselines/cpu/run_offload b/NW/baselines/cpu/run_offload
new file mode 100644
index 0000000..8c5989a
--- /dev/null
+++ b/NW/baselines/cpu/run_offload
@@ -0,0 +1 @@
+./needle_offload 2048 10 2
diff --git a/NW/baselines/gpu/Makefile b/NW/baselines/gpu/Makefile
new file mode 100644
index 0000000..ebf130f
--- /dev/null
+++ b/NW/baselines/gpu/Makefile
@@ -0,0 +1,28 @@
+include ./common/make.config
+
+CC := $(CUDA_DIR)/bin/nvcc
+
+INCLUDE := $(CUDA_DIR)/include
+
+SRC = needle.cu
+
+EXE = needle
+
+release: $(SRC)
+	$(CC) ${KERNEL_DIM} $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) -DTIMING 
+
+clang: $(SRC)
+	clang++ $(SRC) -o $(EXE) -I../util --cuda-gpu-arch=sm_20 \
+		-L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -DTIMING
+
+enum: $(SRC)
+	$(CC) ${KERNEL_DIM} -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) 
+
+debug: $(SRC)
+	$(CC) ${KERNEL_DIM} -g $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) 
+
+debugenum: $(SRC)
+	$(CC) ${KERNEL_DIM} -g -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) 
+
+clean: $(SRC)
+	rm -f $(EXE) $(EXE).linkinfo result.txt
diff --git a/NW/baselines/gpu/Makefile_nvidia b/NW/baselines/gpu/Makefile_nvidia
new file mode 100644
index 0000000..330e3d1
--- /dev/null
+++ b/NW/baselines/gpu/Makefile_nvidia
@@ -0,0 +1,50 @@
+################################################################################
+#
+# Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+#
+# NOTICE TO USER:   
+#
+# This source code is subject to NVIDIA ownership rights under U.S. and 
+# international Copyright laws.  
+#
+# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
+# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
+# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
+# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
+# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   
+# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
+# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
+# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
+# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 
+# OR PERFORMANCE OF THIS SOURCE CODE.  
+#
+# U.S. Government End Users.  This source code is a "commercial item" as 
+# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of 
+# "commercial computer software" and "commercial computer software 
+# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 
+# and is provided to the U.S. Government only as a commercial end item.  
+# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
+# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
+# source code with only those rights set forth herein.
+#
+################################################################################
+#
+# Build script for project
+#
+################################################################################
+
+# Add source files here
+EXECUTABLE	:= needle
+# CUDA source files (compiled with cudacc)
+CUFILES		:= needle.cu
+# CUDA dependency files
+CU_DEPS		:= needle_kernel.cu
+# C/C++ source files (compiled with gcc / c++)
+# CCFILES		:= BlackScholes_gold.cpp
+
+
+
+################################################################################
+# Rules and targets
+
+include ./common/common.mk
diff --git a/NW/baselines/gpu/README b/NW/baselines/gpu/README
new file mode 100644
index 0000000..00d3695
--- /dev/null
+++ b/NW/baselines/gpu/README
@@ -0,0 +1,23 @@
+Needleman-Wunsch (NW)
+
+Compilation instructions
+
+    make
+
+Execution instructions
+
+    ./needle 46080 10
+
+
+Note: This program generate two sequences randomly. Please specify your own sequences for different uses.
+      At the current stage, the program only supports two sequences with the same lengh, which can be divided by 16. 
+Usage: needle 32 10 
+	  32 //the length of both sequences
+	  10 //penalty value
+
+******Adjustable work group size*****
+RD_WG_SIZE_0 or RD_WG_SIZE_0_0 
+
+USAGE:
+make clean
+make KERNEL_DIM="-DRD_WG_SIZE_0=16"
diff --git a/NW/baselines/gpu/common/common.mk b/NW/baselines/gpu/common/common.mk
new file mode 100644
index 0000000..4a5d800
--- /dev/null
+++ b/NW/baselines/gpu/common/common.mk
@@ -0,0 +1,341 @@
+################################################################################
+#
+# Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+#
+# NOTICE TO USER:   
+#
+# This source code is subject to NVIDIA ownership rights under U.S. and 
+# international Copyright laws.  
+#
+# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
+# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
+# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
+# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
+# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   
+# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
+# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
+# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
+# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 
+# OR PERFORMANCE OF THIS SOURCE CODE.  
+#
+# U.S. Government End Users.  This source code is a "commercial item" as 
+# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of 
+# "commercial computer software" and "commercial computer software 
+# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 
+# and is provided to the U.S. Government only as a commercial end item.  
+# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
+# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
+# source code with only those rights set forth herein.
+#
+################################################################################
+#
+# Common build script
+#
+################################################################################
+
+.SUFFIXES : .cu .cu_dbg_o .c_dbg_o .cpp_dbg_o .cu_rel_o .c_rel_o .cpp_rel_o .cubin
+
+# Add new SM Versions here as devices with new Compute Capability are released
+SM_VERSIONS := sm_10 sm_11 sm_12 sm_13
+
+CUDA_INSTALL_PATH ?= /usr/local/cuda
+
+ifdef cuda-install
+	CUDA_INSTALL_PATH := $(cuda-install)
+endif
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
+# 'linux' is output for Linux system, 'darwin' for OS X
+DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
+
+# Basic directory setup for SDK
+# (override directories only if they are not already defined)
+SRCDIR     ?= 
+ROOTDIR    ?= ..
+ROOTBINDIR ?= $(ROOTDIR)/../bin
+BINDIR     ?= $(ROOTBINDIR)/$(OSLOWER)
+ROOTOBJDIR ?= obj
+LIBDIR     := $(ROOTDIR)/../lib
+COMMONDIR  := $(ROOTDIR)/../common
+
+# Compilers
+NVCC       := $(CUDA_INSTALL_PATH)/bin/nvcc 
+CXX        := g++
+CC         := gcc
+LINK       := g++ -fPIC
+
+# Includes
+INCLUDES  += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc
+
+# architecture flag for cubin build
+CUBIN_ARCH_FLAG := -m32
+
+# Warning flags
+CXXWARN_FLAGS := \
+	-W -Wall \
+	-Wimplicit \
+	-Wswitch \
+	-Wformat \
+	-Wchar-subscripts \
+	-Wparentheses \
+	-Wmultichar \
+	-Wtrigraphs \
+	-Wpointer-arith \
+	-Wcast-align \
+	-Wreturn-type \
+	-Wno-unused-function \
+	$(SPACE)
+
+CWARN_FLAGS := $(CXXWARN_FLAGS) \
+	-Wstrict-prototypes \
+	-Wmissing-prototypes \
+	-Wmissing-declarations \
+	-Wnested-externs \
+	-Wmain \
+
+# Compiler-specific flags
+NVCCFLAGS := 
+CXXFLAGS  := $(CXXWARN_FLAGS)
+CFLAGS    := $(CWARN_FLAGS)
+
+# Common flags
+COMMONFLAGS += $(INCLUDES) -DUNIX
+
+# Debug/release configuration
+ifeq ($(dbg),1)
+	COMMONFLAGS += -g
+	NVCCFLAGS   += -D_DEBUG
+	BINSUBDIR   := debug
+	LIBSUFFIX   := D
+else 
+	COMMONFLAGS += -O3 
+	BINSUBDIR   := release
+	LIBSUFFIX   :=
+	NVCCFLAGS   += --compiler-options -fno-strict-aliasing
+	CXXFLAGS    += -fno-strict-aliasing
+	CFLAGS      += -fno-strict-aliasing
+endif
+
+# append optional arch/SM version flags (such as -arch sm_11)
+#NVCCFLAGS += $(SMVERSIONFLAGS)
+
+# architecture flag for cubin build
+CUBIN_ARCH_FLAG := -m32
+
+# detect if 32 bit or 64 bit system
+HP_64 =	$(shell uname -m | grep 64)
+
+# OpenGL is used or not (if it is used, then it is necessary to include GLEW)
+ifeq ($(USEGLLIB),1)
+
+	ifneq ($(DARWIN),)
+		OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a
+	else
+		OPENGLLIB := -lGL -lGLU
+
+		ifeq "$(strip $(HP_64))" ""
+			OPENGLLIB += -lGLEW
+		else
+			OPENGLLIB += -lGLEW_x86_64
+		endif
+	endif
+
+	CUBIN_ARCH_FLAG := -m64
+endif
+
+ifeq ($(USEGLUT),1)
+	ifneq ($(DARWIN),)
+		OPENGLLIB += -framework GLUT
+	else
+		OPENGLLIB += -lglut
+	endif
+endif
+
+ifeq ($(USEPARAMGL),1)
+	PARAMGLLIB := -lparamgl$(LIBSUFFIX)
+endif
+
+ifeq ($(USERENDERCHECKGL),1)
+	RENDERCHECKGLLIB := -lrendercheckgl$(LIBSUFFIX)
+endif
+
+ifeq ($(USECUDPP), 1)
+	ifeq "$(strip $(HP_64))" ""
+		CUDPPLIB := -lcudpp
+	else
+		CUDPPLIB := -lcudpp64
+	endif
+
+	CUDPPLIB := $(CUDPPLIB)$(LIBSUFFIX)
+
+	ifeq ($(emu), 1)
+		CUDPPLIB := $(CUDPPLIB)_emu
+	endif
+endif
+
+# Libs
+LIB       := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER)
+ifeq ($(USEDRVAPI),1)
+   LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} 
+else
+   LIB += -lcudart ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
+endif
+
+ifeq ($(USECUFFT),1)
+  ifeq ($(emu),1)
+    LIB += -lcufftemu
+  else
+    LIB += -lcufft
+  endif
+endif
+
+ifeq ($(USECUBLAS),1)
+  ifeq ($(emu),1)
+    LIB += -lcublasemu
+  else
+    LIB += -lcublas
+  endif
+endif
+
+# Lib/exe configuration
+ifneq ($(STATIC_LIB),)
+	TARGETDIR := $(LIBDIR)
+	TARGET   := $(subst .a,$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB))
+	LINKLINE  = ar qv $(TARGET) $(OBJS) 
+else
+	# LIB += -lcutil$(LIBSUFFIX)
+	# Device emulation configuration
+	ifeq ($(emu), 1)
+		NVCCFLAGS   += -deviceemu
+		CUDACCFLAGS += 
+		BINSUBDIR   := emu$(BINSUBDIR)
+		# consistency, makes developing easier
+		CXXFLAGS		+= -D__DEVICE_EMULATION__
+		CFLAGS			+= -D__DEVICE_EMULATION__
+	endif
+	TARGETDIR := $(BINDIR)/$(BINSUBDIR)
+	TARGET    := $(TARGETDIR)/$(EXECUTABLE)
+	LINKLINE  = $(LINK) -o $(TARGET) $(OBJS) $(LIB)
+endif
+
+# check if verbose 
+ifeq ($(verbose), 1)
+	VERBOSE :=
+else
+	VERBOSE := @
+endif
+
+################################################################################
+# Check for input flags and set compiler flags appropriately
+################################################################################
+ifeq ($(fastmath), 1)
+	NVCCFLAGS += -use_fast_math
+endif
+
+ifeq ($(keep), 1)
+	NVCCFLAGS += -keep
+	NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx
+endif
+
+ifdef maxregisters
+	NVCCFLAGS += -maxrregcount $(maxregisters)
+endif
+
+# Add cudacc flags
+NVCCFLAGS += $(CUDACCFLAGS)
+
+# workaround for mac os x cuda 1.1 compiler issues
+ifneq ($(DARWIN),)
+	NVCCFLAGS += --host-compilation=C
+endif
+
+# Add common flags
+NVCCFLAGS += $(COMMONFLAGS)
+CXXFLAGS  += $(COMMONFLAGS)
+CFLAGS    += $(COMMONFLAGS)
+
+ifeq ($(nvcc_warn_verbose),1)
+	NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS)) 
+	NVCCFLAGS += --compiler-options -fno-strict-aliasing
+endif
+
+################################################################################
+# Set up object files
+################################################################################
+OBJDIR := $(ROOTOBJDIR)/$(BINSUBDIR)
+OBJS +=  $(patsubst %.cpp,$(OBJDIR)/%.cpp_o,$(notdir $(CCFILES)))
+OBJS +=  $(patsubst %.c,$(OBJDIR)/%.c_o,$(notdir $(CFILES)))
+OBJS +=  $(patsubst %.cu,$(OBJDIR)/%.cu_o,$(notdir $(CUFILES)))
+
+################################################################################
+# Set up cubin files
+################################################################################
+CUBINDIR := $(SRCDIR)data
+CUBINS +=  $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES)))
+
+################################################################################
+# Rules
+################################################################################
+$(OBJDIR)/%.c_o : $(SRCDIR)%.c $(C_DEPS)
+	$(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $<
+
+$(OBJDIR)/%.cpp_o : $(SRCDIR)%.cpp $(C_DEPS)
+	$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+$(OBJDIR)/%.cu_o : $(SRCDIR)%.cu $(CU_DEPS)
+	$(VERBOSE)$(NVCC) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<
+
+$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory
+	$(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $<
+
+#
+# The following definition is a template that gets instantiated for each SM
+# version (sm_10, sm_13, etc.) stored in SMVERSIONS.  It does 2 things:
+# 1. It adds to OBJS a .cu_sm_XX_o for each .cu file it finds in CUFILES_sm_XX.
+# 2. It generates a rule for building .cu_sm_XX_o files from the corresponding 
+#    .cu file.
+#
+# The intended use for this is to allow Makefiles that use common.mk to compile
+# files to different Compute Capability targets (aka SM arch version).  To do
+# so, in the Makefile, list files for each SM arch separately, like so:
+#
+# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu
+# CUFILES_sm_12 := anothercudakernel_sm12.cu
+#
+define SMVERSION_template
+OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1)_o,$(notdir $(CUFILES_$(1))))
+$(OBJDIR)/%.cu_$(1)_o : $(SRCDIR)%.cu $(CU_DEPS)
+	$(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) -arch $(1)
+endef
+
+# This line invokes the above template for each arch version stored in
+# SM_VERSIONS.  The call funtion invokes the template, and the eval
+# function interprets it as make commands.
+$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))
+
+$(TARGET): makedirectories $(OBJS) $(CUBINS) Makefile
+	$(VERBOSE)$(LINKLINE)
+
+cubindirectory:
+	$(VERBOSE)mkdir -p $(CUBINDIR)
+
+makedirectories:
+	$(VERBOSE)mkdir -p $(LIBDIR)
+	$(VERBOSE)mkdir -p $(OBJDIR)
+	$(VERBOSE)mkdir -p $(TARGETDIR)
+
+
+tidy :
+	$(VERBOSE)find . | egrep "#" | xargs rm -f
+	$(VERBOSE)find . | egrep "\~" | xargs rm -f
+
+clean : tidy
+	$(VERBOSE)rm -f $(OBJS)
+	$(VERBOSE)rm -f $(CUBINS)
+	$(VERBOSE)rm -f $(TARGET)
+	$(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)
+
+clobber : clean
+	$(VERBOSE)rm -rf $(ROOTOBJDIR)
diff --git a/NW/baselines/gpu/common/make.config b/NW/baselines/gpu/common/make.config
new file mode 100644
index 0000000..38c3157
--- /dev/null
+++ b/NW/baselines/gpu/common/make.config
@@ -0,0 +1,40 @@
+# CUDA toolkit installation path
+CUDA_DIR = /usr/local/cuda
+
+# CUDA toolkit libraries
+CUDA_LIB_DIR := $(CUDA_DIR)/lib
+ifeq ($(shell uname -m), x86_64)
+     ifeq ($(shell if test -d $(CUDA_DIR)/lib64; then echo T; else echo F; fi), T)
+     	CUDA_LIB_DIR := $(CUDA_DIR)/lib64
+     endif
+endif
+
+# CUDA SDK installation path
+SDK_DIR = /usr/local/cuda/samples/
+
+# OPENCL
+
+# NVIDIA_DIR
+NV_OPENCL_DIR =/usr/local/cuda
+NV_OPENCL_INC = $(NV_OPENCL_DIR)/include
+NV_OPENCL_LIB = $(NV_OPENCL_DIR)/lib64
+
+# INTEL_DIR
+INTEL_OPENCL_DIR = /opt/intel/opencl
+INTEL_OPENCL_INC = $(INTEL_OPENCL_DIR)/include
+INTEL_OPENCL_LIB = $(INTEL_OPENCL_DIR)
+
+# AMD_DIR
+# OPENCL_DIR = /usr/local/cuda
+# OPENCL_INC = $(OPENCL_DIR)/include/ 
+# OPENCL_LIB = $(OPENCL_DIR)/lib/x86_64/ -lOpenCL
+#ifeq ($(shell uname -m), x86_64)
+#     ifeq ($(shell if test -d $(OPENCL_DIR)/lib/x86_64/; then echo T; else echo F; fi), T)
+#     	OPENCL_LIB = $(OPENCL_DIR)/lib/x86_64/
+#     endif
+#endif
+
+# DEFAULT OCL
+OPENCL_DIR = $(NV_OPENCL_DIR)
+OPENCL_INC = $(NV_OPENCL_INC)
+OPENCL_LIB = $(NV_OPENCL_LIB)
diff --git a/NW/baselines/gpu/needle.cu b/NW/baselines/gpu/needle.cu
new file mode 100644
index 0000000..697a9e7
--- /dev/null
+++ b/NW/baselines/gpu/needle.cu
@@ -0,0 +1,266 @@
+#define LIMIT -999
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "needle.h"
+#include <cuda.h>
+#include <sys/time.h>
+
+// includes, kernels
+#include "needle_kernel.cu"
+
+#ifdef TIMING
+#include "timing.h"
+
+struct timeval tv;
+struct timeval tv_total_start, tv_total_end;
+struct timeval tv_h2d_start, tv_h2d_end;
+struct timeval tv_d2h_start, tv_d2h_end;
+struct timeval tv_kernel_start, tv_kernel_end;
+struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
+struct timeval tv_close_start, tv_close_end;
+float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
+      d2h_time = 0, close_time = 0, total_time = 0;
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest( int argc, char** argv);
+
+
+int blosum62[24][24] = {
+    { 4, -1, -2, -2,  0, -1, -1,  0, -2, -1, -1, -1, -1, -2, -1,  1,  0, -3, -2,  0, -2, -1,  0, -4},
+    {-1,  5,  0, -2, -3,  1,  0, -2,  0, -3, -2,  2, -1, -3, -2, -1, -1, -3, -2, -3, -1,  0, -1, -4},
+    {-2,  0,  6,  1, -3,  0,  0,  0,  1, -3, -3,  0, -2, -3, -2,  1,  0, -4, -2, -3,  3,  0, -1, -4},
+    {-2, -2,  1,  6, -3,  0,  2, -1, -1, -3, -4, -1, -3, -3, -1,  0, -1, -4, -3, -3,  4,  1, -1, -4},
+    { 0, -3, -3, -3,  9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
+    {-1,  1,  0,  0, -3,  5,  2, -2,  0, -3, -2,  1,  0, -3, -1,  0, -1, -2, -1, -2,  0,  3, -1, -4},
+    {-1,  0,  0,  2, -4,  2,  5, -2,  0, -3, -3,  1, -2, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4},
+    { 0, -2,  0, -1, -3, -2, -2,  6, -2, -4, -4, -2, -3, -3, -2,  0, -2, -2, -3, -3, -1, -2, -1, -4},
+    {-2,  0,  1, -1, -3,  0,  0, -2,  8, -3, -3, -1, -2, -1, -2, -1, -2, -2,  2, -3,  0,  0, -1, -4},
+    {-1, -3, -3, -3, -1, -3, -3, -4, -3,  4,  2, -3,  1,  0, -3, -2, -1, -3, -1,  3, -3, -3, -1, -4},
+    {-1, -2, -3, -4, -1, -2, -3, -4, -3,  2,  4, -2,  2,  0, -3, -2, -1, -2, -1,  1, -4, -3, -1, -4},
+    {-1,  2,  0, -1, -3,  1,  1, -2, -1, -3, -2,  5, -1, -3, -1,  0, -1, -3, -2, -2,  0,  1, -1, -4},
+    {-1, -1, -2, -3, -1,  0, -2, -3, -2,  1,  2, -1,  5,  0, -2, -1, -1, -1, -1,  1, -3, -1, -1, -4},
+    {-2, -3, -3, -3, -2, -3, -3, -3, -1,  0,  0, -3,  0,  6, -4, -2, -2,  1,  3, -1, -3, -3, -1, -4},
+    {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4,  7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
+    { 1, -1,  1,  0, -1,  0,  0,  0, -1, -2, -2,  0, -1, -2, -1,  4,  1, -3, -2, -2,  0,  0,  0, -4},
+    { 0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1,  1,  5, -2, -2,  0, -1, -1,  0, -4},
+    {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1,  1, -4, -3, -2, 11,  2, -3, -4, -3, -2, -4},
+    {-2, -2, -2, -3, -2, -1, -2, -3,  2, -1, -1, -2, -1,  3, -3, -2, -2,  2,  7, -1, -3, -2, -1, -4},
+    { 0, -3, -3, -3, -1, -2, -2, -3, -3,  3,  1, -2,  1, -1, -2, -2,  0, -3, -1,  4, -3, -2, -1, -4},
+    {-2, -1,  3,  4, -3,  0,  1, -1,  0, -3, -4,  0, -3, -3, -2,  0, -1, -4, -3, -3,  4,  1, -1, -4},
+    {-1,  0,  0,  1, -3,  3,  4, -2,  0, -3, -3,  1, -1, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4},
+    { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2,  0,  0, -2, -1, -1, -1, -1, -1, -4},
+    {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,  1}
+};
+
+double gettime() {
+    struct timeval t;
+    gettimeofday(&t,NULL);
+    return t.tv_sec+t.tv_usec*1e-6;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+    int
+main( int argc, char** argv) 
+{
+
+    printf("WG size of kernel = %d \n", BLOCK_SIZE);
+
+    runTest( argc, argv);
+
+    return EXIT_SUCCESS;
+}
+
+void usage(int argc, char **argv)
+{
+    fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
+    fprintf(stderr, "\t<dimension>  - x and y dimensions\n");
+    fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
+    exit(1);
+}
+
+void runTest( int argc, char** argv) 
+{
+    int max_rows, max_cols, penalty;
+    int *input_itemsets, *output_itemsets, *referrence;
+    int *matrix_cuda,  *referrence_cuda;
+    int size;
+
+
+    // the lengths of the two sequences should be able to divided by 16.
+    // And at current stage  max_rows needs to equal max_cols
+    if (argc == 3)
+    {
+        max_rows = atoi(argv[1]);
+        max_cols = atoi(argv[1]);
+        penalty = atoi(argv[2]);
+    }
+    else{
+        usage(argc, argv);
+    }
+
+    if(atoi(argv[1])%16!=0){
+        fprintf(stderr,"The dimension values must be a multiple of 16\n");
+        exit(1);
+    }
+
+
+    max_rows = max_rows + 1;
+    max_cols = max_cols + 1;
+    referrence = (int *)malloc( max_rows * max_cols * sizeof(int) );
+    input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+    output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
+
+
+    if (!input_itemsets)
+        fprintf(stderr, "error: can not allocate memory");
+
+    srand ( 7 );
+
+
+    for (int i = 0 ; i < max_cols; i++){
+        for (int j = 0 ; j < max_rows; j++){
+            input_itemsets[i*max_cols+j] = 0;
+        }
+    }
+
+    printf("Start Needleman-Wunsch\n");
+
+    for( int i=1; i< max_rows ; i++){    //please define your own sequence. 
+        input_itemsets[i*max_cols] = rand() % 10 + 1;
+    }
+    for( int j=1; j< max_cols ; j++){    //please define your own sequence.
+        input_itemsets[j] = rand() % 10 + 1;
+    }
+
+
+    for (int i = 1 ; i < max_cols; i++){
+        for (int j = 1 ; j < max_rows; j++){
+            referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]];
+        }
+    }
+
+    for( int i = 1; i< max_rows ; i++)
+        input_itemsets[i*max_cols] = -i * penalty;
+    for( int j = 1; j< max_cols ; j++)
+        input_itemsets[j] = -j * penalty;
+
+
+    size = max_cols * max_rows;
+    cudaMalloc((void**)& referrence_cuda, sizeof(int)*size);
+    cudaMalloc((void**)& matrix_cuda, sizeof(int)*size);
+
+    cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, cudaMemcpyHostToDevice);
+    cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, cudaMemcpyHostToDevice);
+
+    dim3 dimGrid;
+    dim3 dimBlock(BLOCK_SIZE, 1);
+    int block_width = ( max_cols - 1 )/BLOCK_SIZE;
+
+#ifdef  TIMING
+    gettimeofday(&tv_kernel_start, NULL);
+#endif
+
+    printf("Processing top-left matrix\n");
+    //process top-left matrix
+    for( int i = 1 ; i <= block_width ; i++){
+        dimGrid.x = i;
+        dimGrid.y = 1;
+        needle_cuda_shared_1<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
+                ,max_cols, penalty, i, block_width); 
+    }
+    printf("Processing bottom-right matrix\n");
+    //process bottom-right matrix
+    for( int i = block_width - 1  ; i >= 1 ; i--){
+        dimGrid.x = i;
+        dimGrid.y = 1;
+        needle_cuda_shared_2<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
+                ,max_cols, penalty, i, block_width); 
+    }
+
+#ifdef  TIMING
+    gettimeofday(&tv_kernel_end, NULL);
+    tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
+    kernel_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+    cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost);
+
+    //#define TRACEBACK
+#ifdef TRACEBACK
+
+    FILE *fpo = fopen("result.txt","w");
+    fprintf(fpo, "print traceback value GPU:\n");
+
+    for (int i = max_rows - 2,  j = max_rows - 2; i>=0, j>=0;){
+        int nw, n, w, traceback;
+        if ( i == max_rows - 2 && j == max_rows - 2 )
+            fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element
+        if ( i == 0 && j == 0 )
+            break;
+        if ( i > 0 && j > 0 ){
+            nw = output_itemsets[(i - 1) * max_cols + j - 1];
+            w  = output_itemsets[ i * max_cols + j - 1 ];
+            n  = output_itemsets[(i - 1) * max_cols + j];
+        }
+        else if ( i == 0 ){
+            nw = n = LIMIT;
+            w  = output_itemsets[ i * max_cols + j - 1 ];
+        }
+        else if ( j == 0 ){
+            nw = w = LIMIT;
+            n  = output_itemsets[(i - 1) * max_cols + j];
+        }
+        else{
+        }
+
+        //traceback = maximum(nw, w, n);
+        int new_nw, new_w, new_n;
+        new_nw = nw + referrence[i * max_cols + j];
+        new_w = w - penalty;
+        new_n = n - penalty;
+
+        traceback = maximum(new_nw, new_w, new_n);
+        if(traceback == new_nw)
+            traceback = nw;
+        if(traceback == new_w)
+            traceback = w;
+        if(traceback == new_n)
+            traceback = n;
+
+        fprintf(fpo, "%d ", traceback);
+
+        if(traceback == nw )
+        {i--; j--; continue;}
+
+        else if(traceback == w )
+        {j--; continue;}
+
+        else if(traceback == n )
+        {i--; continue;}
+
+        else
+            ;
+    }
+
+    fclose(fpo);
+
+#endif
+
+    cudaFree(referrence_cuda);
+    cudaFree(matrix_cuda);
+
+    free(referrence);
+    free(input_itemsets);
+    free(output_itemsets);
+
+#ifdef  TIMING
+    printf("Exec: %f\n", kernel_time);
+#endif
+}
+
diff --git a/NW/baselines/gpu/needle.h b/NW/baselines/gpu/needle.h
new file mode 100644
index 0000000..6ffef52
--- /dev/null
+++ b/NW/baselines/gpu/needle.h
@@ -0,0 +1,11 @@
+#ifdef RD_WG_SIZE_0_0
+	#define BLOCK_SIZE RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+	#define BLOCK_SIZE RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+	#define BLOCK_SIZE RD_WG_SIZE
+#else
+	#define BLOCK_SIZE 16
+#endif
+//#define TRACE
+
diff --git a/NW/baselines/gpu/needle_kernel.cu b/NW/baselines/gpu/needle_kernel.cu
new file mode 100644
index 0000000..72f8501
--- /dev/null
+++ b/NW/baselines/gpu/needle_kernel.cu
@@ -0,0 +1,188 @@
+#include "needle.h"
+#include <stdio.h>
+
+
+#define SDATA( index)      CUT_BANK_CHECKER(sdata, index)
+
+__device__ __host__ int 
+maximum( int a,
+        int b,
+        int c){
+
+    int k;
+    if( a <= b )
+        k = b;
+    else 
+        k = a;
+
+    if( k <=c )
+        return(c);
+    else
+        return(k);
+
+}
+
+__global__ void
+needle_cuda_shared_1(  int* referrence,
+        int* matrix_cuda, 
+        int cols,
+        int penalty,
+        int i,
+        int block_width) 
+{
+    int bx = blockIdx.x;
+    int tx = threadIdx.x;
+
+    int b_index_x = bx;
+    int b_index_y = i - 1 - bx;
+
+    int index   = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 );
+    int index_n   = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 );
+    int index_w   = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols );
+    int index_nw =  cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;
+
+    __shared__  int temp[BLOCK_SIZE+1][BLOCK_SIZE+1];
+    __shared__  int ref[BLOCK_SIZE][BLOCK_SIZE];
+
+    if (tx == 0)
+        temp[tx][0] = matrix_cuda[index_nw];
+
+
+    for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
+        ref[ty][tx] = referrence[index + cols * ty];
+
+    __syncthreads();
+
+    temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
+
+    __syncthreads();
+
+    temp[0][tx + 1] = matrix_cuda[index_n];
+
+    __syncthreads();
+
+
+    for( int m = 0 ; m < BLOCK_SIZE ; m++){
+
+        if ( tx <= m ){
+
+            int t_index_x =  tx + 1;
+            int t_index_y =  m - tx + 1;
+
+            temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
+                    temp[t_index_y][t_index_x-1]  - penalty, 
+                    temp[t_index_y-1][t_index_x]  - penalty);
+
+
+
+        }
+
+        __syncthreads();
+
+    }
+
+    for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){
+
+        if ( tx <= m){
+
+            int t_index_x =  tx + BLOCK_SIZE - m ;
+            int t_index_y =  BLOCK_SIZE - tx;
+
+            temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
+                    temp[t_index_y][t_index_x-1]  - penalty, 
+                    temp[t_index_y-1][t_index_x]  - penalty);
+
+        }
+
+        __syncthreads();
+    }
+
+    for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
+        matrix_cuda[index + ty * cols] = temp[ty+1][tx+1];
+
+}
+
+
+__global__ void
+needle_cuda_shared_2(  int* referrence,
+        int* matrix_cuda, 
+
+        int cols,
+        int penalty,
+        int i,
+        int block_width) 
+{
+
+    int bx = blockIdx.x;
+    int tx = threadIdx.x;
+
+    int b_index_x = bx + block_width - i  ;
+    int b_index_y = block_width - bx -1;
+
+    int index   = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 );
+    int index_n   = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 );
+    int index_w   = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols );
+    int index_nw =  cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;
+
+    __shared__  int temp[BLOCK_SIZE+1][BLOCK_SIZE+1];
+    __shared__  int ref[BLOCK_SIZE][BLOCK_SIZE];
+
+    for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
+        ref[ty][tx] = referrence[index + cols * ty];
+
+    __syncthreads();
+
+    if (tx == 0)
+        temp[tx][0] = matrix_cuda[index_nw];
+
+
+    temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
+
+    __syncthreads();
+
+    temp[0][tx + 1] = matrix_cuda[index_n];
+
+    __syncthreads();
+
+
+    for( int m = 0 ; m < BLOCK_SIZE ; m++){
+
+        if ( tx <= m ){
+
+            int t_index_x =  tx + 1;
+            int t_index_y =  m - tx + 1;
+
+            temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
+                    temp[t_index_y][t_index_x-1]  - penalty, 
+                    temp[t_index_y-1][t_index_x]  - penalty);	  
+
+        }
+
+        __syncthreads();
+
+    }
+
+
+    for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){
+
+        if ( tx <= m){
+
+            int t_index_x =  tx + BLOCK_SIZE - m ;
+            int t_index_y =  BLOCK_SIZE - tx;
+
+            temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
+                    temp[t_index_y][t_index_x-1]  - penalty, 
+                    temp[t_index_y-1][t_index_x]  - penalty);
+
+
+        }
+
+        __syncthreads();
+    }
+
+
+    for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
+        matrix_cuda[index + ty * cols] = temp[ty+1][tx+1];
+
+}
+
diff --git a/NW/baselines/gpu/run b/NW/baselines/gpu/run
new file mode 100644
index 0000000..11e9559
--- /dev/null
+++ b/NW/baselines/gpu/run
@@ -0,0 +1 @@
+./needle 2048 10
diff --git a/NW/baselines/gpu/timing.h b/NW/baselines/gpu/timing.h
new file mode 100644
index 0000000..6ef8813
--- /dev/null
+++ b/NW/baselines/gpu/timing.h
@@ -0,0 +1,22 @@
+#ifndef __TIMING_H__
+#define __TIMING_H__
+
+#include <sys/time.h>
+
+void time_measure_start(struct timeval *tv);
+void time_measure_end(struct timeval *tv);
+
+/* tvsub: ret = x - y. */
+static inline void tvsub(struct timeval *x,
+						 struct timeval *y,
+						 struct timeval *ret)
+{
+	ret->tv_sec = x->tv_sec - y->tv_sec;
+	ret->tv_usec = x->tv_usec - y->tv_usec;
+	if (ret->tv_usec < 0) {
+		ret->tv_sec--;
+		ret->tv_usec += 1000000;
+	}
+}
+
+#endif
+\ No newline at end of file
diff --git a/NW/dpu/task.c b/NW/dpu/task.c
new file mode 100644
index 0000000..c022f70
--- /dev/null
+++ b/NW/dpu/task.c
@@ -0,0 +1,185 @@
+/**
+* Needleman-Wunsch with multiple tasklets
+*
+*/
+#include <stdint.h>
+#include <stdio.h>
+#include <defs.h>
+#include <mram.h>
+#include <alloc.h>
+#include <perfcounter.h>
+#include <barrier.h>
+
+#include "../support/common.h"
+
+__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
+
+// Barrier
+BARRIER_INIT(my_barrier, NR_TASKLETS);
+
+// main
+int main() {
+    unsigned int tasklet_id = me();
+    if (tasklet_id == 0){ // Initialize once the cycle counter
+        mem_reset(); // Reset the heap
+    }
+    // Barrier
+    barrier_wait(&my_barrier);
+    uint32_t nblocks = DPU_INPUT_ARGUMENTS.nblocks;
+    uint32_t active_blocks = DPU_INPUT_ARGUMENTS.active_blocks;
+    uint32_t penalty = DPU_INPUT_ARGUMENTS.penalty;
+#if PRINT
+    printf("tasklet_id = %d, nblocks = %d \n", tasklet_id, nblocks);
+#endif
+	
+    uint32_t mram_base_addr_input_itemsets = (uint32_t) (DPU_MRAM_HEAP_POINTER);
+    uint32_t mram_base_addr_ref = (uint32_t) (DPU_MRAM_HEAP_POINTER + nblocks * (BL+1) * (BL+2) * sizeof(int32_t));
+    if (nblocks != active_blocks)
+        mram_base_addr_ref = (uint32_t) (DPU_MRAM_HEAP_POINTER + active_blocks * (BL+1) * (BL+2) * sizeof(int32_t));
+
+    int32_t *cache_input = mem_alloc((BL_IN+1) * (BL_IN+2) * sizeof(int32_t));
+    int32_t *cache_ref = mem_alloc(BL_IN * BL_IN * sizeof(int32_t));
+    uint32_t REP = BL/BL_IN;
+    uint32_t chunks;
+    uint32_t mod;
+    uint32_t start;
+    uint32_t addr_input;
+    uint32_t addr_ref;
+    uint32_t cache_input_offset;
+
+    for (uint32_t bl = 0; bl < nblocks; bl++) {
+
+        // Top-left computation
+        for(uint32_t blk = 0; blk <= REP; blk++) {
+            
+            // Partition chunks/subblocks of the diagonal to tasklets 
+            chunks = blk / NR_TASKLETS; 
+            mod = blk % NR_TASKLETS;
+            if (tasklet_id < mod)
+                chunks++;
+            if (mod > 0) {
+                if(tasklet_id < mod)
+                    start = tasklet_id * chunks;
+                else
+                    start = mod * (chunks + 1) + (tasklet_id - mod) * chunks;
+            } else
+                start = tasklet_id * chunks;
+            
+            // Compute all assigned chunks  
+            for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) {
+                int t_index_x = start + bl_indx;
+                int t_index_y = blk - 1 - t_index_x; 
+                
+                // Move input from MRAM to WRAM
+                addr_input =  mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+                cache_input_offset = (BL_IN+2);
+                mram_read((__mram_ptr void const *) addr_input, (void *) cache_input, (BL_IN+2) * sizeof(int32_t)); 
+                addr_input += ((BL+2) * sizeof(int32_t));
+                for (int i = 1; i < BL_IN + 1; i++) {
+                    mram_read((__mram_ptr void const *) addr_input, (void *) (cache_input + cache_input_offset), (2) * sizeof(int32_t)); 
+                    cache_input_offset += (BL_IN+2); 
+                    addr_input += ((BL+2) * sizeof(int32_t));
+                }
+
+                addr_ref = mram_base_addr_ref + (t_index_x * BL * BL_IN * sizeof(int32_t)) +  (t_index_y * BL_IN * sizeof(int32_t));
+                cache_input_offset = 0;
+                for (int i = 0; i < BL_IN; i++) {
+                    mram_read((__mram_ptr void const *) addr_ref, (void *) (cache_ref + cache_input_offset), (BL_IN) * sizeof(int32_t)); 
+                    cache_input_offset += BL_IN; 
+                    addr_ref += (BL * sizeof(int32_t));
+                }
+
+                // Computation
+                for (uint32_t i = 1; i < BL_IN + 1; i++) {
+                    for (uint32_t j = 1; j < BL_IN + 1; j++) {
+                        cache_input[i*(BL_IN+2) + j] = maximum(cache_input[(i-1)*(BL_IN+2) + j - 1] + cache_ref[(i-1)*BL_IN + j-1],
+                                                cache_input[i*(BL_IN+2) + j - 1] - penalty,
+                                                cache_input[(i-1)*(BL_IN+2) + j] - penalty);
+                    }
+                }
+
+                // Move output from WRAM to MRAM
+                addr_input =  mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+                cache_input_offset = (BL_IN+2);
+                addr_input += ((BL+2) * sizeof(int32_t));
+                for (int i = 1; i < BL_IN + 1; i++) {
+                    mram_write((cache_input + cache_input_offset), (__mram_ptr void *)  addr_input, (BL_IN+2) * sizeof(int32_t)); 
+                    cache_input_offset += (BL_IN+2); 
+                    addr_input += ((BL+2) * sizeof(int32_t));
+                }
+
+            }
+            
+            barrier_wait(&my_barrier);
+        }
+       
+        // Bottom-right computation
+        for(uint32_t blk = 2; blk <= REP; blk++) {
+            // Partition chunks/subblocks of the diagonal to tasklets 
+            chunks = (REP - blk + 1) / NR_TASKLETS; 
+            mod = (REP - blk + 1) % NR_TASKLETS;
+            if (tasklet_id < mod)
+                chunks++;
+            if (mod > 0){
+                if(tasklet_id < mod)
+                    start = tasklet_id * chunks;
+                else
+                    start = mod * (chunks + 1) + (tasklet_id - mod) * chunks;
+            } else
+                start = tasklet_id * chunks;
+
+            // Compute all assigned chunks  
+            for (uint32_t bl_indx = 0; bl_indx < chunks; bl_indx++) {
+                int t_index_x = blk - 1 + start + bl_indx;
+                int t_index_y = REP + blk - 2 - t_index_x; 
+
+                // Move input from MRAM to WRAM
+                addr_input =  mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+                cache_input_offset = (BL_IN+2);
+                mram_read((__mram_ptr void const *) addr_input, (void *) cache_input, (BL_IN+2) * sizeof(int32_t)); 
+                addr_input += ((BL+2) * sizeof(int32_t));
+                for (int i = 1; i < BL_IN + 1; i++) {
+                    mram_read((__mram_ptr void const *) addr_input, (void *) (cache_input + cache_input_offset), (2) * sizeof(int32_t)); 
+                    cache_input_offset += (BL_IN+2); 
+                    addr_input += ((BL+2) * sizeof(int32_t));
+                }
+
+                addr_ref = mram_base_addr_ref + (t_index_x * BL * BL_IN * sizeof(int32_t)) +  (t_index_y * BL_IN * sizeof(int32_t));
+                cache_input_offset = 0;
+                for (int i = 0; i < BL_IN; i++) {
+                    mram_read((__mram_ptr void const *) addr_ref, (void *) (cache_ref + cache_input_offset), (BL_IN) * sizeof(int32_t)); 
+                    cache_input_offset += BL_IN; 
+                    addr_ref += (BL * sizeof(int32_t));
+                }
+
+
+                // Computation
+                for (int i = 1; i < BL_IN + 1; i++) {
+                    for (int j = 1; j < BL_IN + 1; j++) {
+                        cache_input[i*(BL_IN+2) + j] = maximum(cache_input[(i-1)*(BL_IN+2) + j - 1] + cache_ref[(i-1)*BL_IN + j-1],
+                                                cache_input[i*(BL_IN+2) + j - 1] - penalty,
+                                                cache_input[(i-1)*(BL_IN+2) + j] - penalty);
+                    }
+                }
+
+                // Move output from WRAM to MRAM
+                addr_input =  mram_base_addr_input_itemsets + (t_index_x * (BL+2) * BL_IN * sizeof(int32_t)) + (t_index_y * BL_IN * sizeof(int32_t));
+                cache_input_offset = (BL_IN+2);
+                addr_input += ((BL+2) * sizeof(int32_t));
+                for (int i = 1; i < BL_IN + 1; i++) {
+                    mram_write(cache_input + cache_input_offset, (__mram_ptr void *)  addr_input, (BL_IN+2) * sizeof(int32_t)); 
+                    cache_input_offset += (BL_IN+2); 
+                    addr_input += ((BL+2) * sizeof(int32_t));
+                }
+
+            }
+            
+            barrier_wait(&my_barrier);
+
+        }
+		
+        mram_base_addr_input_itemsets += ((BL+1) * (BL+2) * sizeof(int32_t));
+        mram_base_addr_ref += (BL * BL * sizeof(int32_t)); 
+    }
+    return 0;
+}
diff --git a/NW/host/app.c b/NW/host/app.c
new file mode 100644
index 0000000..0e899ec
--- /dev/null
+++ b/NW/host/app.c
@@ -0,0 +1,879 @@
+/**
+* app.c
+* NW Host Application Source File 
+*
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <dpu.h>
+#include <dpu_log.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include "../support/common.h"
+#include "../support/timer.h"
+#include "../support/params.h"
+
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+// Define the DPU Binary path as DPU_BINARY here
+#ifndef DPU_BINARY
+#define DPU_BINARY "./bin/nw_dpu"
+#endif
+
+// Traceback in the host
+#if PRINT_FILE
+static void traceback(int* traceback_output, char *file, int32_t *input_itemsets, int32_t *reference, unsigned int max_rows, unsigned int max_cols, unsigned int penalty) {
+    FILE *fpo = fopen(file, "w"); // Use to print to an output file
+#else
+static void traceback(int* traceback_output, int32_t *input_itemsets, int32_t *reference, unsigned int max_rows, unsigned int max_cols, unsigned int penalty) {
+#endif
+
+    int k = 0;
+    for (int i = max_rows - 2,  j = max_rows - 2; i>=0 && j>=0;) {
+        int nw = 0, n = 0, w = 0, traceback = 0;
+#if PRINT_FILE
+        if ( i == (int)max_rows - 2 && j == (int)max_rows - 2 )
+            fprintf(fpo, "%d ", input_itemsets[ i * max_cols + j]); //print the first element
+#endif
+
+        if (i == 0 && j == 0)
+            break;
+        if (i > 0 && j > 0) {
+            nw = input_itemsets[(i - 1) * max_cols + j - 1];
+            w  = input_itemsets[i * max_cols + j - 1];
+            n  = input_itemsets[(i - 1) * max_cols + j];
+        } else if (i == 0) {
+            nw = n = LIMIT;
+            w  = input_itemsets[ i * max_cols + j - 1 ];
+        } else if (j == 0) {
+            nw = w = LIMIT;
+            n  = input_itemsets[(i - 1) * max_cols + j];
+        } else {
+            ;
+        }
+
+        int new_nw, new_w, new_n;
+        new_nw = nw + reference[i * max_cols + j];
+        new_w = w - penalty;
+        new_n = n - penalty;
+
+        traceback = maximum(new_nw, new_w, new_n);
+        if (traceback == new_nw)
+            traceback = nw;
+        if (traceback == new_w)
+            traceback = w;
+        if (traceback == new_n)
+            traceback = n;
+
+#if PRINT_FILE
+	fprintf(fpo, "%d ", traceback);
+#endif 
+ 	traceback_output[k++] = traceback;
+
+	if (traceback == nw) { 
+            i--; 
+            j--; 
+            continue;
+        } else if (traceback == w) {
+            j--; 
+            continue;
+        } else if (traceback == n) {
+            i--; 
+            continue;
+        } else {
+            ;
+        }
+    }
+
+    return;
+}
+
+// Compute output in the host
+static void nw_host(int32_t *input_itemsets, int32_t *reference, uint64_t max_cols, unsigned int penalty) {
+
+    int32_t *input_itemsets_l = (int32_t *) malloc((BL + 1) * (BL + 1) * sizeof(int32_t));
+    int32_t *reference_l = (int32_t *) malloc((BL * BL) * sizeof(int32_t));
+
+
+    // top-left
+    for (uint64_t blk = 1; blk <= (max_cols-1)/BL; blk++) {
+        for (uint64_t b_index_x = 0; b_index_x < blk; b_index_x++) {
+            uint64_t b_index_y = blk - 1 - b_index_x;
+
+            for (uint64_t i = 0; i < BL; i++){
+                for (uint64_t j = 0; j < BL; j++) {
+                    reference_l[i*BL + j] = reference[(max_cols-1) * (b_index_y*BL + i) + b_index_x*BL + j];
+                }
+            }
+
+            for (uint64_t i = 0; i < BL + 1; i++){
+                for (uint64_t j = 0; j < BL + 1; j++) {
+                    input_itemsets_l[i*(BL + 1) + j] = input_itemsets[max_cols*(b_index_y*BL + i) + b_index_x*BL + j];
+                }
+            }
+
+            // Computation
+            for (uint64_t i = 1; i < BL + 1; i++) {
+                for (uint64_t j = 1; j < BL + 1; j++) {
+                    input_itemsets_l[i*(BL + 1) + j] = maximum(input_itemsets_l[(i-1)*(BL+1) + j - 1] + reference_l[(i-1)*BL + j - 1],
+                            input_itemsets_l[i*(BL+1) + j - 1] - penalty,
+                            input_itemsets_l[(i-1)*(BL+1) + j] - penalty);
+                }
+            }
+
+            for (uint64_t i = 0; i < BL; i++) {
+                for (uint64_t j = 0; j < BL; j++) {
+                    input_itemsets[max_cols*(b_index_y*BL + i + 1) + b_index_x*BL + j + 1] = input_itemsets_l[(i+1)*(BL+1) + j + 1];
+                }
+            }
+
+        }
+
+    }
+
+    // bottom-right 
+    for (uint64_t blk = 2; blk <= (max_cols-1)/BL; blk++) {
+        for (uint64_t b_index_x = blk - 1; b_index_x < (max_cols-1)/BL; b_index_x++) {
+            uint64_t b_index_y = (max_cols-1)/BL + blk - 2 - b_index_x;
+
+            for (uint64_t i = 0; i < BL; i++){
+                for (uint64_t j = 0; j < BL; j++) {
+                    reference_l[i*BL + j] = reference[(max_cols-1)*(b_index_y*BL + i) + b_index_x*BL + j];
+                }
+            }
+
+            for (uint64_t i = 0; i < BL + 1; i++){
+                for (uint64_t j = 0; j < BL + 1; j++) {
+                    input_itemsets_l[i*(BL + 1) + j] = input_itemsets[max_cols*(b_index_y*BL + i) + b_index_x*BL + j];
+                }
+            }
+
+            // Computation
+            for (uint64_t i = 1; i < BL + 1; i++) {
+                for (uint64_t j = 1; j < BL + 1; j++) {
+                    input_itemsets_l[i*(BL + 1) + j] = maximum(input_itemsets_l[(i-1)*(BL+1) + j - 1] + reference_l[(i-1)*BL + j - 1],
+                            input_itemsets_l[i*(BL+1) + j - 1] - penalty,
+                            input_itemsets_l[(i-1)*(BL+1) + j] - penalty);
+                }
+            }
+
+            for (uint64_t i = 0; i < BL; i++) {
+                for (uint64_t j = 0; j < BL; j++) {
+                    input_itemsets[max_cols*(b_index_y*BL + i + 1) + b_index_x*BL + j + 1] = input_itemsets_l[(i+1)*(BL+1) + j + 1];
+                }
+            }
+
+        }
+
+    }
+
+
+    free(input_itemsets_l);
+    free(reference_l);
+    return;
+}
+
+// Main of the Host Application
+int main(int argc, char **argv) {
+
+    struct Params p = input_params(argc, argv);
+    struct dpu_set_t dpu_set, dpu;
+    uint32_t nr_of_dpus, max_dpus;
+
+#if ENERGY
+    struct dpu_probe_t probe;
+    DPU_ASSERT(dpu_probe_init("energy probe", &probe));
+#endif
+
+    // Allocate DPUs and load binary
+    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+    printf("Allocated %d DPU(s)\n", nr_of_dpus);
+    printf("Allocated %d TASKLET(s) per DPU\n", NR_TASKLETS);
+#if DYNAMIC
+    max_dpus = nr_of_dpus;
+#endif
+
+    uint64_t max_rows = p.max_rows + 1;
+    uint64_t max_cols = p.max_rows + 1;
+    unsigned int penalty = p.penalty;
+    int32_t *reference = (int32_t *) malloc(max_rows * max_cols * sizeof(int32_t));
+    int32_t *input_itemsets_host = (int32_t *) malloc(max_rows * max_cols * sizeof(int32_t));
+    int32_t *input_itemsets = (int32_t *) malloc((max_rows+1) * (max_cols+1) * sizeof(int32_t));
+    dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t));
+    printf("Max size %d\n", p.max_rows);
+
+    // Traceback output
+    int32_t* traceback_output = (int32_t *) malloc((max_rows + max_cols) * sizeof(int32_t));
+    int32_t* traceback_output_host = (int32_t *) malloc((max_rows + max_cols) * sizeof(int32_t));
+    memset(traceback_output, 0, (max_rows + max_cols) * sizeof(int32_t));
+    memset(traceback_output_host, 0, (max_rows + max_cols) * sizeof(int32_t));
+
+    // This array is used for dummy/stale CPU-DPU transfers
+    int32_t *dummy = (int32_t *) malloc(nr_of_dpus * (BL+2) * sizeof(int32_t));
+    unsigned int blocks_per_dpu;
+    unsigned int mram_offset = 0;
+
+    // Timer
+    Timer timer; 
+    Timer long_diagonal_timer; 
+#if ENERGY
+    double tacc_energy, tacc_time, tavg_time;
+    double tavg_energy=0;
+#endif
+
+    for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+
+        // Initializing inputs are needed at each iteration
+        // Initialize input itemsets
+        for(unsigned int i = 0; i < max_rows; i++) {
+            for (unsigned int j = 0; j < max_cols; j++) {
+                input_itemsets_host[i * max_cols + j] = 0; 
+            }
+        }
+
+        for(unsigned int i = 0; i <= max_rows; i++) {
+            for (unsigned int j = 0; j <= max_cols; j++) {
+                input_itemsets[i * (max_cols+1) + j] = 0; 
+            }
+        }
+
+        // Define random sequences
+        srand(7);
+        for (unsigned int i = 1; i < max_rows; i++) {
+            input_itemsets_host[i * max_cols] = rand() % 10 + 1;
+        }
+
+        for (unsigned int j = 1; j < max_cols; j++) {
+            input_itemsets_host[j] = rand() % 10 + 1;
+        }   
+
+        for (unsigned int i = 0; i < max_rows-1; i++) {
+            for (unsigned int j = 0; j < max_cols-1; j++) {
+                reference[i * (max_cols-1) + j] = blosum62[input_itemsets[(i+1) * max_cols]][input_itemsets[j+1]];
+            }
+        }
+
+        for (unsigned int i = 1; i < max_rows; i++) {
+            input_itemsets_host[i * max_cols] = -i * penalty;
+            input_itemsets[i * (max_cols+1)] = -i * penalty;
+        }
+
+        for (unsigned int j = 1; j < max_cols; j++) {
+            input_itemsets_host[j] = -j * penalty;
+            input_itemsets[j] = -j * penalty;
+        }
+
+        if (rep >= p.n_warmup)
+            start(&timer, 0, rep - p.n_warmup);
+        // Computation on host CPU
+        nw_host(input_itemsets_host, reference, max_cols, penalty);
+
+        // Print host output
+#if PRINT_FILE
+        if (rep >= p.n_warmup) {
+            char *host_file = "./bin/host_output.txt";
+            traceback(traceback_output_host, host_file, input_itemsets_host, reference, max_rows, max_cols, penalty);
+        }
+#endif
+        if (rep >= p.n_warmup)
+            stop(&timer, 0);
+
+        // Top-left computation on DPUs
+        for (unsigned int blk = 1; blk <= (max_cols-1)/BL; blk++) {
+#if DYNAMIC 
+            // If nr_of_blocks are lower than max_dpus,
+            // set nr_of_dpus to be equal with nr_of_blocks
+            unsigned nr_of_blocks = blk;
+            if (nr_of_blocks < max_dpus) {
+                DPU_ASSERT(dpu_free(dpu_set));
+                DPU_ASSERT(dpu_alloc(nr_of_blocks, NULL, &dpu_set));
+                DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+                DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+            } else if (nr_of_dpus == max_dpus) {
+                ;
+            } else {
+                DPU_ASSERT(dpu_free(dpu_set));
+                DPU_ASSERT(dpu_alloc(max_dpus, NULL, &dpu_set));
+                DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+                DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+            }
+#if PRINT
+            printf("Allocated %d DPU(s) for %d (%d) blocks\n", nr_of_dpus, nr_of_blocks, blk);
+#endif
+#endif
+
+            // Copy data to DPUs
+            unsigned int i=0;
+            DPU_FOREACH(dpu_set, dpu, i) {
+                unsigned int blocks_per_dpu = blk / nr_of_dpus;
+                unsigned int active_blocks_per_dpu = blk / nr_of_dpus;
+                unsigned int rest_blocks = blk % nr_of_dpus;
+                if(i < rest_blocks)
+                    blocks_per_dpu++;
+
+                if(rest_blocks != 0)
+                    active_blocks_per_dpu++;
+
+                // Copy input arguments to dpu
+                input_args[i].nblocks = blocks_per_dpu;
+                input_args[i].active_blocks = active_blocks_per_dpu;
+                input_args[i].penalty = penalty;
+                DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
+            } 
+            DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
+
+            // Copy itemsets to DPUs
+            blocks_per_dpu = blk / nr_of_dpus;
+            if (blk % nr_of_dpus != 0)
+                blocks_per_dpu++;
+            mram_offset = 0;
+
+
+            if (rep >= p.n_warmup) {
+                if ((max_cols-1)/BL == 1) 
+                    start(&timer, 2, rep - p.n_warmup + blk - 1);
+                else 
+                    start(&timer, 1, rep - p.n_warmup + blk - 1);
+                
+                // Timer for longest diagonal
+                if (blk == ((max_cols-1)/BL)) {
+                    if ((max_cols-1)/BL == 1) 
+                        start(&long_diagonal_timer, 2, rep - p.n_warmup);
+                    else 
+                        start(&long_diagonal_timer, 1, rep - p.n_warmup);
+                }
+            }
+
+#if PRINT
+            uint64_t total_dpu_memory = 0;
+            total_dpu_memory = (uint64_t) blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t) + (uint64_t) blocks_per_dpu * BL * BL * sizeof(int32_t);
+            printf("Total memory allocated in each DPU %u bytes\n", total_dpu_memory);
+#endif
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL + 1; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        unsigned int chunks = blk / nr_of_dpus;
+                        unsigned int prev_block_index = 0;
+                        unsigned int rest_blocks = blk % nr_of_dpus;
+                        if (rest_blocks > 0) {
+                            if (i >= rest_blocks) {
+                                prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks;
+                            } else {
+                                prev_block_index = i * (chunks + 1);
+                            }
+                        } else {
+                            prev_block_index = i * blocks_per_dpu; 
+                        }
+
+                        uint64_t input_itemsets_offset = 0;  
+                        int32_t *dpu_pointer;  
+                        if (i + bl_indx * nr_of_dpus >= blk) {
+                            dpu_pointer = dummy;
+                            input_itemsets_offset = 0;  
+                        } else {
+                            uint64_t b_index_x =  prev_block_index + bl_indx;
+                            uint64_t b_index_y = blk - 1 - b_index_x;
+                            dpu_pointer = input_itemsets;
+                            input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1);  
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset));
+                    }
+
+                    if (bl == 0) 
+                        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    else
+                        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, 2 * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += ((BL+2) * sizeof(int32_t));
+
+                }
+            }
+            if (rep >= p.n_warmup) {
+                if ((max_cols-1)/BL == 1) 
+                    stop(&timer, 2);
+                else
+                    stop(&timer, 1);
+                // Timer for longest diagonal
+                if (blk == ((max_cols-1)/BL)) {
+                    if ((max_cols-1)/BL == 1) 
+                        stop(&long_diagonal_timer, 2);
+                    else 
+                        stop(&long_diagonal_timer, 1);
+                }
+            }
+
+
+            if (rep >= p.n_warmup) {
+                start(&timer, 2, rep - p.n_warmup + blk - 1);
+                // Timer for longest diagonal
+                if (blk == ((max_cols-1)/BL)) {
+                    start(&long_diagonal_timer, 2, rep - p.n_warmup);
+                }
+            }
+            // Copy reference to DPUs
+            mram_offset = blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t); 
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        unsigned int chunks = blk / nr_of_dpus;
+                        unsigned int prev_block_index = 0;
+                        unsigned int rest_blocks = blk % nr_of_dpus;
+                        if (rest_blocks > 0) {
+                            if (i >= rest_blocks) {
+                                prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks;
+                            } else {
+                                prev_block_index = i * (chunks + 1);
+                            }
+                        } else {
+                            prev_block_index = i * blocks_per_dpu; 
+                        }
+
+                        uint64_t reference_offset = 0;  
+                        int32_t *dpu_pointer;  
+                        if (i + bl_indx * nr_of_dpus >= blk) {
+                            dpu_pointer = dummy;
+                            reference_offset = 0;  
+                        } else {
+                            uint64_t b_index_x =  prev_block_index + bl_indx;
+                            uint64_t b_index_y = blk - 1 - b_index_x;
+                            dpu_pointer = reference;
+                            reference_offset = b_index_y * (max_cols - 1) * BL + b_index_x * BL + bl * (max_cols - 1);  
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + reference_offset));
+                    }
+                    DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, BL * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += BL * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup) {
+                stop(&timer, 2);
+                if (blk == ((max_cols-1)/BL)) {
+                    stop(&long_diagonal_timer, 2);
+                }
+            }
+
+#if ENERGY
+            if (rep >= p.n_warmup) {
+                DPU_ASSERT(dpu_probe_start(&probe));
+            }
+#endif
+            if (rep >= p.n_warmup) {
+                start(&timer, 3, rep - p.n_warmup + blk - 1);
+                // Timer for longest diagonal
+                if (blk == ((max_cols-1)/BL)) {
+                    start(&long_diagonal_timer, 3, rep - p.n_warmup);
+                }
+            }
+            // Launch kernel on DPUs
+            DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+            if (rep >= p.n_warmup) {
+                stop(&timer, 3);
+                // Timer for longest diagonal
+                if (blk == ((max_cols-1)/BL)) {
+                    stop(&long_diagonal_timer, 3);
+                }
+            }
+#if ENERGY
+            if (rep >= p.n_warmup) {
+                DPU_ASSERT(dpu_probe_stop(&probe));
+            }
+#endif
+
+#if ENERGY
+    	    double acc_energy, avg_energy, acc_time, avg_time;
+	        DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+    	    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
+    	    DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
+    	    DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
+    	    tavg_energy += avg_energy;
+#endif
+
+#if PRINT
+            // Display DPU Logs
+            DPU_FOREACH(dpu_set, dpu) {
+                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
+            }
+#endif
+
+            if (rep >= p.n_warmup) {
+                start(&timer, 4, rep - p.n_warmup + blk - 1);
+                // Timer for longest diagonal
+                if (blk == ((max_cols-1)/BL)) {
+                    start(&long_diagonal_timer, 4, rep - p.n_warmup);
+                }
+            }
+            // Retrieve results
+            // Copy output result to Host CPU
+            mram_offset = 0;
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL + 1; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        unsigned int chunks = blk / nr_of_dpus;
+                        unsigned int prev_block_index = 0;
+                        unsigned int rest_blocks = blk % nr_of_dpus;
+                        if (rest_blocks > 0) {
+                            if (i >= rest_blocks) {
+                                prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks;
+                            } else {
+                                prev_block_index = i * (chunks + 1);
+                            }
+                        } else {
+                            prev_block_index = i * blocks_per_dpu; 
+                        }
+
+                        uint64_t input_itemsets_offset = 0;  
+                        int32_t *dpu_pointer;  
+                        if (i + bl_indx * nr_of_dpus >= blk) {
+                            dpu_pointer = dummy;
+                            input_itemsets_offset = 0;  
+                        } else {
+                            uint64_t b_index_x =  prev_block_index + bl_indx;
+                            uint64_t b_index_y = blk - 1 - b_index_x;
+                            dpu_pointer = input_itemsets;
+                            input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1);  
+                        }
+
+                        if (bl == 0) // Skip the first row of the block
+                            continue;
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset));
+
+                    }
+                    if (bl == 0) {
+                        mram_offset += (BL+2) * sizeof(int32_t);
+                        continue;
+                    }
+                    DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += (BL+2) * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup) {
+                stop(&timer, 4);
+                // Timer for longest diagonal
+                if (blk == ((max_cols-1)/BL)) {
+                    stop(&long_diagonal_timer, 4);
+                }
+            }
+        }
+
+
+        // Bottom-right computation on DPUs
+        for (unsigned int blk = 2; blk <= (max_cols-1)/BL; blk++) {
+#if DYNAMIC
+            // If nr_of_blocks are lower than max_dpus,
+            // set nr_of_dpus to be equal with nr_of_blocks
+            unsigned nr_of_blocks = (((max_cols-1)/BL) - blk + 1);
+            if (nr_of_blocks < max_dpus) {
+                DPU_ASSERT(dpu_free(dpu_set));
+                DPU_ASSERT(dpu_alloc(nr_of_blocks, NULL, &dpu_set));
+                DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+                DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+            } else if (nr_of_dpus == max_dpus) {
+                ;
+            } else {
+                DPU_ASSERT(dpu_free(dpu_set));
+                DPU_ASSERT(dpu_alloc(max_dpus, NULL, &dpu_set));
+                DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+                DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+            }
+#if PRINT
+            printf("Allocated %d DPU(s) for %d (%d) blocks\n", nr_of_dpus, nr_of_blocks, (((max_cols-1)/BL) - blk + 1));
+#endif
+#endif
+
+            // Copy data to DPUs
+            unsigned int i=0;
+            DPU_FOREACH(dpu_set, dpu, i) {
+                unsigned int blocks_per_dpu = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus;
+                unsigned int active_blocks_per_dpu = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus;
+                unsigned int rest_blocks = (((max_cols-1)/BL) - blk + 1) % nr_of_dpus;
+                if(i < rest_blocks)
+                    blocks_per_dpu++;
+
+                if(rest_blocks != 0)
+                    active_blocks_per_dpu++;
+
+                // Copy input arguments to dpu
+                input_args[i].nblocks = blocks_per_dpu;
+                input_args[i].active_blocks = active_blocks_per_dpu;
+                input_args[i].penalty = penalty;
+                DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
+            } 
+            DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
+
+            if (rep >= p.n_warmup)
+                start(&timer, 1, rep - p.n_warmup + blk - 1);
+            // Copy itemsets to DPUs
+            unsigned int blocks_per_dpu = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus;
+            if ((((max_cols-1)/BL) - blk + 1) % nr_of_dpus != 0)
+                blocks_per_dpu++;
+#if PRINT
+            uint64_t total_dpu_memory = 0;
+            total_dpu_memory = (uint64_t) blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t) + (uint64_t) blocks_per_dpu * BL * BL * sizeof(int32_t);
+            printf("Total memory allocated in each DPU %u bytes\n", total_dpu_memory);
+#endif
+            unsigned int mram_offset = 0;
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL + 1; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        unsigned int chunks = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus;
+                        unsigned int prev_block_index = 0;
+                        unsigned int rest_blocks = (((max_cols-1)/BL) - blk + 1) % nr_of_dpus;
+                        if (rest_blocks > 0) {
+                            if (i >= rest_blocks) {
+                                prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks;
+                            } else {
+                                prev_block_index = i * (chunks + 1);
+                            }
+                        } else {
+                            prev_block_index = i * blocks_per_dpu; 
+                        }
+
+                        uint64_t input_itemsets_offset = 0;  
+                        int32_t *dpu_pointer;  
+                        if (i + bl_indx * nr_of_dpus >= (((max_cols-1)/BL) - blk + 1)) {
+                            dpu_pointer = dummy;
+                            input_itemsets_offset = 0;  
+                        } else {
+                            uint64_t b_index_x = blk - 1 + prev_block_index + bl_indx;
+                            uint64_t b_index_y = (max_cols-1)/BL + blk - 2 - b_index_x;
+                            dpu_pointer = input_itemsets;
+                            input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1);  
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset));
+                    }
+
+                    if (bl == 0) 
+                        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    else
+                        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, 2 * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += (BL+2) * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup)
+                stop(&timer, 1);
+
+
+            if (rep >= p.n_warmup)
+                start(&timer, 2, rep - p.n_warmup + blk - 1);
+            // Copy reference to DPUs
+            mram_offset = blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t); 
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        unsigned int chunks = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus;
+                        unsigned int prev_block_index = 0;
+                        unsigned int rest_blocks = (((max_cols-1)/BL) - blk + 1) % nr_of_dpus;
+                        if (rest_blocks > 0) {
+                            if (i >= rest_blocks) {
+                                prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks;
+                            } else {
+                                prev_block_index = i * (chunks + 1);
+                            }
+                        } else {
+                            prev_block_index = i * blocks_per_dpu; 
+                        }
+
+                        uint64_t reference_offset = 0;  
+                        int32_t *dpu_pointer;  
+                        if (i + bl_indx * nr_of_dpus >= (((max_cols-1)/BL) - blk + 1)) {
+                            dpu_pointer = dummy;
+                            reference_offset = 0;  
+                        } else {
+                            uint64_t b_index_x = blk - 1 + prev_block_index + bl_indx;
+                            uint64_t b_index_y = (max_cols-1)/BL + blk - 2 - b_index_x;
+                            dpu_pointer = reference;
+                            reference_offset = b_index_y * (max_cols - 1) * BL + b_index_x * BL + bl * (max_cols - 1);  
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + reference_offset));
+                    }
+
+                    DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, BL * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += BL * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup)
+                stop(&timer, 2);
+
+#if ENERGY
+            if (rep >= p.n_warmup) {
+                DPU_ASSERT(dpu_probe_start(&probe));
+            }
+#endif
+            if (rep >= p.n_warmup)
+                start(&timer, 3, rep - p.n_warmup + blk - 1); // Do not re-initialize the counter
+            // Launch kernel on DPUs
+            DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+            if (rep >= p.n_warmup)
+                stop(&timer, 3);
+#if ENERGY
+            if (rep >= p.n_warmup) {
+                DPU_ASSERT(dpu_probe_stop(&probe));
+            }
+#endif
+
+#if ENERGY
+    	    double acc_energy, avg_energy, acc_time, avg_time;
+    	    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+    	    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
+    	    DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
+    	    DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
+    	    tavg_energy += avg_energy;
+#endif
+
+#if PRINT
+            // Display DPU Logs
+            DPU_FOREACH(dpu_set, dpu) {
+                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
+            }
+#endif
+
+
+            if (rep >= p.n_warmup)
+                start(&timer, 4, rep - p.n_warmup + blk - 1);
+            // Retrieve results
+            // Copy output result to Host CPU
+            mram_offset = 0;
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL + 1; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        unsigned int chunks = (((max_cols-1)/BL) - blk + 1) / nr_of_dpus;
+                        unsigned int prev_block_index = 0;
+                        unsigned int rest_blocks = (((max_cols-1)/BL) - blk + 1) % nr_of_dpus;
+                        if (rest_blocks > 0) {
+                            if (i >= rest_blocks) {
+                                prev_block_index = rest_blocks * (chunks + 1) + (i - rest_blocks) * chunks;
+                            } else {
+                                prev_block_index = i * (chunks + 1);
+                            }
+                        } else {
+                            prev_block_index = i * blocks_per_dpu; 
+                        }
+
+                        uint64_t input_itemsets_offset = 0;  
+                        int32_t *dpu_pointer;  
+                        if (i + bl_indx * nr_of_dpus >= (((max_cols-1)/BL) - blk + 1)) {
+                            dpu_pointer = dummy;
+                            input_itemsets_offset = 0;  
+                        } else {
+                            uint64_t b_index_x = blk - 1 + prev_block_index + bl_indx;
+                            uint64_t b_index_y = (max_cols-1)/BL + blk - 2 - b_index_x;
+                            dpu_pointer = input_itemsets;
+                            input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1);  
+                        }
+
+                        if (bl == 0) // Skip the first row of the block
+                            continue;
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset));
+
+                    }
+
+                    if (bl == 0) {
+                        mram_offset += (BL+2) * sizeof(int32_t);
+                        continue;
+                    }
+                    DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += (BL+2) * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup)
+                stop(&timer, 4);
+
+
+        }
+
+        // Traceback step
+        if (rep >= p.n_warmup)
+            start(&timer, 1, 1);
+#if PRINT_FILE
+        char *dpu_file = "./bin/dpu_output.txt";
+        traceback(traceback_output, dpu_file, input_itemsets, reference, max_rows+1, max_cols+1, penalty);
+#else
+        traceback(traceback_output, input_itemsets, reference, max_rows+1, max_cols+1, penalty);
+#endif
+        if (rep >= p.n_warmup)
+            stop(&timer, 1);
+
+    }
+
+    // Print timing results
+    printf("CPU version ");
+    print(&timer, 0, p.n_reps);
+    printf("CPU-DPU ");
+    print(&timer, 2, p.n_reps);
+    printf("DPU Kernel ");
+    print(&timer, 3, p.n_reps);
+    printf("Inter-DPU ");
+    print(&timer, 1, p.n_reps);
+    printf("DPU-CPU ");
+    print(&timer, 4, p.n_reps);
+    printf("\n");
+    printf("Longest Diagonal CPU-DPU ");
+    print(&long_diagonal_timer, 2, p.n_reps);
+    printf("Longest Diagonal DPU Kernel ");
+    print(&long_diagonal_timer, 3, p.n_reps);
+    printf("Longest Diagonal Inter-DPU ");
+    print(&long_diagonal_timer, 1, p.n_reps);
+    printf("Longest Diagonal DPU-CPU ");
+    print(&long_diagonal_timer, 4, p.n_reps);
+    printf("\n");
+    
+#if ENERGY
+    printf("DPU Energy (J): %f \t ", tavg_energy / p.n_reps);
+#endif
+
+    // Check output
+    bool status = true;
+    for (uint64_t i = 1; i < max_rows; i++) {
+        for (uint64_t j = 1; j < max_cols; j++) {
+            if (input_itemsets_host[i*max_cols + j] != input_itemsets[i*(max_cols+1) + j]) {
+                status = false;
+#if PRINT
+                printf("%ld (%ld, %ld): %d %d\n", i*max_cols + j, i, j, input_itemsets_host[i*max_cols + j], input_itemsets[i*(max_cols+1) + j]); 
+#endif
+            } 
+        }
+    }
+    
+    if (status) {
+        printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+    } else {
+        printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
+    }
+
+    free(input_itemsets_host);
+    free(input_itemsets);
+    free(reference);
+    free(traceback_output);
+    free(traceback_output_host);
+    DPU_ASSERT(dpu_free(dpu_set));
+    return status ? 0 : -1;
+    return 0;
+}
diff --git a/NW/support/common.h b/NW/support/common.h
new file mode 100755
index 0000000..69069e7
--- /dev/null
+++ b/NW/support/common.h
@@ -0,0 +1,76 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+// Structures used by both the host and the dpu to communicate information 
+typedef struct {
+    uint32_t nblocks;
+    uint32_t active_blocks;
+    uint32_t penalty;
+    uint32_t dummy;
+} dpu_arguments_t;
+
+#ifndef BL
+#define BL 16 
+#endif
+
+// Data type
+#define T int32_t
+
+// MAX
+int32_t maximum(int32_t a, int32_t b, int32_t c) {
+
+    int32_t k;
+    if (a <= b)
+        k = b;
+    else
+        k = a;
+
+    if (k <= c)
+        return c;
+    else
+        return k;
+        
+}
+
+#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+
+#define ANSI_COLOR_RED     "\x1b[31m"
+#define ANSI_COLOR_GREEN   "\x1b[32m"
+#define ANSI_COLOR_RESET   "\x1b[0m"
+
+#define LIMIT -999
+
+int blosum62[24][24] = {
+    { 4, -1, -2, -2,  0, -1, -1,  0, -2, -1, -1, -1, -1, -2, -1,  1,  0, -3, -2,  0, -2, -1,  0, -4},
+    {-1,  5,  0, -2, -3,  1,  0, -2,  0, -3, -2,  2, -1, -3, -2, -1, -1, -3, -2, -3, -1,  0, -1, -4},
+    {-2,  0,  6,  1, -3,  0,  0,  0,  1, -3, -3,  0, -2, -3, -2,  1,  0, -4, -2, -3,  3,  0, -1, -4},
+    {-2, -2,  1,  6, -3,  0,  2, -1, -1, -3, -4, -1, -3, -3, -1,  0, -1, -4, -3, -3,  4,  1, -1, -4},
+    { 0, -3, -3, -3,  9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
+    {-1,  1,  0,  0, -3,  5,  2, -2,  0, -3, -2,  1,  0, -3, -1,  0, -1, -2, -1, -2,  0,  3, -1, -4},
+    {-1,  0,  0,  2, -4,  2,  5, -2,  0, -3, -3,  1, -2, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4},
+    { 0, -2,  0, -1, -3, -2, -2,  6, -2, -4, -4, -2, -3, -3, -2,  0, -2, -2, -3, -3, -1, -2, -1, -4},
+    {-2,  0,  1, -1, -3,  0,  0, -2,  8, -3, -3, -1, -2, -1, -2, -1, -2, -2,  2, -3,  0,  0, -1, -4},
+    {-1, -3, -3, -3, -1, -3, -3, -4, -3,  4,  2, -3,  1,  0, -3, -2, -1, -3, -1,  3, -3, -3, -1, -4},
+    {-1, -2, -3, -4, -1, -2, -3, -4, -3,  2,  4, -2,  2,  0, -3, -2, -1, -2, -1,  1, -4, -3, -1, -4},
+    {-1,  2,  0, -1, -3,  1,  1, -2, -1, -3, -2,  5, -1, -3, -1,  0, -1, -3, -2, -2,  0,  1, -1, -4},
+    {-1, -1, -2, -3, -1,  0, -2, -3, -2,  1,  2, -1,  5,  0, -2, -1, -1, -1, -1,  1, -3, -1, -1, -4},
+    {-2, -3, -3, -3, -2, -3, -3, -3, -1,  0,  0, -3,  0,  6, -4, -2, -2,  1,  3, -1, -3, -3, -1, -4},
+    {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4,  7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
+    { 1, -1,  1,  0, -1,  0,  0,  0, -1, -2, -2,  0, -1, -2, -1,  4,  1, -3, -2, -2,  0,  0,  0, -4},
+    { 0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1,  1,  5, -2, -2,  0, -1, -1,  0, -4},
+    {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1,  1, -4, -3, -2, 11,  2, -3, -4, -3, -2, -4},
+    {-2, -2, -2, -3, -2, -1, -2, -3,  2, -1, -1, -2, -1,  3, -3, -2, -2,  2,  7, -1, -3, -2, -1, -4},
+    { 0, -3, -3, -3, -1, -2, -2, -3, -3,  3,  1, -2,  1, -1, -2, -2,  0, -3, -1,  4, -3, -2, -1, -4},
+    {-2, -1,  3,  4, -3,  0,  1, -1,  0, -3, -4,  0, -3, -3, -2,  0, -1, -4, -3, -3,  4,  1, -1, -4},
+    {-1,  0,  0,  1, -3,  3,  4, -2,  0, -3, -3,  1, -1, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4},
+    { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2,  0,  0, -2, -1, -1, -1, -1, -1, -4},
+    {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,  1}
+};
+
+#define DYNAMIC 1
+#define PRINT 0
+#define PRINT_FILE 0
+#ifndef ENERGY
+#define ENERGY 0
+#endif
+#endif
diff --git a/NW/support/params.h b/NW/support/params.h
new file mode 100644
index 0000000..8874248
--- /dev/null
+++ b/NW/support/params.h
@@ -0,0 +1,56 @@
+#ifndef _PARAMS_H_
+#define _PARAMS_H_
+
+#include "common.h"
+
+typedef struct Params {
+    unsigned int   max_rows;
+    unsigned int   penalty;
+    unsigned int   n_warmup;
+    unsigned int   n_reps;
+} Params;
+
+static void usage() {
+    fprintf(stderr,
+            "\nUsage:  ./program [options]"
+            "\n"
+            "\nGeneral options:"
+            "\n    -h        help"
+            "\n    -w <W>    # of untimed warmup iterations (default=1)"
+            "\n    -e <E>    # of timed repetition iterations (default=3)"
+            "\n"
+            "\nBenchmark-specific options:"
+            "\n    -n <N>    size of sequence: length of the sequence"
+            "\n    -p <P>    penalty: a positive integer"
+            "\n");
+}
+
+struct Params input_params(int argc, char **argv) {
+    struct Params p;
+    p.n_warmup      = 1;
+    p.n_reps        = 3;
+    p.max_rows      = 256;
+    p.penalty       = 1;
+
+    int opt;
+    while((opt = getopt(argc, argv, "hw:e:n:p:")) >= 0) {
+        switch(opt) {
+            case 'h':
+                usage();
+                exit(0);
+                break;
+            case 'w': p.n_warmup      = atoi(optarg); break;
+            case 'e': p.n_reps        = atoi(optarg); break;
+            case 'n': p.max_rows      = atoi(optarg); break;
+            case 'p': p.penalty       = atoi(optarg); break;
+            default:
+                      fprintf(stderr, "\nUnrecognized option!\n");
+                      usage();
+                      exit(0);
+        }
+    }
+    assert(NR_DPUS > 0 && "Invalid # of dpus!");
+
+    return p;
+}
+#endif
diff --git a/NW/support/timer.h b/NW/support/timer.h
new file mode 100755
index 0000000..efaefcd
--- /dev/null
+++ b/NW/support/timer.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by:    IMPACT Research Group
+ *                  University of Cordoba and University of Illinois
+ *                  http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the 
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ *      > Redistributions of source code must retain the above copyright notice,
+ *        this list of conditions and the following disclaimers.
+ *      > Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimers in the
+ *        documentation and/or other materials provided with the distribution.
+ *      > Neither the names of IMPACT Research Group, University of Cordoba, 
+ *        University of Illinois nor the names of its contributors may be used 
+ *        to endorse or promote products derived from this Software without 
+ *        specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer{
+
+    struct timeval startTime[5];
+    struct timeval stopTime[5];
+    double         time[5];
+
+}Timer;
+
+void start(Timer *timer, int i, int rep) {
+    if(rep == 0) {
+        timer->time[i] = 0.0;
+    }
+    gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i) {
+    gettimeofday(&timer->stopTime[i], NULL);
+    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
+                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); 
+}
+
+void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
author	Juan Gomez Luna <juan.gomez@safari.ethz.ch>	2021-06-16 19:46:05 +0200
committer	Juan Gomez Luna <juan.gomez@safari.ethz.ch>	2021-06-16 19:46:05 +0200
commit	3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
tree	fc6776a94549d2d4039898f183dbbeb2ce013ba9 /NW
parent	ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)