Diffstat (limited to 'SEL/baselines/gpu')
-rw-r--r-- | SEL/baselines/gpu/Makefile  |   5 |
-rw-r--r-- | SEL/baselines/gpu/README    |  21 |
-rw-r--r-- | SEL/baselines/gpu/ds.h      | 293 |
-rw-r--r-- | SEL/baselines/gpu/kernel.cu | 113 |
-rw-r--r-- | SEL/baselines/gpu/select.cu | 242 |
5 files changed, 674 insertions, 0 deletions
diff --git a/SEL/baselines/gpu/Makefile b/SEL/baselines/gpu/Makefile
new file mode 100644
index 0000000..22edd15
--- /dev/null
+++ b/SEL/baselines/gpu/Makefile
@@ -0,0 +1,5 @@
+all:
+	/usr/local/cuda/bin/nvcc select.cu -I/usr/local/cuda/include -lm -o select -D COARSENING=32 -D THREADS=512 -D INT64
+
+clean:
+	rm select
diff --git a/SEL/baselines/gpu/README b/SEL/baselines/gpu/README
new file mode 100644
index 0000000..e0e9a3e
--- /dev/null
+++ b/SEL/baselines/gpu/README
@@ -0,0 +1,21 @@
+Select (SEL)
+
+Compilation instructions
+
+	make
+
+Execution instructions
+
+	./select 0 50 1258291200
+
+Compilation flags
+
+	FLOAT      - Single-precision arrays (Default: double precision)
+	INT, INT64 - 32-bit or 64-bit integer arrays (Note: the sample predicate requires an integer type)
+	THREADS    - Thread block size (Default: 1024)
+	COARSENING - Coarsening factor (Default: 16 for FLOAT and INT; 8 for double precision)
+	ATOMIC     - Use global atomics for synchronization (Default: no atomics)
+
+Read more
+J. Gomez-Luna et al., "In-place Data Sliding Algorithms for Many-core Architectures," ICPP 2015.
+
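The Makefile above pins a 64-bit integer build. The same nvcc line can be adapted to the other type flags; assuming the same CUDA installation path, a double-precision build with the documented defaults would drop -D INT64 and look like

	/usr/local/cuda/bin/nvcc select.cu -I/usr/local/cuda/include -lm -o select -D COARSENING=8 -D THREADS=1024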
diff --git a/SEL/baselines/gpu/ds.h b/SEL/baselines/gpu/ds.h
new file mode 100644
index 0000000..1fa73de
--- /dev/null
+++ b/SEL/baselines/gpu/ds.h
@@ -0,0 +1,293 @@
+/***************************************************************************
+ *cr
+ *cr         (C) Copyright 2015 The Board of Trustees of the
+ *cr                      University of Illinois
+ *cr                       All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+  In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP'15
+
+  Copyright (c) 2015 University of Illinois at Urbana-Champaign.
+  All rights reserved.
+
+  Permission to use, copy, modify and distribute this software and its documentation for
+  educational purpose is hereby granted without fee, provided that the above copyright
+  notice and this permission notice appear in all copies of this software and that you do
+  not sell the software.
+
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, EXPRESS, IMPLIED OR
+  OTHERWISE.
+
+  Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+#include <iostream>
+#include <fstream>
+#include <cstdlib>
+#include <ctime>
+#include <cstdio>
+#include <math.h>
+#include <sys/time.h>
+#include <vector>
+
+#ifdef FLOAT
+#define T float
+#elif INT
+#define T int
+#elif INT64
+#define T int64_t
+#else
+#define T double
+#endif
+
+#ifdef THREADS
+#define L_DIM THREADS
+#else
+#define L_DIM 1024
+#endif
+
+#ifdef COARSENING
+#define REGS COARSENING
+#else
+#ifdef FLOAT
+#define REGS 16
+#elif INT
+#define REGS 16
+#else
+#define REGS 8
+#endif
+#endif
+
+#ifdef ATOMIC
+#define ATOM 1
+#else
+#define ATOM 0
+#endif
+
+#define WARP_SIZE 32
+
+#define PRINT 0
+
+// Dynamic allocation of runtime workgroup id
+__device__ int dynamic_wg_id(volatile unsigned int *flags, const int num_flags){
+  __shared__ int gid_;
+  if (threadIdx.x == 0) gid_ = atomicAdd((unsigned int*)&flags[num_flags + 1], 1);
+  __syncthreads();
+  int my_s = gid_;
+  return my_s;
+}
+
+// Set global synchronization (regular DS)
+__device__ void ds_sync(volatile unsigned int *flags, const int my_s){
+#if ATOM
+  if (threadIdx.x == 0){
+    while (atomicOr((unsigned int*)&flags[my_s], 0) == 0){}
+    atomicOr((unsigned int*)&flags[my_s + 1], 1);
+  }
+#else
+  if (threadIdx.x == 0){
+    while (flags[my_s] == 0){}
+    flags[my_s + 1] = 1;
+  }
+#endif
+  __syncthreads();
+}
+
+// Set global synchronization (irregular DS)
+__device__ void ds_sync_irregular(volatile unsigned int *flags, const int my_s, int *count){
+#if ATOM
+  if (threadIdx.x == 0){
+    while (atomicOr((unsigned int*)&flags[my_s], 0) == 0){}
+    int flag = flags[my_s];
+    atomicAdd((unsigned int*)&flags[my_s + 1], flag + *count);
+    *count = flag - 1;
+  }
+#else
+  if (threadIdx.x == 0){
+    while (flags[my_s] == 0){}
+    int flag = flags[my_s];
+    flags[my_s + 1] = flag + *count;
+    *count = flag - 1;
+  }
+#endif
+  __syncthreads();
+}
+
+// Set global synchronization (irregular DS Partition)
+__device__ void ds_sync_irregular_partition(volatile unsigned int *flags1, volatile unsigned int *flags2, const int my_s, int *count1, int *count2){
+#if ATOM
+  if (threadIdx.x == 0){
+    while (atomicOr((unsigned int*)&flags1[my_s], 0) == 0){}
+    int flag2 = flags2[my_s];
+    atomicAdd((unsigned int*)&flags2[my_s + 1], flag2 + *count2);
+    int flag1 = flags1[my_s];
+    atomicAdd((unsigned int*)&flags1[my_s + 1], flag1 + *count1);
+    *count1 = flag1 - 1;
+    *count2 = flag2 - 1;
+  }
+#else
+  if (threadIdx.x == 0){
+    while (flags1[my_s] == 0){}
+    int flag2 = flags2[my_s];
+    flags2[my_s + 1] = flag2 + *count2;
+    int flag1 = flags1[my_s];
+    flags1[my_s + 1] = flag1 + *count1;
+    *count1 = flag1 - 1;
+    *count2 = flag2 - 1;
+  }
+#endif
+  __syncthreads();
+}
+
+// Reduction kernel (CUDA SDK reduce6)
+template <class S>
+__device__ void reduction(S *count, S local_cnt){
+  __shared__ S sdata[L_DIM];
+
+  unsigned int tid = threadIdx.x;
+  S mySum = local_cnt;
+
+  // Each thread puts its local sum into shared memory
+  sdata[tid] = local_cnt;
+  __syncthreads();
+
+  // Do reduction in shared memory
+  if ((blockDim.x >= 1024) && (tid < 512)){
+    sdata[tid] = mySum = mySum + sdata[tid + 512];
+  }
+  __syncthreads();
+
+  if ((blockDim.x >= 512) && (tid < 256)){
+    sdata[tid] = mySum = mySum + sdata[tid + 256];
+  }
+  __syncthreads();
+
+  if ((blockDim.x >= 256) && (tid < 128)){
+    sdata[tid] = mySum = mySum + sdata[tid + 128];
+  }
+  __syncthreads();
+
+  if ((blockDim.x >= 128) && (tid < 64)){
+    sdata[tid] = mySum = mySum + sdata[tid + 64];
+  }
+  __syncthreads();
+
+#if (__CUDA_ARCH__ >= 300)
+  if (tid < 32){
+    // Fetch final intermediate sum from 2nd warp
+    if (blockDim.x >= 64) mySum += sdata[tid + 32];
+    // Reduce final warp using shuffle
+    #pragma unroll
+    for (int offset = WARP_SIZE/2; offset > 0; offset /= 2){
+      //mySum += __shfl_down(mySum, offset);
+      mySum += __shfl_xor(mySum, offset);
+    }
+  }
+#else
+  // Fully unroll the reduction within a single warp
+  if ((blockDim.x >= 64) && (tid < 32)){
+    sdata[tid] = mySum = mySum + sdata[tid + 32];
+  }
+  __syncthreads();
+
+  if ((blockDim.x >= 32) && (tid < 16)){
+    sdata[tid] = mySum = mySum + sdata[tid + 16];
+  }
+  __syncthreads();
+
+  if ((blockDim.x >= 16) && (tid < 8)){
+    sdata[tid] = mySum = mySum + sdata[tid + 8];
+  }
+  __syncthreads();
+
+  if ((blockDim.x >= 8) && (tid < 4)){
+    sdata[tid] = mySum = mySum + sdata[tid + 4];
+  }
+  __syncthreads();
+
+  if ((blockDim.x >= 4) && (tid < 2)){
+    sdata[tid] = mySum = mySum + sdata[tid + 2];
+  }
+  __syncthreads();
+
+  if ((blockDim.x >= 2) && (tid < 1)){
+    sdata[tid] = mySum = mySum + sdata[tid + 1];
+  }
+  __syncthreads();
+#endif
+
+  // Write result for this block to global memory
+  if (tid == 0) *count = mySum;
+}
+
+// Binary prefix-sum (GPU Computing Gems)
+__device__ inline int lane_id(void) { return threadIdx.x % WARP_SIZE; }
+__device__ inline int warp_id(void) { return threadIdx.x / WARP_SIZE; }
+
+// Count how many lower lanes hold a true predicate: e.g., if p is true on
+// lanes 0, 2 and 3, lane 3 computes __popc(0b1101 & 0b0111) = 2.
+__device__ unsigned int warp_prefix_sums(bool p){
+  unsigned int b = __ballot(p);
+  return __popc(b & ((1 << lane_id()) - 1));
+}
+
+__device__ int warp_scan(int val, volatile int *s_data){
+#if (__CUDA_ARCH__ < 300)
+  int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
+  s_data[idx] = 0;
+  idx += WARP_SIZE;
+  int t = s_data[idx] = val;
+  s_data[idx] = t = t + s_data[idx - 1];
+  s_data[idx] = t = t + s_data[idx - 2];
+  s_data[idx] = t = t + s_data[idx - 4];
+  s_data[idx] = t = t + s_data[idx - 8];
+  s_data[idx] = t = t + s_data[idx - 16];
+  return s_data[idx - 1];
+#else
+  int x = val;
+  #pragma unroll
+  for (int offset = 1; offset < 32; offset <<= 1){
+    // From GTC: Kepler shuffle tips and tricks:
+#if 0
+    int y = __shfl_up(x, offset);
+    if (lane_id() >= offset)
+      x += y;
+#else
+    asm volatile("{"
+                 "  .reg .s32 r0;"
+                 "  .reg .pred p;"
+                 "  shfl.up.b32 r0|p, %0, %1, 0x0;"
+                 "  @p add.s32 r0, r0, %0;"
+                 "  mov.s32 %0, r0;"
+                 "}" : "+r"(x) : "r"(offset));
+#endif
+  }
+  return x - val;
+#endif
+}
+
+__device__ int block_binary_prefix_sums(int *count, int x){
+
+  __shared__ int sdata[L_DIM];
+
+  // A. Exclusive scan within each warp
+  int warpPrefix = warp_prefix_sums(x);
+
+  // B. Store in shared memory
+  if (lane_id() == WARP_SIZE - 1)
+    sdata[warp_id()] = warpPrefix + x;
+  __syncthreads();
+
+  // C. One warp scans in shared memory
+  if (threadIdx.x < WARP_SIZE)
+    sdata[threadIdx.x] = warp_scan(sdata[threadIdx.x], sdata);
+  __syncthreads();
+
+  // D. Each thread calculates its final value
+  int thread_out_element = warpPrefix + sdata[warp_id()];
+  int output = thread_out_element + *count;
+  __syncthreads();
+  if (threadIdx.x == blockDim.x - 1)
+    *count += (thread_out_element + x);
+
+  return output;
+}
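A worked sketch of how the adjacent synchronization in ds.h chains blocks together (assuming three blocks whose selected-element counts are 4, 2 and 5): the host initializes flags = {1, 0, 0, 0} plus one trailing slot used as the workgroup-id counter. In ds_sync_irregular, block s spins until flags[s] != 0, reads flag = flags[s], publishes flags[s + 1] = flag + count_s, and keeps flag - 1 as its base output offset. Block 0 thus sees flag = 1 (offset 0) and writes flags[1] = 5; block 1 sees flag = 5 (offset 4) and writes flags[2] = 7; block 2 sees flag = 7 (offset 6). In general flags[s] holds one plus the number of elements selected by all preceding blocks, and because dynamic_wg_id hands out slice ids in scheduling order via atomicAdd, a spinning block's predecessor is already resident, so the chain always makes progress.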
diff --git a/SEL/baselines/gpu/kernel.cu b/SEL/baselines/gpu/kernel.cu
new file mode 100644
index 0000000..607dfd1
--- /dev/null
+++ b/SEL/baselines/gpu/kernel.cu
@@ -0,0 +1,113 @@
+/***************************************************************************
+ *cr
+ *cr         (C) Copyright 2015 The Board of Trustees of the
+ *cr                      University of Illinois
+ *cr                       All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+  In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP'15
+
+  Copyright (c) 2015 University of Illinois at Urbana-Champaign.
+  All rights reserved.
+
+  Permission to use, copy, modify and distribute this software and its documentation for
+  educational purpose is hereby granted without fee, provided that the above copyright
+  notice and this permission notice appear in all copies of this software and that you do
+  not sell the software.
+
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, EXPRESS, IMPLIED OR
+  OTHERWISE.
+
+  Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+__global__ void select_remove_if(T *matrix_out, T *matrix,
+                                 int size,
+                                 volatile unsigned int *flags,
+                                 struct is_even pred)
+{
+  __shared__ int count; // Per-block count of kept (non-matching) elements
+  const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1;
+
+  // Dynamic allocation of runtime workgroup id
+  if (threadIdx.x == 0) count = 0;
+  const int my_s = dynamic_wg_id(flags, num_flags);
+
+  int local_cnt = 0;
+  // Declare on-chip memory
+  T reg[REGS];
+  int pos = my_s * REGS * blockDim.x + threadIdx.x;
+  // Load in on-chip memory; -1 marks removed and out-of-range slots
+  // (the inputs generated in select.cu are non-negative)
+  #pragma unroll
+  for (int j = 0; j < REGS; j++){
+    if (pos < size){
+      reg[j] = matrix[pos];
+      if (!pred(reg[j]))
+        local_cnt++;
+      else
+        reg[j] = -1;
+    }
+    else
+      reg[j] = -1;
+    pos += blockDim.x;
+  }
+  reduction<int>(&count, local_cnt);
+
+  // Set global synch
+  ds_sync_irregular(flags, my_s, &count);
+
+  // Store to global memory
+  #pragma unroll
+  for (int j = 0; j < REGS; j++){
+    pos = block_binary_prefix_sums(&count, reg[j] >= 0);
+    if (reg[j] >= 0){
+      matrix_out[pos] = reg[j];
+    }
+  }
+}
+
+__global__ void select_copy_if(T *matrix_out, T *matrix,
+                               int size,
+                               volatile unsigned int *flags,
+                               struct is_even pred)
+{
+  __shared__ int count; // Per-block count of selected (matching) elements
+  const int num_flags = size % (blockDim.x * REGS) == 0 ? size / (blockDim.x * REGS) : size / (blockDim.x * REGS) + 1;
+
+  // Dynamic allocation of runtime workgroup id
+  if (threadIdx.x == 0) count = 0;
+  const int my_s = dynamic_wg_id(flags, num_flags);
+
+  int local_cnt = 0;
+  // Declare on-chip memory
+  T reg[REGS];
+  int pos = my_s * REGS * blockDim.x + threadIdx.x;
+  // Load in on-chip memory; -1 marks discarded and out-of-range slots
+  #pragma unroll
+  for (int j = 0; j < REGS; j++){
+    if (pos < size){
+      reg[j] = matrix[pos];
+      if (pred(reg[j]))
+        local_cnt++;
+      else
+        reg[j] = -1;
+    }
+    else
+      reg[j] = -1;
+    pos += blockDim.x;
+  }
+  reduction<int>(&count, local_cnt);
+
+  // Set global synch
+  ds_sync_irregular(flags, my_s, &count);
+
+  // Store to global memory
+  #pragma unroll
+  for (int j = 0; j < REGS; j++){
+    pos = block_binary_prefix_sums(&count, reg[j] >= 0);
+    if (reg[j] >= 0){
+      matrix_out[pos] = reg[j];
+    }
+  }
+}
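The two kernels implement the classic copy_if / remove_if semantics on top of these primitives. For reference, a minimal serial host-side sketch using the standard library (a lambda stands in for the is_even functor defined in select.cu below):

	// Serial reference: same outputs the kernels produce for integer data
	#include <algorithm>
	#include <cstdio>
	int main(){
	  int a[8] = {3, 4, 7, 8, 1, 2, 5, 6}, b[8], out[8];
	  std::copy(a, a + 8, b);                               // second copy for remove_if
	  auto even = [](int x){ return x % 2 == 0; };
	  int kept = std::copy_if(a, a + 8, out, even) - out;   // out = {4, 8, 2, 6}
	  int left = std::remove_if(b, b + 8, even) - b;        // b   = {3, 7, 1, 5, ...}
	  std::printf("%d copied, %d remain\n", kept, left);    // prints "4 copied, 4 remain"
	  return 0;
	}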
diff --git a/SEL/baselines/gpu/select.cu b/SEL/baselines/gpu/select.cu
new file mode 100644
index 0000000..b3c60d6
--- /dev/null
+++ b/SEL/baselines/gpu/select.cu
@@ -0,0 +1,242 @@
+/***************************************************************************
+ *cr
+ *cr         (C) Copyright 2015 The Board of Trustees of the
+ *cr                      University of Illinois
+ *cr                       All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+  In-Place Data Sliding Algorithms for Many-Core Architectures, presented in ICPP'15
+
+  Copyright (c) 2015 University of Illinois at Urbana-Champaign.
+  All rights reserved.
+
+  Permission to use, copy, modify and distribute this software and its documentation for
+  educational purpose is hereby granted without fee, provided that the above copyright
+  notice and this permission notice appear in all copies of this software and that you do
+  not sell the software.
+
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND, EXPRESS, IMPLIED OR
+  OTHERWISE.
+
+  Authors: Juan Gómez-Luna (el1goluj@uco.es, gomezlun@illinois.edu), Li-Wen Chang (lchang20@illinois.edu)
+*/
+
+#include "ds.h"
+
+// Sample predicate for partition (only for integer types)
+struct is_even{
+  __host__ __device__
+  bool operator()(const T &x){
+    return (x % 2) == 0;
+  }
+};
+
+#include "kernel.cu"
+
+// Sequential CPU versions for verification
+void cpu_copy_if(T *output, T *input, int elements, struct is_even pred){
+  int pos = 0;
+  for (int i = 0; i < elements; i++){
+    if (pred(input[i])){
+      output[pos] = input[i];
+      pos++;
+    }
+  }
+}
+void cpu_remove_if(T *input, int elements, struct is_even pred){
+  int pos = 0;
+  for (int i = 0; i < elements; i++){
+    if (!pred(input[i])){
+      input[pos] = input[i];
+      pos++;
+    }
+  }
+}
+
+int main(int argc, char **argv){
+
+  // Syntax verification
+  if (argc != 4){
+    printf("Wrong format\n");
+    printf("Syntax: %s <device> <%% of even elements> <numElements>\n", argv[0]);
+    exit(1);
+  }
+  int device = atoi(argv[1]);
+  int input = atoi(argv[2]);
+  int numElements = atoi(argv[3]);
+  size_t size = numElements * sizeof(T);
+
+  // Set device
+  cudaDeviceProp device_properties;
+  cudaGetDeviceProperties(&device_properties, device);
+  cudaSetDevice(device);
+
+  printf("DS Select on %s\n", device_properties.name);
+  printf("Thread block size = %d\n", L_DIM);
+  printf("Coarsening factor = %d\n", REGS);
+#ifdef FLOAT
+  printf("Single precision array: %d elements\n", numElements);
+#elif INT
+  printf("Integer array: %d elements\n", numElements);
+#else
+  printf("Double precision array: %d elements\n", numElements);
+#endif
+
+  // Event creation
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  float time1 = 0;
+  float time2 = 0;
+  float time3 = 0;
+
+  // Allocate the host input vector A
+  T *h_A = (T*)malloc(size);
+
+  // Allocate the host output vectors
+  T *h_B = (T*)malloc(size);
+  T *h_C = (T*)malloc(size);
+  T *h_D = (T*)malloc(size);
+
+  // Allocate the device input vector A and output vector B
+  T *d_A = NULL;
+  cudaMalloc((void **)&d_A, size);
+  T *d_B = NULL;
+  cudaMalloc((void **)&d_B, size);
+
+#define WARMUP 2
+#define REP 10
+  unsigned int flagM1 = 0;
+  unsigned int flagM2 = 0;
+  for (int iteration = 0; iteration < REP + WARMUP; iteration++){
+    // Initialize the host input vector: all elements odd, then turn
+    // (numElements * input) / 100 of them even at random positions
+    srand(2014);
+    for (int i = 0; i < numElements; i++)
+      h_A[i] = i % 2 != 0 ? i : i + 1;
+    int M = (numElements * input) / 100;
+    int m = M;
+    while (m > 0){
+      int x = (int)(numElements * (((float)rand()) / (float)RAND_MAX));
+      if (h_A[x] % 2 != 0){
+        h_A[x] = x * 2;
+        m--;
+      }
+    }
+
+#if PRINT
+    for (int i = 0; i < numElements; ++i){
+      printf("%d ", *(h_A + i));
+    }
+    printf("\n");
+#endif
+
+    // Copy the host input vector A to the device
+    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
+
+    int ldim = L_DIM;
+    // Synchronization flags: one per slice, plus the initial token and the
+    // dynamic workgroup-id counter
+    unsigned int *d_flags = NULL;
+    const int num_flags = numElements % (ldim * REGS) == 0 ? numElements / (ldim * REGS) : numElements / (ldim * REGS) + 1;
+    unsigned int *flags = (unsigned int *)calloc(num_flags + 2, sizeof(unsigned int));
+    flags[0] = 1;
+    flags[num_flags + 1] = 0;
+    cudaMalloc((void **)&d_flags, (num_flags + 2) * sizeof(unsigned int));
+    cudaMemcpy(d_flags, flags, (num_flags + 2) * sizeof(unsigned int), cudaMemcpyHostToDevice);
+    // Number of work-groups/thread blocks
+    int num_wg = num_flags;
+
+    // Start timer
+    cudaEventRecord(start, 0);
+
+    // Kernel launch (Copy_if)
+    select_copy_if<<<num_wg, ldim>>>(d_B, d_A, numElements, d_flags, is_even());
+
+    cudaMemcpy(&flagM1, d_flags + num_flags, sizeof(unsigned int), cudaMemcpyDeviceToHost);
+
+    // Stop timer
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&time1, start, stop);
+    if (iteration >= WARMUP) time2 += time1;
+
+    if (iteration == REP + WARMUP - 1){
+      float timer = time2 / REP;
+      double bw = (double)((numElements + flagM1) * sizeof(T)) / (double)(timer * 1000000.0);
+      printf("Copy_if - Execution time = %f ms, Throughput = %f GB/s\n", timer, bw);
+    }
+
+    // Reset the synchronization flags
+    cudaMemcpy(d_flags, flags, (num_flags + 2) * sizeof(unsigned int), cudaMemcpyHostToDevice);
+    free(flags);
+
+    // Start timer
+    cudaEventRecord(start, 0);
+
+    // Kernel launch (Remove_if)
+    select_remove_if<<<num_wg, ldim>>>(d_A, d_A, numElements, d_flags, is_even());
+
+    cudaMemcpy(&flagM2, d_flags + num_flags, sizeof(unsigned int), cudaMemcpyDeviceToHost);
+
+    // Stop timer
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&time1, start, stop);
+    if (iteration >= WARMUP) time3 += time1;
+
+    if (iteration == REP + WARMUP - 1){
+      float timer = time3 / REP;
+      double bw = (double)((numElements + flagM2) * sizeof(T)) / (double)(timer * 1000000.0);
+      printf("Remove_if - Execution time = %f ms, Throughput = %f GB/s\n", timer, bw);
+    }
+
+    // Free flags
+    cudaFree(d_flags);
+  }
+  // Copy results back to host memory
+  cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_C, d_A, size, cudaMemcpyDeviceToHost);
+
+  // CPU execution for comparison
+  cpu_copy_if(h_D, h_A, numElements, is_even());
+  cpu_remove_if(h_A, numElements, is_even());
+
+  // Verify that the result vectors are correct
+#if PRINT
+  for (int i = 0; i < numElements; ++i){
+    printf("%d ", *(h_B + i));
+  }
+  printf("\n");
+  for (int i = 0; i < numElements; ++i){
+    printf("%d ", *(h_D + i));
+  }
+  printf("\n");
+#endif
+  for (int i = 0; i < flagM1 - 1; ++i){
+    if (h_B[i] != h_D[i]){
+      fprintf(stderr, "Copy_if - Result verification failed at element %d!\n", i);
+      exit(EXIT_FAILURE);
+    }
+  }
+  for (int i = 0; i < flagM2 - 1; ++i){
+    if (h_C[i] != h_A[i]){
+      fprintf(stderr, "Remove_if - Result verification failed at element %d!\n", i);
+      exit(EXIT_FAILURE);
+    }
+  }
+  printf("Test PASSED\n");
+
+  // Free device global memory
+  cudaFree(d_A);
+  cudaFree(d_B);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  // Free host memory
+  free(h_A);
+  free(h_B);
+  free(h_C);
+  free(h_D);
+
+  return 0;
+}
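A closing note on the launch arithmetic: num_wg is the number of input tiles, ceil(numElements / (THREADS * COARSENING)). For the README's sample run, ./select 0 50 1258291200, built as in the Makefile with THREADS=512 and COARSENING=32, each block processes 512 * 32 = 16384 elements, so both kernels launch exactly 1258291200 / 16384 = 76800 thread blocks and allocate 76802 synchronization flags.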