summary refs log tree commit diff
path: root/NW/host/app.c
diff options
context:
space:
mode:
Diffstat (limited to 'NW/host/app.c')
-rw-r--r-- NW/host/app.c 879
1 files changed, 879 insertions, 0 deletions
diff --git a/NW/host/app.c b/NW/host/app.c
new file mode 100644
index 0000000..0e899ec
--- /dev/null
+++ b/NW/host/app.c
@@ -0,0 +1,879 @@
+/**
+* app.c
+* NW Host Application Source File
+*
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <dpu.h>
+#include <dpu_log.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include "../support/common.h"
+#include "../support/timer.h"
+#include "../support/params.h"
+
+#if ENERGY
+#include <dpu_probe.h>
+#endif
+
+// Define the DPU Binary path as DPU_BINARY here
+#ifndef DPU_BINARY
+#define DPU_BINARY "./bin/nw_dpu"
+#endif
+
+// Traceback in the host
+//
+// Walks the score matrix input_itemsets (row stride max_cols) backwards from
+// cell (max_rows - 2, max_rows - 2) towards cell (0, 0). At every step the
+// predecessor (north-west, west or north) whose updated score is the maximum
+// is selected, its value is appended to traceback_output and the walk moves
+// onto it.
+//
+// traceback_output  receives one value per step; the caller provides the room
+// file              (PRINT_FILE builds only) text file that receives the path
+// input_itemsets    score matrix, read only
+// reference         substitution scores, read at [i * max_cols + j]
+// max_rows/max_cols dimensions used for the start cell and the row stride
+// penalty           gap penalty subtracted for west and north moves
+//
+// NOTE(review): reference is indexed with the stride of input_itemsets, while
+// the code that fills it in this file uses a narrower row stride, and the DPU
+// call passes max_cols + 1. That looks like a mis-indexed (possibly
+// out-of-bounds) read; fixing it needs the reference stride as an argument.
+#if PRINT_FILE
+static void traceback(int* traceback_output, char *file, int32_t *input_itemsets, int32_t *reference, unsigned int max_rows, unsigned int max_cols, unsigned int penalty) {
+    FILE *fpo = fopen(file, "w"); // Use to print to an output file
+    if (fpo == NULL)
+        perror(file); // keep going: the path is still stored in traceback_output
+#else
+static void traceback(int* traceback_output, int32_t *input_itemsets, int32_t *reference, unsigned int max_rows, unsigned int max_cols, unsigned int penalty) {
+#endif
+
+    int k = 0;
+    for (int i = max_rows - 2, j = max_rows - 2; i >= 0 && j >= 0;) {
+        int nw = 0, n = 0, w = 0;
+#if PRINT_FILE
+        if (fpo != NULL && i == (int)max_rows - 2 && j == (int)max_rows - 2)
+            fprintf(fpo, "%d ", input_itemsets[i * max_cols + j]); // print the first element
+#endif
+
+        if (i == 0 && j == 0)
+            break;
+
+        if (i > 0 && j > 0) {
+            nw = input_itemsets[(i - 1) * max_cols + j - 1];
+            w = input_itemsets[i * max_cols + j - 1];
+            n = input_itemsets[(i - 1) * max_cols + j];
+        } else if (i == 0) {
+            // First row: only a west move is possible
+            nw = n = LIMIT;
+            w = input_itemsets[i * max_cols + j - 1];
+        } else {
+            // First column (j == 0): only a north move is possible
+            nw = w = LIMIT;
+            n = input_itemsets[(i - 1) * max_cols + j];
+        }
+
+        int new_nw = nw + reference[i * max_cols + j];
+        int new_w = w - penalty;
+        int new_n = n - penalty;
+
+        // Pick exactly one predecessor and remember the direction explicitly.
+        // Comparing values again afterwards would be ambiguous whenever two
+        // neighbours (or a neighbour and a candidate score) hold the same value.
+        int best = maximum(new_nw, new_w, new_n);
+        int value;
+        int step_i = 0, step_j = 0;
+        if (best == new_nw) {
+            value = nw;
+            step_i = 1;
+            step_j = 1;
+        } else if (best == new_w) {
+            value = w;
+            step_j = 1;
+        } else {
+            value = n;
+            step_i = 1;
+        }
+
+#if PRINT_FILE
+        if (fpo != NULL)
+            fprintf(fpo, "%d ", value);
+#endif
+        traceback_output[k++] = value;
+
+        i -= step_i;
+        j -= step_j;
+    }
+
+#if PRINT_FILE
+    if (fpo != NULL)
+        fclose(fpo);
+#endif
+    return;
+}
+
+// Computes one BL x BL block of the score matrix on the host.
+//
+// The block at block coordinates (b_index_x, b_index_y) is copied, together
+// with the row above it and the column left of it, into the scratch buffer
+// input_itemsets_l ((BL+1) x (BL+1)); its BL x BL reference scores are copied
+// into reference_l. The recurrence is evaluated in the scratch buffers and the
+// BL x BL interior is written back into input_itemsets.
+static void nw_host_block(int32_t *input_itemsets, int32_t *reference, uint64_t max_cols, unsigned int penalty,
+                          uint64_t b_index_x, uint64_t b_index_y, int32_t *input_itemsets_l, int32_t *reference_l) {
+
+    // Gather the reference scores (row stride max_cols - 1)
+    for (uint64_t i = 0; i < BL; i++) {
+        for (uint64_t j = 0; j < BL; j++) {
+            reference_l[i*BL + j] = reference[(max_cols-1) * (b_index_y*BL + i) + b_index_x*BL + j];
+        }
+    }
+
+    // Gather the block plus its top/left halo (row stride max_cols)
+    for (uint64_t i = 0; i < BL + 1; i++) {
+        for (uint64_t j = 0; j < BL + 1; j++) {
+            input_itemsets_l[i*(BL + 1) + j] = input_itemsets[max_cols*(b_index_y*BL + i) + b_index_x*BL + j];
+        }
+    }
+
+    // Computation
+    for (uint64_t i = 1; i < BL + 1; i++) {
+        for (uint64_t j = 1; j < BL + 1; j++) {
+            input_itemsets_l[i*(BL + 1) + j] = maximum(input_itemsets_l[(i-1)*(BL+1) + j - 1] + reference_l[(i-1)*BL + j - 1],
+                                                       input_itemsets_l[i*(BL+1) + j - 1] - penalty,
+                                                       input_itemsets_l[(i-1)*(BL+1) + j] - penalty);
+        }
+    }
+
+    // Scatter the computed interior back
+    for (uint64_t i = 0; i < BL; i++) {
+        for (uint64_t j = 0; j < BL; j++) {
+            input_itemsets[max_cols*(b_index_y*BL + i + 1) + b_index_x*BL + j + 1] = input_itemsets_l[(i+1)*(BL+1) + j + 1];
+        }
+    }
+}
+
+// Compute output in the host
+//
+// Fills input_itemsets (max_cols entries per row) block by block, visiting the
+// blocks anti-diagonal by anti-diagonal: first the diagonals of growing length
+// (top-left part), then the diagonals of shrinking length (bottom-right part).
+// reference holds the substitution scores with max_cols - 1 entries per row.
+// The process exits with an error message if the scratch buffers cannot be
+// allocated.
+static void nw_host(int32_t *input_itemsets, int32_t *reference, uint64_t max_cols, unsigned int penalty) {
+
+    int32_t *input_itemsets_l = (int32_t *) malloc((BL + 1) * (BL + 1) * sizeof(int32_t));
+    int32_t *reference_l = (int32_t *) malloc((BL * BL) * sizeof(int32_t));
+    if (input_itemsets_l == NULL || reference_l == NULL) {
+        fprintf(stderr, "nw_host: failed to allocate block buffers\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // top-left
+    for (uint64_t blk = 1; blk <= (max_cols-1)/BL; blk++) {
+        for (uint64_t b_index_x = 0; b_index_x < blk; b_index_x++) {
+            uint64_t b_index_y = blk - 1 - b_index_x;
+            nw_host_block(input_itemsets, reference, max_cols, penalty, b_index_x, b_index_y, input_itemsets_l, reference_l);
+        }
+    }
+
+    // bottom-right
+    for (uint64_t blk = 2; blk <= (max_cols-1)/BL; blk++) {
+        for (uint64_t b_index_x = blk - 1; b_index_x < (max_cols-1)/BL; b_index_x++) {
+            uint64_t b_index_y = (max_cols-1)/BL + blk - 2 - b_index_x;
+            nw_host_block(input_itemsets, reference, max_cols, penalty, b_index_x, b_index_y, input_itemsets_l, reference_l);
+        }
+    }
+
+    free(input_itemsets_l);
+    free(reference_l);
+    return;
+}
+
+// Index, along the current anti-diagonal, of the first block handled by DPU
+// dpu_index when nr_of_blocks blocks are dealt out contiguously over
+// nr_of_dpus DPUs and the first (nr_of_blocks % nr_of_dpus) DPUs each take one
+// extra block.
+static unsigned int first_block_of_dpu(unsigned int dpu_index, unsigned int nr_of_blocks, unsigned int nr_of_dpus) {
+    unsigned int chunks = nr_of_blocks / nr_of_dpus;
+    unsigned int rest_blocks = nr_of_blocks % nr_of_dpus;
+    if (dpu_index < rest_blocks)
+        return dpu_index * (chunks + 1);
+    return rest_blocks * (chunks + 1) + (dpu_index - rest_blocks) * chunks;
+}
+
+// Main of the Host Application
+//
+// For every warm-up and timed repetition:
+//   1. builds the reference scores and the boundary row/column of both score
+//      matrices (input_itemsets_host with row stride max_cols for the CPU,
+//      input_itemsets with row stride max_cols + 1 for the DPUs),
+//   2. computes the CPU result with nw_host(),
+//   3. processes the block anti-diagonals on the DPUs (top-left part, then
+//      bottom-right part): arguments, score rows and reference rows are
+//      pushed, the kernel is launched and the computed rows are read back,
+//   4. runs the traceback on the DPU result.
+// Finally the timers are printed and the interior of both score matrices is
+// compared. Returns 0 when they are equal and -1 otherwise (or when the host
+// buffers cannot be allocated).
+int main(int argc, char **argv) {
+
+    struct Params p = input_params(argc, argv);
+    struct dpu_set_t dpu_set, dpu;
+    uint32_t nr_of_dpus, max_dpus;
+
+#if ENERGY
+    struct dpu_probe_t probe;
+    DPU_ASSERT(dpu_probe_init("energy probe", &probe));
+#endif
+
+    // Allocate DPUs and load binary
+    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+    printf("Allocated %u DPU(s)\n", nr_of_dpus);
+    printf("Allocated %d TASKLET(s) per DPU\n", NR_TASKLETS);
+#if DYNAMIC
+    max_dpus = nr_of_dpus;
+#endif
+
+    uint64_t max_rows = p.max_rows + 1;
+    uint64_t max_cols = p.max_rows + 1;
+    unsigned int penalty = p.penalty;
+    int32_t *reference = (int32_t *) malloc(max_rows * max_cols * sizeof(int32_t));
+    int32_t *input_itemsets_host = (int32_t *) malloc(max_rows * max_cols * sizeof(int32_t));
+    int32_t *input_itemsets = (int32_t *) malloc((max_rows+1) * (max_cols+1) * sizeof(int32_t));
+    dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t));
+    printf("Max size %d\n", p.max_rows);
+
+    // Traceback output
+    int32_t* traceback_output = (int32_t *) malloc((max_rows + max_cols) * sizeof(int32_t));
+    int32_t* traceback_output_host = (int32_t *) malloc((max_rows + max_cols) * sizeof(int32_t));
+
+    // This array is used for dummy/stale CPU-DPU transfers
+    // (zero-filled so that no uninitialized memory is ever pushed to a DPU)
+    int32_t *dummy = (int32_t *) calloc(nr_of_dpus * (BL+2), sizeof(int32_t));
+
+    if (reference == NULL || input_itemsets_host == NULL || input_itemsets == NULL || input_args == NULL
+        || traceback_output == NULL || traceback_output_host == NULL || dummy == NULL) {
+        fprintf(stderr, "Failed to allocate host buffers\n");
+        free(reference);
+        free(input_itemsets_host);
+        free(input_itemsets);
+        free(input_args);
+        free(traceback_output);
+        free(traceback_output_host);
+        free(dummy);
+        DPU_ASSERT(dpu_free(dpu_set));
+        return -1;
+    }
+    memset(traceback_output, 0, (max_rows + max_cols) * sizeof(int32_t));
+    memset(traceback_output_host, 0, (max_rows + max_cols) * sizeof(int32_t));
+
+    // Number of BL-wide blocks per matrix dimension (= length of the longest anti-diagonal)
+    const uint64_t blocks_per_dim = (max_cols-1)/BL;
+    // Timer slot used for the score-matrix transfers of the top-left part
+    const int itemsets_timer = (blocks_per_dim == 1) ? 2 : 1;
+
+    // Timer
+    Timer timer;
+    Timer long_diagonal_timer;
+#if ENERGY
+    double tacc_energy, tacc_time, tavg_time;
+    double tavg_energy=0;
+#endif
+
+    for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+
+        // Initializing inputs are needed at each iteration
+        // Initialize input itemsets
+        for (unsigned int i = 0; i < max_rows; i++) {
+            for (unsigned int j = 0; j < max_cols; j++) {
+                input_itemsets_host[i * max_cols + j] = 0;
+            }
+        }
+
+        for (unsigned int i = 0; i <= max_rows; i++) {
+            for (unsigned int j = 0; j <= max_cols; j++) {
+                input_itemsets[i * (max_cols+1) + j] = 0;
+            }
+        }
+
+        // Define random sequences
+        srand(7);
+        for (unsigned int i = 1; i < max_rows; i++) {
+            input_itemsets_host[i * max_cols] = rand() % 10 + 1;
+        }
+
+        for (unsigned int j = 1; j < max_cols; j++) {
+            input_itemsets_host[j] = rand() % 10 + 1;
+        }
+
+        // The random sequences live in the first column/row of
+        // input_itemsets_host (written just above). input_itemsets is still all
+        // zeros here and uses a different row stride, so it must not be read.
+        for (unsigned int i = 0; i < max_rows-1; i++) {
+            for (unsigned int j = 0; j < max_cols-1; j++) {
+                reference[i * (max_cols-1) + j] = blosum62[input_itemsets_host[(i+1) * max_cols]][input_itemsets_host[j+1]];
+            }
+        }
+
+        // Replace the sequences by the boundary scores in both matrices
+        for (unsigned int i = 1; i < max_rows; i++) {
+            input_itemsets_host[i * max_cols] = -i * penalty;
+            input_itemsets[i * (max_cols+1)] = -i * penalty;
+        }
+
+        for (unsigned int j = 1; j < max_cols; j++) {
+            input_itemsets_host[j] = -j * penalty;
+            input_itemsets[j] = -j * penalty;
+        }
+
+        if (rep >= p.n_warmup)
+            start(&timer, 0, rep - p.n_warmup);
+        // Computation on host CPU
+        nw_host(input_itemsets_host, reference, max_cols, penalty);
+
+        // Print host output
+#if PRINT_FILE
+        if (rep >= p.n_warmup) {
+            char *host_file = "./bin/host_output.txt";
+            traceback(traceback_output_host, host_file, input_itemsets_host, reference, max_rows, max_cols, penalty);
+        }
+#endif
+        if (rep >= p.n_warmup)
+            stop(&timer, 0);
+
+        // Top-left computation on DPUs
+        for (unsigned int blk = 1; blk <= blocks_per_dim; blk++) {
+            // Number of blocks on this anti-diagonal
+            const unsigned int nr_of_blocks = blk;
+#if DYNAMIC
+            // If nr_of_blocks are lower than max_dpus,
+            // set nr_of_dpus to be equal with nr_of_blocks
+            if (nr_of_blocks < max_dpus) {
+                DPU_ASSERT(dpu_free(dpu_set));
+                DPU_ASSERT(dpu_alloc(nr_of_blocks, NULL, &dpu_set));
+                DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+                DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+            } else if (nr_of_dpus != max_dpus) {
+                DPU_ASSERT(dpu_free(dpu_set));
+                DPU_ASSERT(dpu_alloc(max_dpus, NULL, &dpu_set));
+                DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+                DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+            }
+#if PRINT
+            printf("Allocated %u DPU(s) for %u (%u) blocks\n", nr_of_dpus, nr_of_blocks, blk);
+#endif
+#endif
+
+            // Largest number of blocks any DPU handles on this diagonal
+            const unsigned int rest_blocks = nr_of_blocks % nr_of_dpus;
+            unsigned int blocks_per_dpu = nr_of_blocks / nr_of_dpus;
+            if (rest_blocks != 0)
+                blocks_per_dpu++;
+
+            // Copy data to DPUs
+            unsigned int i = 0;
+            DPU_FOREACH(dpu_set, dpu, i) {
+                // Copy input arguments to dpu
+                input_args[i].nblocks = nr_of_blocks / nr_of_dpus + (i < rest_blocks ? 1 : 0);
+                input_args[i].active_blocks = blocks_per_dpu;
+                input_args[i].penalty = penalty;
+                DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
+            }
+            DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
+
+            // Copy itemsets to DPUs
+            unsigned int mram_offset = 0;
+
+            if (rep >= p.n_warmup) {
+                start(&timer, itemsets_timer, rep - p.n_warmup + blk - 1);
+                // Timer for longest diagonal
+                if (blk == blocks_per_dim)
+                    start(&long_diagonal_timer, itemsets_timer, rep - p.n_warmup);
+            }
+
+#if PRINT
+            uint64_t total_dpu_memory = 0;
+            total_dpu_memory = (uint64_t) blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t) + (uint64_t) blocks_per_dpu * BL * BL * sizeof(int32_t);
+            printf("Total memory allocated in each DPU %llu bytes\n", (unsigned long long) total_dpu_memory);
+#endif
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL + 1; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        // DPUs without a block in this round transfer the dummy buffer
+                        int32_t *dpu_pointer = dummy;
+                        uint64_t input_itemsets_offset = 0;
+                        if (i + bl_indx * nr_of_dpus < nr_of_blocks) {
+                            uint64_t b_index_x = first_block_of_dpu(i, nr_of_blocks, nr_of_dpus) + bl_indx;
+                            uint64_t b_index_y = blk - 1 - b_index_x;
+                            dpu_pointer = input_itemsets;
+                            input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1);
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset));
+                    }
+
+                    // Row 0 is sent in full, the other rows only contribute their first two elements
+                    if (bl == 0)
+                        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    else
+                        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, 2 * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += ((BL+2) * sizeof(int32_t));
+
+                }
+            }
+            if (rep >= p.n_warmup) {
+                stop(&timer, itemsets_timer);
+                // Timer for longest diagonal
+                if (blk == blocks_per_dim)
+                    stop(&long_diagonal_timer, itemsets_timer);
+            }
+
+
+            if (rep >= p.n_warmup) {
+                start(&timer, 2, rep - p.n_warmup + blk - 1);
+                // Timer for longest diagonal
+                if (blk == blocks_per_dim) {
+                    start(&long_diagonal_timer, 2, rep - p.n_warmup);
+                }
+            }
+            // Copy reference to DPUs (stored after all score rows)
+            mram_offset = blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t);
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        int32_t *dpu_pointer = dummy;
+                        uint64_t reference_offset = 0;
+                        if (i + bl_indx * nr_of_dpus < nr_of_blocks) {
+                            uint64_t b_index_x = first_block_of_dpu(i, nr_of_blocks, nr_of_dpus) + bl_indx;
+                            uint64_t b_index_y = blk - 1 - b_index_x;
+                            dpu_pointer = reference;
+                            reference_offset = b_index_y * (max_cols - 1) * BL + b_index_x * BL + bl * (max_cols - 1);
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + reference_offset));
+                    }
+                    DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, BL * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += BL * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup) {
+                stop(&timer, 2);
+                if (blk == blocks_per_dim) {
+                    stop(&long_diagonal_timer, 2);
+                }
+            }
+
+#if ENERGY
+            if (rep >= p.n_warmup) {
+                DPU_ASSERT(dpu_probe_start(&probe));
+            }
+#endif
+            if (rep >= p.n_warmup) {
+                start(&timer, 3, rep - p.n_warmup + blk - 1);
+                // Timer for longest diagonal
+                if (blk == blocks_per_dim) {
+                    start(&long_diagonal_timer, 3, rep - p.n_warmup);
+                }
+            }
+            // Launch kernel on DPUs
+            DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+            if (rep >= p.n_warmup) {
+                stop(&timer, 3);
+                // Timer for longest diagonal
+                if (blk == blocks_per_dim) {
+                    stop(&long_diagonal_timer, 3);
+                }
+            }
+#if ENERGY
+            if (rep >= p.n_warmup) {
+                DPU_ASSERT(dpu_probe_stop(&probe));
+            }
+#endif
+
+#if ENERGY
+            double acc_energy, avg_energy, acc_time, avg_time;
+            DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+            DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
+            DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
+            DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
+            tavg_energy += avg_energy;
+#endif
+
+#if PRINT
+            // Display DPU Logs
+            DPU_FOREACH(dpu_set, dpu) {
+                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
+            }
+#endif
+
+            if (rep >= p.n_warmup) {
+                start(&timer, 4, rep - p.n_warmup + blk - 1);
+                // Timer for longest diagonal
+                if (blk == blocks_per_dim) {
+                    start(&long_diagonal_timer, 4, rep - p.n_warmup);
+                }
+            }
+            // Retrieve results
+            // Copy output result to Host CPU
+            mram_offset = 0;
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL + 1; bl++) {
+
+                    if (bl == 0) { // Skip the first row of the block
+                        mram_offset += (BL+2) * sizeof(int32_t);
+                        continue;
+                    }
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        int32_t *dpu_pointer = dummy;
+                        uint64_t input_itemsets_offset = 0;
+                        if (i + bl_indx * nr_of_dpus < nr_of_blocks) {
+                            uint64_t b_index_x = first_block_of_dpu(i, nr_of_blocks, nr_of_dpus) + bl_indx;
+                            uint64_t b_index_y = blk - 1 - b_index_x;
+                            dpu_pointer = input_itemsets;
+                            input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1);
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset));
+                    }
+                    DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += (BL+2) * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup) {
+                stop(&timer, 4);
+                // Timer for longest diagonal
+                if (blk == blocks_per_dim) {
+                    stop(&long_diagonal_timer, 4);
+                }
+            }
+        }
+
+
+        // Bottom-right computation on DPUs
+        for (unsigned int blk = 2; blk <= blocks_per_dim; blk++) {
+            // Number of blocks on this anti-diagonal
+            const unsigned int nr_of_blocks = (unsigned int) (blocks_per_dim - blk + 1);
+#if DYNAMIC
+            // If nr_of_blocks are lower than max_dpus,
+            // set nr_of_dpus to be equal with nr_of_blocks
+            if (nr_of_blocks < max_dpus) {
+                DPU_ASSERT(dpu_free(dpu_set));
+                DPU_ASSERT(dpu_alloc(nr_of_blocks, NULL, &dpu_set));
+                DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+                DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+            } else if (nr_of_dpus != max_dpus) {
+                DPU_ASSERT(dpu_free(dpu_set));
+                DPU_ASSERT(dpu_alloc(max_dpus, NULL, &dpu_set));
+                DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+                DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+            }
+#if PRINT
+            printf("Allocated %u DPU(s) for %u (%u) blocks\n", nr_of_dpus, nr_of_blocks, nr_of_blocks);
+#endif
+#endif
+
+            // Largest number of blocks any DPU handles on this diagonal
+            const unsigned int rest_blocks = nr_of_blocks % nr_of_dpus;
+            unsigned int blocks_per_dpu = nr_of_blocks / nr_of_dpus;
+            if (rest_blocks != 0)
+                blocks_per_dpu++;
+
+            // Copy data to DPUs
+            unsigned int i = 0;
+            DPU_FOREACH(dpu_set, dpu, i) {
+                // Copy input arguments to dpu
+                input_args[i].nblocks = nr_of_blocks / nr_of_dpus + (i < rest_blocks ? 1 : 0);
+                input_args[i].active_blocks = blocks_per_dpu;
+                input_args[i].penalty = penalty;
+                DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
+            }
+            DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
+
+            if (rep >= p.n_warmup)
+                start(&timer, 1, rep - p.n_warmup + blk - 1);
+            // Copy itemsets to DPUs
+#if PRINT
+            uint64_t total_dpu_memory = 0;
+            total_dpu_memory = (uint64_t) blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t) + (uint64_t) blocks_per_dpu * BL * BL * sizeof(int32_t);
+            printf("Total memory allocated in each DPU %llu bytes\n", (unsigned long long) total_dpu_memory);
+#endif
+            unsigned int mram_offset = 0;
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL + 1; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        // DPUs without a block in this round transfer the dummy buffer
+                        int32_t *dpu_pointer = dummy;
+                        uint64_t input_itemsets_offset = 0;
+                        if (i + bl_indx * nr_of_dpus < nr_of_blocks) {
+                            uint64_t b_index_x = blk - 1 + first_block_of_dpu(i, nr_of_blocks, nr_of_dpus) + bl_indx;
+                            uint64_t b_index_y = blocks_per_dim + blk - 2 - b_index_x;
+                            dpu_pointer = input_itemsets;
+                            input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1);
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset));
+                    }
+
+                    // Row 0 is sent in full, the other rows only contribute their first two elements
+                    if (bl == 0)
+                        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    else
+                        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, 2 * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += (BL+2) * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup)
+                stop(&timer, 1);
+
+
+            if (rep >= p.n_warmup)
+                start(&timer, 2, rep - p.n_warmup + blk - 1);
+            // Copy reference to DPUs (stored after all score rows)
+            mram_offset = blocks_per_dpu * (BL+1) * (BL+2) * sizeof(int32_t);
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL; bl++) {
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        int32_t *dpu_pointer = dummy;
+                        uint64_t reference_offset = 0;
+                        if (i + bl_indx * nr_of_dpus < nr_of_blocks) {
+                            uint64_t b_index_x = blk - 1 + first_block_of_dpu(i, nr_of_blocks, nr_of_dpus) + bl_indx;
+                            uint64_t b_index_y = blocks_per_dim + blk - 2 - b_index_x;
+                            dpu_pointer = reference;
+                            reference_offset = b_index_y * (max_cols - 1) * BL + b_index_x * BL + bl * (max_cols - 1);
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + reference_offset));
+                    }
+
+                    DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, BL * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += BL * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup)
+                stop(&timer, 2);
+
+#if ENERGY
+            if (rep >= p.n_warmup) {
+                DPU_ASSERT(dpu_probe_start(&probe));
+            }
+#endif
+            if (rep >= p.n_warmup)
+                start(&timer, 3, rep - p.n_warmup + blk - 1); // Do not re-initialize the counter
+            // Launch kernel on DPUs
+            DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+            if (rep >= p.n_warmup)
+                stop(&timer, 3);
+#if ENERGY
+            if (rep >= p.n_warmup) {
+                DPU_ASSERT(dpu_probe_stop(&probe));
+            }
+#endif
+
+#if ENERGY
+            double acc_energy, avg_energy, acc_time, avg_time;
+            DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+            DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
+            DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
+            DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
+            tavg_energy += avg_energy;
+#endif
+
+#if PRINT
+            // Display DPU Logs
+            DPU_FOREACH(dpu_set, dpu) {
+                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
+            }
+#endif
+
+
+            if (rep >= p.n_warmup)
+                start(&timer, 4, rep - p.n_warmup + blk - 1);
+            // Retrieve results
+            // Copy output result to Host CPU
+            mram_offset = 0;
+            for (unsigned int bl_indx = 0; bl_indx < blocks_per_dpu; bl_indx++) {
+                for (unsigned int bl = 0; bl < BL + 1; bl++) {
+
+                    if (bl == 0) { // Skip the first row of the block
+                        mram_offset += (BL+2) * sizeof(int32_t);
+                        continue;
+                    }
+
+                    i = 0;
+                    DPU_FOREACH(dpu_set, dpu, i) {
+                        int32_t *dpu_pointer = dummy;
+                        uint64_t input_itemsets_offset = 0;
+                        if (i + bl_indx * nr_of_dpus < nr_of_blocks) {
+                            uint64_t b_index_x = blk - 1 + first_block_of_dpu(i, nr_of_blocks, nr_of_dpus) + bl_indx;
+                            uint64_t b_index_y = blocks_per_dim + blk - 2 - b_index_x;
+                            dpu_pointer = input_itemsets;
+                            input_itemsets_offset = b_index_y * (max_cols+1) * BL + b_index_x * BL + bl * (max_cols + 1);
+                        }
+
+                        DPU_ASSERT(dpu_prepare_xfer(dpu, dpu_pointer + input_itemsets_offset));
+                    }
+
+                    DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, mram_offset, (BL+2) * sizeof(int32_t), DPU_XFER_DEFAULT));
+                    mram_offset += (BL+2) * sizeof(int32_t);
+
+                }
+            }
+            if (rep >= p.n_warmup)
+                stop(&timer, 4);
+
+
+        }
+
+        // Traceback step
+        if (rep >= p.n_warmup)
+            start(&timer, 1, 1);
+#if PRINT_FILE
+        char *dpu_file = "./bin/dpu_output.txt";
+        traceback(traceback_output, dpu_file, input_itemsets, reference, max_rows+1, max_cols+1, penalty);
+#else
+        traceback(traceback_output, input_itemsets, reference, max_rows+1, max_cols+1, penalty);
+#endif
+        if (rep >= p.n_warmup)
+            stop(&timer, 1);
+
+    }
+
+    // Print timing results
+    printf("CPU version ");
+    print(&timer, 0, p.n_reps);
+    printf("CPU-DPU ");
+    print(&timer, 2, p.n_reps);
+    printf("DPU Kernel ");
+    print(&timer, 3, p.n_reps);
+    printf("Inter-DPU ");
+    print(&timer, 1, p.n_reps);
+    printf("DPU-CPU ");
+    print(&timer, 4, p.n_reps);
+    printf("\n");
+    printf("Longest Diagonal CPU-DPU ");
+    print(&long_diagonal_timer, 2, p.n_reps);
+    printf("Longest Diagonal DPU Kernel ");
+    print(&long_diagonal_timer, 3, p.n_reps);
+    printf("Longest Diagonal Inter-DPU ");
+    print(&long_diagonal_timer, 1, p.n_reps);
+    printf("Longest Diagonal DPU-CPU ");
+    print(&long_diagonal_timer, 4, p.n_reps);
+    printf("\n");
+
+#if ENERGY
+    printf("DPU Energy (J): %f \t ", tavg_energy / p.n_reps);
+#endif
+
+    // Check output: compare the interior of the CPU and DPU score matrices
+    bool status = true;
+    for (uint64_t i = 1; i < max_rows; i++) {
+        for (uint64_t j = 1; j < max_cols; j++) {
+            if (input_itemsets_host[i*max_cols + j] != input_itemsets[i*(max_cols+1) + j]) {
+                status = false;
+#if PRINT
+                printf("%llu (%llu, %llu): %d %d\n", (unsigned long long) (i*max_cols + j), (unsigned long long) i, (unsigned long long) j, input_itemsets_host[i*max_cols + j], input_itemsets[i*(max_cols+1) + j]);
+#endif
+            }
+        }
+    }
+
+    if (status) {
+        printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+    } else {
+        printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
+    }
+
+    free(input_itemsets_host);
+    free(input_itemsets);
+    free(reference);
+    free(input_args);
+    free(traceback_output);
+    free(traceback_output_host);
+    free(dummy);
+    DPU_ASSERT(dpu_free(dpu_set));
+    return status ? 0 : -1;
+}