summaryrefslogtreecommitdiff
path: root/MLP/host/app.c
diff options
context:
space:
mode:
Diffstat (limited to 'MLP/host/app.c')
-rw-r--r--MLP/host/app.c176
1 files changed, 111 insertions, 65 deletions
diff --git a/MLP/host/app.c b/MLP/host/app.c
index 952cb3f..24243bf 100644
--- a/MLP/host/app.c
+++ b/MLP/host/app.c
@@ -27,28 +27,29 @@
#define DPU_BINARY "./bin/mlp_dpu"
#endif
-static T** A;
-static T* B;
-static T* B_host;
-static T* B_tmp;
-static T* C;
-static T* C_dpu;
+static T **A;
+static T *B;
+static T *B_host;
+static T *B_tmp;
+static T *C;
+static T *C_dpu;
// Create input arrays
-static void init_data(T** A, T* B, T* B_host, unsigned int m_size, unsigned int n_size) {
+static void init_data(T **A, T *B, T *B_host, unsigned int m_size,
+ unsigned int n_size)
+{
for (unsigned int l = 0; l < NUM_LAYERS; l++)
- for (unsigned int i = 0; i < m_size * n_size; i++){
- if(i % 100 < 98){
+ for (unsigned int i = 0; i < m_size * n_size; i++) {
+ if (i % 100 < 98) {
A[l][i] = 0;
- }else{
- A[l][i] = (l+i) % 2;
+ } else {
+ A[l][i] = (l + i) % 2;
}
}
- for (unsigned int i = 0; i < n_size; i++){
- if(i % 50 < 48){
+ for (unsigned int i = 0; i < n_size; i++) {
+ if (i % 50 < 48) {
B[i] = 0;
- }
- else{
+ } else {
B[i] = i % 2;
}
B_host[i] = B[i];
@@ -56,26 +57,29 @@ static void init_data(T** A, T* B, T* B_host, unsigned int m_size, unsigned int
}
// Compute output in the host
-static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
+static void mlp_host(T *C, T **A, T *B, unsigned int m_size,
+ unsigned int n_size)
+{
- for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
- for (unsigned int m = 0; m < m_size; m++){
+ for (unsigned int nl = 0; nl < NUM_LAYERS; nl++) {
+ for (unsigned int m = 0; m < m_size; m++) {
C[m] = 0;
}
- for (unsigned int m = 0; m < m_size; m++){
- for (unsigned int n = 0; n < n_size; n++){
+ for (unsigned int m = 0; m < m_size; m++) {
+ for (unsigned int n = 0; n < n_size; n++) {
C[m] += A[nl][m * n_size + n] * B[n];
}
C[m] = max(0, C[m]);
}
- for (unsigned int n = 0; n < n_size; n++){
+ for (unsigned int n = 0; n < n_size; n++) {
B[n] = C[n];
}
}
}
// Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
struct Params p = input_params(argc, argv);
@@ -97,14 +101,15 @@ int main(int argc, char **argv) {
unsigned int n_size = p.n_size;
// Initialize help data
- dpu_info = (struct dpu_info_t *) malloc(nr_of_dpus * sizeof(struct dpu_info_t));
- dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t));
+ dpu_info =
+ (struct dpu_info_t *)malloc(nr_of_dpus * sizeof(struct dpu_info_t));
+ dpu_arguments_t *input_args =
+ (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t));
uint32_t max_rows_per_dpu = 0;
uint32_t n_size_pad = n_size;
- if(n_size % 2 == 1){
+ if (n_size % 2 == 1) {
n_size_pad++;
}
-
// Timer
Timer timer;
i = 0;
@@ -118,7 +123,10 @@ int main(int argc, char **argv) {
rows_per_dpu++;
if (rest_rows > 0) {
if (i >= rest_rows)
- prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks;
+ prev_rows_dpu =
+ rest_rows * (chunks + 1) + (i -
+ rest_rows) *
+ chunks;
else
prev_rows_dpu = i * (chunks + 1);
} else {
@@ -127,7 +135,7 @@ int main(int argc, char **argv) {
// Keep max rows for parallel transfers
uint32_t rows_per_dpu_pad = rows_per_dpu;
- if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
+ if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
rows_per_dpu_pad++;
if (rows_per_dpu_pad > max_rows_per_dpu)
max_rows_per_dpu = rows_per_dpu_pad;
@@ -142,14 +150,15 @@ int main(int argc, char **argv) {
input_args[i].nr_rows = rows_per_dpu;
}
- A = (T**)malloc(NUM_LAYERS * sizeof(T*));
- for(l = 0; l < NUM_LAYERS; l++)
- A[l] = (T*)malloc( max_rows_per_dpu * nr_of_dpus * n_size_pad * sizeof(T));
+ A = (T **) malloc(NUM_LAYERS * sizeof(T *));
+ for (l = 0; l < NUM_LAYERS; l++)
+ A[l] =
+ (T *) malloc(max_rows_per_dpu * nr_of_dpus * n_size_pad *
+ sizeof(T));
-
- B = (T*)malloc(n_size * sizeof(T));
- B_host = (T*)malloc(n_size * sizeof(T));
- C = (T*)malloc(m_size * sizeof(T));
+ B = (T *) malloc(n_size * sizeof(T));
+ B_host = (T *) malloc(n_size * sizeof(T));
+ C = (T *) malloc(m_size * sizeof(T));
C_dpu = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
B_tmp = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
@@ -170,26 +179,36 @@ int main(int argc, char **argv) {
input_args[i].max_rows = max_rows_per_dpu;
DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
-
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+ sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
// Copy input array and vector
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, A[0] + dpu_info[i].prev_rows_dpu * n_size));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu,
+ A[0] + dpu_info[i].prev_rows_dpu * n_size));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ DPU_XFER_DEFAULT));
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, B));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
if (rep >= p.n_warmup)
stop(&timer, 1);
// Run kernel on DPUs
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
start(&timer, 2, rep - p.n_warmup);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
@@ -198,31 +217,38 @@ int main(int argc, char **argv) {
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
stop(&timer, 2);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
#endif
}
- for(int lay = 1; lay < NUM_LAYERS; lay++){
+ for (int lay = 1; lay < NUM_LAYERS; lay++) {
if (rep >= p.n_warmup)
start(&timer, 4, rep - p.n_warmup);
i = 0;
// Copy C_dpu
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, C_dpu + i * max_rows_per_dpu));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T),
+ max_rows_per_dpu * sizeof(T),
+ DPU_XFER_DEFAULT));
// B = C
unsigned int n, j;
i = 0;
for (n = 0; n < nr_of_dpus; n++) {
for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
- B_tmp[i] = C_dpu[n * max_rows_per_dpu + j];
+ B_tmp[i] =
+ C_dpu[n * max_rows_per_dpu + j];
i++;
}
}
@@ -230,20 +256,31 @@ int main(int argc, char **argv) {
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, B_tmp));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
// Copy next matrix of weights
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, A[lay] + dpu_info[i].prev_rows_dpu * n_size));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu,
+ A[lay] +
+ dpu_info[i].prev_rows_dpu *
+ n_size));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup)
+ if (rep >= p.n_warmup)
stop(&timer, 4);
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
start(&timer, 2, rep - p.n_warmup);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
@@ -252,8 +289,7 @@ int main(int argc, char **argv) {
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
stop(&timer, 2);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
@@ -273,16 +309,23 @@ int main(int argc, char **argv) {
start(&timer, 3, rep - p.n_warmup);
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, C_dpu + i * max_rows_per_dpu));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup)
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T),
+ max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
+ if (rep >= p.n_warmup)
stop(&timer, 3);
}
#if ENERGY
double acc_energy, avg_energy, acc_time, avg_time;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+ DPU_ASSERT(dpu_probe_get
+ (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
@@ -311,23 +354,26 @@ int main(int argc, char **argv) {
i = 0;
for (n = 0; n < nr_of_dpus; n++) {
for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
- if(C[i] != C_dpu[n * max_rows_per_dpu + j]) {
+ if (C[i] != C_dpu[n * max_rows_per_dpu + j]) {
status = false;
#if PRINT
- printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
+ printf("%d: %d -- %d\n", i, C[i],
+ C_dpu[n * max_rows_per_dpu + j]);
#endif
}
i++;
}
}
if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+ "] Outputs are equal\n");
} else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+ "] Outputs differ!\n");
}
// Deallocation
- for(i = 0; i < NUM_LAYERS; i++)
+ for (i = 0; i < NUM_LAYERS; i++)
free(A[i]);
free(A);
free(B);