VA: dos2unix; indent -linux

author: Birte Kristina Friesel <birte.friesel@uos.de> 2025-01-16 08:25:34 +0100
committer: Birte Kristina Friesel <birte.friesel@uos.de> 2025-01-16 08:25:34 +0100
commit: c2bf48b415e8e51d59bbec59635a02ba4e1cb4c4 (patch)
tree: c06e2048de3d6dac944f48038389790fcd014e17
parent: 0bebc23cf55adfc6a25c0d5f4fa9061ce093e0d7 (diff)
6 files changed, 767 insertions, 642 deletions
diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c
index 9290488..7975200 100644
--- a/VA/baselines/cpu/app_baseline.c
+++ b/VA/baselines/cpu/app_baseline.c
@@ -25,7 +25,7 @@
 #include <numaif.h>
 #include <numa.h>
 
-void* mp_pages[1];
+void *mp_pages[1];
 int mp_status[1];
 int mp_nodes[1];
 int numa_node_in = -1;
@@ -55,317 +55,345 @@ static T *B_local;
 /**
 * @brief compute output in the host
 */
-static void vector_addition_host(unsigned long nr_elements, int t) {
-    omp_set_num_threads(t);
-    #pragma omp parallel for
-    for (long i = 0; i < nr_elements; i++) {
+static void vector_addition_host(unsigned long nr_elements, int t)
+{
+	omp_set_num_threads(t);
+#pragma omp parallel for
+	for (long i = 0; i < nr_elements; i++) {
 #if NUMA_MEMCPY
-        C[i] = A_local[i] + B_local[i];
+		C[i] = A_local[i] + B_local[i];
 #else
-        C[i] = A[i] + B[i];
+		C[i] = A[i] + B[i];
 #endif
-    }
+	}
 }
 
 // Params ---------------------------------------------------------------------
 typedef struct Params {
-    long  input_size;
-    int   n_warmup;
-    int   n_reps;
-    int   exp;
-    int   n_threads;
+	long input_size;
+	int n_warmup;
+	int n_reps;
+	int exp;
+	int n_threads;
 #if NUMA
-    struct bitmask* bitmask_in;
-    struct bitmask* bitmask_out;
-    int numa_node_cpu;
+	struct bitmask *bitmask_in;
+	struct bitmask *bitmask_out;
+	int numa_node_cpu;
 #endif
 #if NUMA_MEMCPY
-    int numa_node_cpu_memcpy;
-    struct bitmask* bitmask_cpu;
+	int numa_node_cpu_memcpy;
+	struct bitmask *bitmask_cpu;
 #endif
-}Params;
-
-void usage() {
-    fprintf(stderr,
-        "\nUsage:  ./program [options]"
-        "\n"
-        "\nGeneral options:"
-        "\n    -h        help"
-        "\n    -t <T>    # of threads (default=8)"
-        "\n    -w <W>    # of untimed warmup iterations (default=1)"
-        "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
-        "\n"
-        "\nBenchmark-specific options:"
-        "\n    -i <I>    input size (default=8M elements)"
-        "\n");
+} Params;
+
+void usage()
+{
+	fprintf(stderr,
+		"\nUsage:  ./program [options]"
+		"\n"
+		"\nGeneral options:"
+		"\n    -h        help"
+		"\n    -t <T>    # of threads (default=8)"
+		"\n    -w <W>    # of untimed warmup iterations (default=1)"
+		"\n    -e <E>    # of timed repetition iterations (default=3)"
+		"\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
+		"\n"
+		"\nBenchmark-specific options:"
+		"\n    -i <I>    input size (default=8M elements)" "\n");
 }
 
-struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.input_size    = 16777216;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-    p.exp           = 1;
-    p.n_threads     = 5;
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p;
+	p.input_size = 16777216;
+	p.n_warmup = 1;
+	p.n_reps = 3;
+	p.exp = 1;
+	p.n_threads = 5;
 #if NUMA
-    p.bitmask_in     = NULL;
-    p.bitmask_out    = NULL;
-    p.numa_node_cpu  = -1;
+	p.bitmask_in = NULL;
+	p.bitmask_out = NULL;
+	p.numa_node_cpu = -1;
 #endif
 #if NUMA_MEMCPY
-    p.numa_node_cpu_memcpy  = -1;
-    p.bitmask_cpu    = NULL;
+	p.numa_node_cpu_memcpy = -1;
+	p.bitmask_cpu = NULL;
 #endif
 
-    int opt;
-    while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) {
-        switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'i': p.input_size    = atol(optarg); break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break;
-        case 'x': p.exp           = atoi(optarg); break;
-        case 't': p.n_threads     = atoi(optarg); break;
+	int opt;
+	while ((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'i':
+			p.input_size = atol(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 'x':
+			p.exp = atoi(optarg);
+			break;
+		case 't':
+			p.n_threads = atoi(optarg);
+			break;
 #if NUMA
-        case 'a': p.bitmask_in    = numa_parse_nodestring(optarg); break;
-        case 'b': p.bitmask_out   = numa_parse_nodestring(optarg); break;
-        case 'c': p.numa_node_cpu = atoi(optarg); break;
+		case 'a':
+			p.bitmask_in = numa_parse_nodestring(optarg);
+			break;
+		case 'b':
+			p.bitmask_out = numa_parse_nodestring(optarg);
+			break;
+		case 'c':
+			p.numa_node_cpu = atoi(optarg);
+			break;
 #if NUMA_MEMCPY
-        case 'C': p.bitmask_cpu   = numa_parse_nodestring(optarg); break;
-        case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break;
-#endif // NUMA_MEMCPY
-#endif // NUMA
-        default:
-            fprintf(stderr, "\nUnrecognized option!\n");
-            usage();
-            exit(0);
-        }
-    }
-    assert(p.n_threads > 0 && "Invalid # of ranks!");
-
-    return p;
+		case 'C':
+			p.bitmask_cpu = numa_parse_nodestring(optarg);
+			break;
+		case 'M':
+			p.numa_node_cpu_memcpy = atoi(optarg);
+			break;
+#endif				// NUMA_MEMCPY
+#endif				// NUMA
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);
+		}
+	}
+	assert(p.n_threads > 0 && "Invalid # of ranks!");
+
+	return p;
 }
 
 /**
 * @brief Main of the Host Application.
 */
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
-    struct Params p = input_params(argc, argv);
+	struct Params p = input_params(argc, argv);
 
-    const unsigned long input_size = p.exp == 0 ? p.input_size * p.n_threads : p.input_size;
+	const unsigned long input_size =
+	    p.exp == 0 ? p.input_size * p.n_threads : p.input_size;
 
-    // Create an input file with arbitrary data.
+	// Create an input file with arbitrary data.
     /**
     * @brief creates a "test file" by filling a buffer of 64MB with pseudo-random values
     * @param nr_elements how many 32-bit elements we want the file to be
     * @return the buffer address
     */
-    srand(0);
+	srand(0);
 
 #if NUMA
-    if (p.bitmask_in) {
-        numa_set_membind(p.bitmask_in);
-        numa_free_nodemask(p.bitmask_in);
-    }
-    A = (T*) numa_alloc(input_size * sizeof(T));
-    B = (T*) numa_alloc(input_size * sizeof(T));
+	if (p.bitmask_in) {
+		numa_set_membind(p.bitmask_in);
+		numa_free_nodemask(p.bitmask_in);
+	}
+	A = (T *) numa_alloc(input_size * sizeof(T));
+	B = (T *) numa_alloc(input_size * sizeof(T));
 #else
-    A = (T*) malloc(input_size * sizeof(T));
-    B = (T*) malloc(input_size * sizeof(T));
+	A = (T *) malloc(input_size * sizeof(T));
+	B = (T *) malloc(input_size * sizeof(T));
 #endif
 
 #if NUMA
-    if (p.bitmask_out) {
-        numa_set_membind(p.bitmask_out);
-        numa_free_nodemask(p.bitmask_out);
-    }
-    C = (T*) numa_alloc(input_size * sizeof(T));
+	if (p.bitmask_out) {
+		numa_set_membind(p.bitmask_out);
+		numa_free_nodemask(p.bitmask_out);
+	}
+	C = (T *) numa_alloc(input_size * sizeof(T));
 #else
-    C = (T*) malloc(input_size * sizeof(T));
+	C = (T *) malloc(input_size * sizeof(T));
 #endif
 
-    for (unsigned long i = 0; i < input_size; i++) {
-        A[i] = (T) (rand());
-        B[i] = (T) (rand());
-    }
+	for (unsigned long i = 0; i < input_size; i++) {
+		A[i] = (T) (rand());
+		B[i] = (T) (rand());
+	}
 
 #if NUMA
 #if NUMA_MEMCPY
-    if (p.bitmask_cpu) {
-        numa_set_membind(p.bitmask_cpu);
-        numa_free_nodemask(p.bitmask_cpu);
-    }
+	if (p.bitmask_cpu) {
+		numa_set_membind(p.bitmask_cpu);
+		numa_free_nodemask(p.bitmask_cpu);
+	}
 #else
-    struct bitmask *bitmask_all = numa_allocate_nodemask();
-    numa_bitmask_setall(bitmask_all);
-    numa_set_membind(bitmask_all);
-    numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
-#endif // NUMA
+	struct bitmask *bitmask_all = numa_allocate_nodemask();
+	numa_bitmask_setall(bitmask_all);
+	numa_set_membind(bitmask_all);
+	numa_free_nodemask(bitmask_all);
+#endif				// NUMA_MEMCPY
+#endif				// NUMA
 
 #if NUMA
-    mp_pages[0] = A;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(A)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_in = mp_status[0];
-    }
-
-    mp_pages[0] = C;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(C)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_out = mp_status[0];
-    }
-
-    numa_node_cpu = p.numa_node_cpu;
-    if (p.numa_node_cpu != -1) {
-        if (numa_run_on_node(p.numa_node_cpu) == -1) {
-            perror("numa_run_on_node");
-            numa_node_cpu = -1;
-        }
-    }
+	mp_pages[0] = A;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(A)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_in = mp_status[0];
+	}
+
+	mp_pages[0] = C;
+	if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+		perror("move_pages(C)");
+	} else if (mp_status[0] < 0) {
+		printf("move_pages error: %d", mp_status[0]);
+	} else {
+		numa_node_out = mp_status[0];
+	}
+
+	numa_node_cpu = p.numa_node_cpu;
+	if (p.numa_node_cpu != -1) {
+		if (numa_run_on_node(p.numa_node_cpu) == -1) {
+			perror("numa_run_on_node");
+			numa_node_cpu = -1;
+		}
+	}
 #endif
 
 #if NUMA_MEMCPY
-    numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+	numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+				 || (numa_node_cpu + 8 == numa_node_in)) * 1;
 #endif
 
 #if WITH_BENCHMARK
-    Timer timer;
+	Timer timer;
 #endif
 
 #if NOP_SYNC
-    for(int rep = 0; rep < 200000; rep++) {
-        asm volatile("nop" ::);
-    }
+	for (int rep = 0; rep < 200000; rep++) {
+		asm volatile ("nop"::);
+	}
 #endif
 
-    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+	for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 
 #if NUMA_MEMCPY
-        numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
-        start(&timer, 1, 0);
-        if (!numa_node_in_is_local) {
-            A_local = (T*) numa_alloc(input_size * sizeof(T));
-            B_local = (T*) numa_alloc(input_size * sizeof(T));
-        }
-        stop(&timer, 1);
-        if (!numa_node_in_is_local) {
-            if (p.numa_node_cpu_memcpy != -1) {
-                if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
-                    perror("numa_run_on_node");
-                    numa_node_cpu_memcpy = -1;
-                }
-            }
-        }
-        start(&timer, 2, 0);
-        if (!numa_node_in_is_local) {
-            memcpy(A_local, A, input_size * sizeof(T));
-            memcpy(B_local, B, input_size * sizeof(T));
-        } else {
-            A_local = A;
-            B_local = B;
-        }
-        stop(&timer, 2);
-        if (p.numa_node_cpu != -1) {
-            if (numa_run_on_node(p.numa_node_cpu) == -1) {
-                perror("numa_run_on_node");
-                numa_node_cpu = -1;
-            }
-        }
-        mp_pages[0] = A_local;
-        if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-            perror("move_pages(A_local)");
-        }
-        else if (mp_status[0] < 0) {
-            printf("move_pages error: %d", mp_status[0]);
-        }
-        else {
-            numa_node_local = mp_status[0];
-        }
+		numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
+		start(&timer, 1, 0);
+		if (!numa_node_in_is_local) {
+			A_local = (T *) numa_alloc(input_size * sizeof(T));
+			B_local = (T *) numa_alloc(input_size * sizeof(T));
+		}
+		stop(&timer, 1);
+		if (!numa_node_in_is_local) {
+			if (p.numa_node_cpu_memcpy != -1) {
+				if (numa_run_on_node(p.numa_node_cpu_memcpy) ==
+				    -1) {
+					perror("numa_run_on_node");
+					numa_node_cpu_memcpy = -1;
+				}
+			}
+		}
+		start(&timer, 2, 0);
+		if (!numa_node_in_is_local) {
+			memcpy(A_local, A, input_size * sizeof(T));
+			memcpy(B_local, B, input_size * sizeof(T));
+		} else {
+			A_local = A;
+			B_local = B;
+		}
+		stop(&timer, 2);
+		if (p.numa_node_cpu != -1) {
+			if (numa_run_on_node(p.numa_node_cpu) == -1) {
+				perror("numa_run_on_node");
+				numa_node_cpu = -1;
+			}
+		}
+		mp_pages[0] = A_local;
+		if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+			perror("move_pages(A_local)");
+		} else if (mp_status[0] < 0) {
+			printf("move_pages error: %d", mp_status[0]);
+		} else {
+			numa_node_local = mp_status[0];
+		}
 #endif
 
-        start(&timer, 0, 0);
-        vector_addition_host(input_size, p.n_threads);
-        stop(&timer, 0);
+		start(&timer, 0, 0);
+		vector_addition_host(input_size, p.n_threads);
+		stop(&timer, 0);
 
 #if NUMA_MEMCPY
-        start(&timer, 3, 0);
-        if (!numa_node_in_is_local) {
-            numa_free(A_local, input_size * sizeof(T));
-            numa_free(B_local, input_size * sizeof(T));
-        }
-        stop(&timer, 3);
+		start(&timer, 3, 0);
+		if (!numa_node_in_is_local) {
+			numa_free(A_local, input_size * sizeof(T));
+			numa_free(B_local, input_size * sizeof(T));
+		}
+		stop(&timer, 3);
 #endif
 
 #if WITH_BENCHMARK
-        unsigned int nr_threads = 0;
+		unsigned int nr_threads = 0;
 #pragma omp parallel
 #pragma omp atomic
-        nr_threads++;
+		nr_threads++;
 
-        if (rep >= p.n_warmup) {
+		if (rep >= p.n_warmup) {
 #if NUMA_MEMCPY
-            printf("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
-                " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
-                " | throughput_MBps=%f",
-                nr_threads, XSTR(T), input_size,
-                numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
-                input_size * 3 * sizeof(T) / timer.time[0]);
-            printf(" throughput_MOpps=%f",
-                input_size / timer.time[0]);
-            printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
-                timer.time[0], timer.time[1], timer.time[2], timer.time[3],
-                timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+			printf
+			    ("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
+			     " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+			     " | throughput_MBps=%f", nr_threads, XSTR(T),
+			     input_size, numa_node_in, numa_node_local,
+			     numa_node_out, numa_node_cpu, numa_node_cpu_memcpy,
+			     numa_distance(numa_node_in, numa_node_cpu),
+			     numa_distance(numa_node_cpu, numa_node_out),
+			     input_size * 3 * sizeof(T) / timer.time[0]);
+			printf(" throughput_MOpps=%f",
+			       input_size / timer.time[0]);
+			printf
+			    (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+			     timer.time[0], timer.time[1], timer.time[2],
+			     timer.time[3],
+			     timer.time[0] + timer.time[1] + timer.time[2] +
+			     timer.time[3]);
 #else
-            printf("[::] VA-CPU | n_threads=%d e_type=%s n_elements=%ld"
+			printf
+			    ("[::] VA-CPU | n_threads=%d e_type=%s n_elements=%ld"
 #if NUMA
-                " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+			     " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
 #endif
-                " | throughput_MBps=%f",
-                nr_threads, XSTR(T), input_size,
+			     " | throughput_MBps=%f",
+			     nr_threads, XSTR(T), input_size,
 #if NUMA
-                numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+			     numa_node_in, numa_node_out, numa_node_cpu,
+			     numa_distance(numa_node_in, numa_node_cpu),
+			     numa_distance(numa_node_cpu, numa_node_out),
 #endif
-                input_size * 3 * sizeof(T) / timer.time[0]);
-            printf(" throughput_MOpps=%f",
-                input_size / timer.time[0]);
-            printf(" latency_us=%f\n",
-                timer.time[0]);
-#endif // NUMA_MEMCPY
-        }
-#endif // WITH_BENCHMARK
-    }
+			     input_size * 3 * sizeof(T) / timer.time[0]);
+			printf(" throughput_MOpps=%f",
+			       input_size / timer.time[0]);
+			printf(" latency_us=%f\n", timer.time[0]);
+#endif				// NUMA_MEMCPY
+		}
+#endif				// WITH_BENCHMARK
+	}
 
 #if NOP_SYNC
-    for(int rep = 0; rep < 200000; rep++) {
-        asm volatile("nop" ::);
-    }
+	for (int rep = 0; rep < 200000; rep++) {
+		asm volatile ("nop"::);
+	}
 #endif
 
 #if NUMA
-    numa_free(A, input_size * sizeof(T));
-    numa_free(B, input_size * sizeof(T));
-    numa_free(C, input_size * sizeof(T));
+	numa_free(A, input_size * sizeof(T));
+	numa_free(B, input_size * sizeof(T));
+	numa_free(C, input_size * sizeof(T));
 #else
-    free(A);
-    free(B);
-    free(C);
+	free(A);
+	free(B);
+	free(C);
 #endif
 
-   return 0;
- }
+	return 0;
+}
diff --git a/VA/dpu/task.c b/VA/dpu/task.c
index bb41303..9622911 100644
--- a/VA/dpu/task.c
+++ b/VA/dpu/task.c
@@ -15,10 +15,11 @@
 __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
 
 // vector_addition: Computes the vector addition of a cached block 
-static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size) {
-    for (unsigned int i = 0; i < l_size; i++){
-        bufferB[i] += bufferA[i];
-    }
+static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size)
+{
+	for (unsigned int i = 0; i < l_size; i++) {
+		bufferB[i] += bufferA[i];
+	}
 }
 
 // Barrier
@@ -26,53 +27,67 @@ BARRIER_INIT(my_barrier, NR_TASKLETS);
 
 extern int main_kernel1(void);
 
-int (*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
 
-int main(void) { 
-    // Kernel
-    return kernels[DPU_INPUT_ARGUMENTS.kernel](); 
+int main(void)
+{
+	// Kernel
+	return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
 }
 
 // main_kernel1
-int main_kernel1() {
-    unsigned int tasklet_id = me();
+int main_kernel1()
+{
+	unsigned int tasklet_id = me();
 #if PRINT
-    printf("tasklet_id = %u\n", tasklet_id);
+	printf("tasklet_id = %u\n", tasklet_id);
 #endif
-    if (tasklet_id == 0){ // Initialize once the cycle counter
-        mem_reset(); // Reset the heap
-    }
-    // Barrier
-    barrier_wait(&my_barrier);
-
-    uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; // Input size per DPU in bytes
-    uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
-
-    // Address of the current processing block in MRAM
-    uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
-    uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
-    uint32_t mram_base_addr_B = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
-
-    // Initialize a local cache to store the MRAM block
-    T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
-    T *cache_B = (T *) mem_alloc(BLOCK_SIZE);
-
-    for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){
-
-        // Bound checking
-        uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;
-
-        // Load cache with current MRAM block
-        mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);
-        mram_read((__mram_ptr void const*)(mram_base_addr_B + byte_index), cache_B, l_size_bytes);
-
-        // Computer vector addition
-        vector_addition(cache_B, cache_A, l_size_bytes >> DIV);
-
-        // Write cache to current MRAM block
-        mram_write(cache_B, (__mram_ptr void*)(mram_base_addr_B + byte_index), l_size_bytes);
-
-    }
-
-    return 0;
+	if (tasklet_id == 0) {	// Initialize once the cycle counter
+		mem_reset();	// Reset the heap
+	}
+	// Barrier
+	barrier_wait(&my_barrier);
+
+	uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;	// Input size per DPU in bytes
+	uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size;	// Transfer input size per DPU in bytes
+
+	// Address of the current processing block in MRAM
+	uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
+	uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
+	uint32_t mram_base_addr_B =
+	    (uint32_t) (DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
+
+	// Initialize a local cache to store the MRAM block
+	T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
+	T *cache_B = (T *) mem_alloc(BLOCK_SIZE);
+
+	for (unsigned int byte_index = base_tasklet;
+	     byte_index < input_size_dpu_bytes;
+	     byte_index += BLOCK_SIZE * NR_TASKLETS) {
+
+		// Bound checking
+		uint32_t l_size_bytes =
+		    (byte_index + BLOCK_SIZE >=
+		     input_size_dpu_bytes) ? (input_size_dpu_bytes -
+					      byte_index) : BLOCK_SIZE;
+
+		// Load cache with current MRAM block
+		mram_read((__mram_ptr void const *)(mram_base_addr_A +
+						    byte_index), cache_A,
+			  l_size_bytes);
+		mram_read((__mram_ptr void const *)(mram_base_addr_B +
+						    byte_index), cache_B,
+			  l_size_bytes);
+
+		// Compute vector addition
+		vector_addition(cache_B, cache_A, l_size_bytes >> DIV);
+
+		// Write cache to current MRAM block
+		mram_write(cache_B,
+			   (__mram_ptr void *)(mram_base_addr_B + byte_index),
+			   l_size_bytes);
+
+	}
+
+	return 0;
 }
diff --git a/VA/host/app.c b/VA/host/app.c
index 5fe3f61..1a2cdfd 100644
--- a/VA/host/app.c
+++ b/VA/host/app.c
@@ -33,296 +33,361 @@
 #include <dpu_target_macros.h>
 
 // Pointer declaration
-static T* A;
-static T* B;
-static T* C;
-static T* C2;
+static T *A;
+static T *B;
+static T *C;
+static T *C2;
 
 // Create input arrays
-static void read_input(T* A, T* B, unsigned int nr_elements) {
-    srand(0);
-    for (unsigned int i = 0; i < nr_elements; i++) {
-        A[i] = (T) (rand());
-        B[i] = (T) (rand());
-    }
+static void read_input(T *A, T *B, unsigned int nr_elements)
+{
+	srand(0);
+	for (unsigned int i = 0; i < nr_elements; i++) {
+		A[i] = (T) (rand());
+		B[i] = (T) (rand());
+	}
 }
 
 // Compute output in the host
-static void vector_addition_host(T* C, T* A, T* B, unsigned int nr_elements) {
-    for (unsigned int i = 0; i < nr_elements; i++) {
-        C[i] = A[i] + B[i];
-    }
+static void vector_addition_host(T *C, T *A, T *B, unsigned int nr_elements)
+{
+	for (unsigned int i = 0; i < nr_elements; i++) {
+		C[i] = A[i] + B[i];
+	}
 }
 
 // Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
-    struct Params p = input_params(argc, argv);
+	struct Params p = input_params(argc, argv);
 
-    struct dpu_set_t dpu_set, dpu;
-    uint32_t nr_of_dpus;
-    uint32_t nr_of_ranks;
+	struct dpu_set_t dpu_set, dpu;
+	uint32_t nr_of_dpus;
+	uint32_t nr_of_ranks;
 
 #if ENERGY
-    struct dpu_probe_t probe;
-    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+	struct dpu_probe_t probe;
+	DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
 #endif
 
-    // Timer declaration
-    Timer timer;
+	// Timer declaration
+	Timer timer;
 
-    int numa_node_rank = -2;
+	int numa_node_rank = -2;
 
-    // Allocate DPUs and load binary
+	// Allocate DPUs and load binary
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-    timer.time[0] = 0; // alloc
+	DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+	timer.time[0] = 0;	// alloc
 #endif
 #if !WITH_LOAD_OVERHEAD
-    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-    DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-    assert(nr_of_dpus == NR_DPUS);
-    timer.time[1] = 0; // load
+	DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+	DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+	DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+	assert(nr_of_dpus == NR_DPUS);
+	timer.time[1] = 0;	// load
 #endif
 #if !WITH_FREE_OVERHEAD
-    timer.time[6] = 0; // free
+	timer.time[6] = 0;	// free
 #endif
 
-    unsigned int i = 0;
-    const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size;
-    const unsigned int input_size_8bytes = 
-        ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
-    const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
-    const unsigned int input_size_dpu_8bytes = 
-        ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
-
-    // Input/output allocation
-    A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
-    B = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
-    C = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
-    C2 = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
-    T *bufferA = A;
-    T *bufferB = B;
-    T *bufferC = C2;
-
-    // Create an input file with arbitrary data
-    read_input(A, B, input_size);
-
-    // Loop over main kernel
-    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+	unsigned int i = 0;
+	const unsigned int input_size =
+	    p.exp == 0 ? p.input_size * NR_DPUS : p.input_size;
+	const unsigned int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size;	// Total input size (all DPUs), 8-byte aligned
+	const unsigned int input_size_dpu = divceil(input_size, NR_DPUS);	// Input size per DPU (max.)
+	const unsigned int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu;	// Input size per DPU (max.), 8-byte aligned
+
+	// Input/output allocation
+	A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+	B = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+	C = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+	C2 = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+	T *bufferA = A;
+	T *bufferB = B;
+	T *bufferC = C2;
+
+	// Create an input file with arbitrary data
+	read_input(A, B, input_size);
+
+	// Loop over main kernel
+	for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
 
 #if WITH_ALLOC_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 0, 0);
-        }
-        DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 0);
-        }
+		if (rep >= p.n_warmup) {
+			start(&timer, 0, 0);
+		}
+		DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 0);
+		}
 #endif
 #if WITH_DPUINFO
-        printf("DPUs:");
-        DPU_FOREACH (dpu_set, dpu) {
-            int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
-            int slice = dpu_get_slice_id(dpu_from_set(dpu));
-            int member = dpu_get_member_id(dpu_from_set(dpu));
-            printf(" %d(%d.%d)", rank, slice, member);
-        }
-        printf("\n");
+		printf("DPUs:");
+		DPU_FOREACH(dpu_set, dpu) {
+			int rank =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
+			int slice = dpu_get_slice_id(dpu_from_set(dpu));
+			int member = dpu_get_member_id(dpu_from_set(dpu));
+			printf(" %d(%d.%d)", rank, slice, member);
+		}
+		printf("\n");
 #endif
 #if WITH_LOAD_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 1, 0);
-        }
-        DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 1);
-        }
-        DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
-        DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
-        assert(nr_of_dpus == NR_DPUS);
+		if (rep >= p.n_warmup) {
+			start(&timer, 1, 0);
+		}
+		DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 1);
+		}
+		DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+		DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+		assert(nr_of_dpus == NR_DPUS);
 #endif
 
-        // int prev_rank_id = -1;
-        int rank_id = -1;
-        DPU_FOREACH (dpu_set, dpu) {
-            rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
-            if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
-                numa_node_rank = -1;
-            } else {
-                numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
-            }
-            /*
-            if (rank_id != prev_rank_id) {
-                printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
-                prev_rank_id = rank_id;
-            }
-            */
-        }
-
-
-        // Compute output on CPU (performance comparison and verification purposes)
-        if(rep >= p.n_warmup) {
-            start(&timer, 2, 0);
-        }
-        vector_addition_host(C, A, B, input_size);
-        if(rep >= p.n_warmup) {
-            stop(&timer, 2);
-        }
-
-        if(rep >= p.n_warmup) {
-            start(&timer, 3, 0);
-        }
-        // Input arguments
-        unsigned int kernel = 0;
-        dpu_arguments_t input_arguments[NR_DPUS];
-        for(i=0; i<nr_of_dpus-1; i++) {
-            input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); 
-            input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T); 
-            input_arguments[i].kernel=kernel;
-        }
-        input_arguments[nr_of_dpus-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); 
-        input_arguments[nr_of_dpus-1].transfer_size=input_size_dpu_8bytes * sizeof(T); 
-        input_arguments[nr_of_dpus-1].kernel=kernel;
-
-        // Copy input arrays
-        i = 0;
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
-
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
- 
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu_8bytes * i));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 3);
-        }
-
-        // Run DPU kernel
-        if(rep >= p.n_warmup) {
-            start(&timer, 4, 0);
-            #if ENERGY
-            DPU_ASSERT(dpu_probe_start(&probe));
-            #endif
-        }
-        DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 4);
-            #if ENERGY
-            DPU_ASSERT(dpu_probe_stop(&probe));
-            #endif
-        }
-
+		// int prev_rank_id = -1;
+		int rank_id = -1;
+		DPU_FOREACH(dpu_set, dpu) {
+			rank_id =
+			    dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+			    DPU_TARGET_MASK;
+			if ((numa_node_rank != -2)
+			    && numa_node_rank !=
+			    dpu_get_rank_numa_node(dpu_get_rank
+						   (dpu_from_set(dpu)))) {
+				numa_node_rank = -1;
+			} else {
+				numa_node_rank =
+				    dpu_get_rank_numa_node(dpu_get_rank
+							   (dpu_from_set(dpu)));
+			}
+			/*
+			   if (rank_id != prev_rank_id) {
+			   printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+			   prev_rank_id = rank_id;
+			   }
+			 */
+		}
+
+		// Compute output on CPU (performance comparison and verification purposes)
+		if (rep >= p.n_warmup) {
+			start(&timer, 2, 0);
+		}
+		vector_addition_host(C, A, B, input_size);
+		if (rep >= p.n_warmup) {
+			stop(&timer, 2);
+		}
+
+		if (rep >= p.n_warmup) {
+			start(&timer, 3, 0);
+		}
+		// Input arguments
+		unsigned int kernel = 0;
+		dpu_arguments_t input_arguments[NR_DPUS];
+		for (i = 0; i < nr_of_dpus - 1; i++) {
+			input_arguments[i].size =
+			    input_size_dpu_8bytes * sizeof(T);
+			input_arguments[i].transfer_size =
+			    input_size_dpu_8bytes * sizeof(T);
+			input_arguments[i].kernel = kernel;
+		}
+		input_arguments[nr_of_dpus - 1].size =
+		    (input_size_8bytes -
+		     input_size_dpu_8bytes * (NR_DPUS - 1)) * sizeof(T);
+		input_arguments[nr_of_dpus - 1].transfer_size =
+		    input_size_dpu_8bytes * sizeof(T);
+		input_arguments[nr_of_dpus - 1].kernel = kernel;
+
+		// Copy input arrays
+		i = 0;
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+			    sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
+
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferA + input_size_dpu_8bytes * i));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME, 0,
+			    input_size_dpu_8bytes * sizeof(T),
+			    DPU_XFER_DEFAULT));
+
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferB + input_size_dpu_8bytes * i));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_TO_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    input_size_dpu_8bytes * sizeof(T),
+			    input_size_dpu_8bytes * sizeof(T),
+			    DPU_XFER_DEFAULT));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 3);
+		}
+		// Run DPU kernel
+		if (rep >= p.n_warmup) {
+			start(&timer, 4, 0);
+#if ENERGY
+			DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+		}
+		DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 4);
+#if ENERGY
+			DPU_ASSERT(dpu_probe_stop(&probe));
+#endif
+		}
 #if PRINT
-        {
-            unsigned int each_dpu = 0;
-            printf("Display DPU Logs\n");
-            DPU_FOREACH (dpu_set, dpu) {
-                printf("DPU#%d:\n", each_dpu);
-                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
-                each_dpu++;
-            }
-        }
+		{
+			unsigned int each_dpu = 0;
+			printf("Display DPU Logs\n");
+			DPU_FOREACH(dpu_set, dpu) {
+				printf("DPU#%d:\n", each_dpu);
+				DPU_ASSERT(dpulog_read_for_dpu
+					   (dpu.dpu, stdout));
+				each_dpu++;
+			}
+		}
 #endif
 
-        if(rep >= p.n_warmup) {
-            start(&timer, 5, 0);
-        }
-        i = 0;
-        // PARALLEL RETRIEVE TRANSFER
-        DPU_FOREACH(dpu_set, dpu, i) {
-            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferC + input_size_dpu_8bytes * i));
-        }
-        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
-        if(rep >= p.n_warmup) {
-            stop(&timer, 5);
-        }
-
+		if (rep >= p.n_warmup) {
+			start(&timer, 5, 0);
+		}
+		i = 0;
+		// PARALLEL RETRIEVE TRANSFER
+		DPU_FOREACH(dpu_set, dpu, i) {
+			DPU_ASSERT(dpu_prepare_xfer
+				   (dpu, bufferC + input_size_dpu_8bytes * i));
+		}
+		DPU_ASSERT(dpu_push_xfer
+			   (dpu_set, DPU_XFER_FROM_DPU,
+			    DPU_MRAM_HEAP_POINTER_NAME,
+			    input_size_dpu_8bytes * sizeof(T),
+			    input_size_dpu_8bytes * sizeof(T),
+			    DPU_XFER_DEFAULT));
+		if (rep >= p.n_warmup) {
+			stop(&timer, 5);
+		}
 #if WITH_ALLOC_OVERHEAD
 #if WITH_FREE_OVERHEAD
-        if(rep >= p.n_warmup) {
-            start(&timer, 6, 0);
-        }
+		if (rep >= p.n_warmup) {
+			start(&timer, 6, 0);
+		}
 #endif
-        DPU_ASSERT(dpu_free(dpu_set));
+		DPU_ASSERT(dpu_free(dpu_set));
 #if WITH_FREE_OVERHEAD
-        if(rep >= p.n_warmup) {
-            stop(&timer, 6);
-        }
+		if (rep >= p.n_warmup) {
+			stop(&timer, 6);
+		}
 #endif
 #endif
 
-        // Check output
-        bool status = true;
-        for (i = 0; i < input_size; i++) {
-            if(C[i] != bufferC[i]){ 
-                status = false;
+		// Check output
+		bool status = true;
+		for (i = 0; i < input_size; i++) {
+			if (C[i] != bufferC[i]) {
+				status = false;
 #if PRINT
-                printf("%d: %u -- %u\n", i, C[i], bufferC[i]);
+				printf("%d: %u -- %u\n", i, C[i], bufferC[i]);
 #endif
-            }
-        }
-        if (status) {
-            printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
-            if (rep >= p.n_warmup) {
-                printf("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d",
-                    nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size, input_size / NR_DPUS);
-                printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
-                    WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
-                printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
-                    timer.time[0],
-                    timer.time[1],
-                    timer.time[2],
-                    timer.time[3],
-                    timer.time[4],
-                    timer.time[5],
-                    timer.time[6]);
-                printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
-                    input_size * 3 * sizeof(T) / timer.time[2],
-                    input_size * 3 * sizeof(T) / (timer.time[4]),
-                    input_size * 3 * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-                printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
-                    input_size * 3 * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size * 3 * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size * 3 * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-                printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
-                    input_size / timer.time[2],
-                    input_size / (timer.time[4]),
-                    input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
-                printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
-                    input_size / (timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
-                    input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
-            }
-        } else {
-            printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
-        }
-    }
+			}
+		}
+		if (status) {
+			printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+			       "] Outputs are equal\n");
+			if (rep >= p.n_warmup) {
+				printf
+				    ("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d",
+				     nr_of_dpus, nr_of_ranks, NR_TASKLETS,
+				     XSTR(T), BLOCK_SIZE, input_size,
+				     input_size / NR_DPUS);
+				printf
+				    (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+				     WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+				     WITH_FREE_OVERHEAD, numa_node_rank);
+				printf
+				    ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+				     timer.time[0], timer.time[1],
+				     timer.time[2], timer.time[3],
+				     timer.time[4], timer.time[5],
+				     timer.time[6]);
+				printf
+				    (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+				     input_size * 3 * sizeof(T) / timer.time[2],
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[4]),
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[0] + timer.time[1] +
+				      timer.time[3] + timer.time[4] +
+				      timer.time[5] + timer.time[6]));
+				printf
+				    (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[3] + timer.time[4] +
+				      timer.time[5]),
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[1] + timer.time[3] +
+				      timer.time[4] + timer.time[5]),
+				     input_size * 3 * sizeof(T) /
+				     (timer.time[0] + timer.time[1] +
+				      timer.time[3] + timer.time[4] +
+				      timer.time[5]));
+				printf
+				    (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+				     input_size / timer.time[2],
+				     input_size / (timer.time[4]),
+				     input_size / (timer.time[0] +
+						   timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5] +
+						   timer.time[6]));
+				printf
+				    (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+				     input_size / (timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]),
+				     input_size / (timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]),
+				     input_size / (timer.time[0] +
+						   timer.time[1] +
+						   timer.time[3] +
+						   timer.time[4] +
+						   timer.time[5]));
+			}
+		} else {
+			printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+			       "] Outputs differ!\n");
+		}
+	}
 
 #if ENERGY
-    double energy;
-    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
-    printf("DPU Energy (J): %f\t", energy);
-#endif	
-
+	double energy;
+	DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+	printf("DPU Energy (J): %f\t", energy);
+#endif
 
-    // Deallocation
-    free(A);
-    free(B);
-    free(C);
-    free(C2);
+	// Deallocation
+	free(A);
+	free(B);
+	free(C);
+	free(C2);
 
 #if !WITH_ALLOC_OVERHEAD
-    DPU_ASSERT(dpu_free(dpu_set));
+	DPU_ASSERT(dpu_free(dpu_set));
 #endif
-	
-    return 0;
+
+	return 0;
 }
diff --git a/VA/support/common.h b/VA/support/common.h
index c1043fd..cee09e2 100755
--- a/VA/support/common.h
+++ b/VA/support/common.h
@@ -3,11 +3,11 @@
 
 // Structures used by both the host and the dpu to communicate information
 typedef struct {
-    uint32_t size;
-    uint32_t transfer_size;
+	uint32_t size;		/* byte count for this DPU; the host gives the last DPU the remainder */
+	uint32_t transfer_size;	/* host sets this to the per-DPU transfer length in bytes */
 	enum kernels {
-	    kernel1 = 0,
-	    nr_kernels = 1,
+		kernel1 = 0,	/* the only kernel id; the host code in this patch passes 0 */
+		nr_kernels = 1,	/* number of kernel ids */
 	} kernel;
 } dpu_arguments_t;
 
@@ -24,34 +24,34 @@ typedef struct {
 // Data type
 #ifdef UINT32
 #define T uint32_t
-#define DIV 2 // Shift right to divide by sizeof(T)
+#define DIV 2			// Shift right to divide by sizeof(T)
 #elif UINT64
 #define T uint64_t
-#define DIV 3 // Shift right to divide by sizeof(T)
+#define DIV 3			// Shift right to divide by sizeof(T)
 #elif INT32
 #define T int32_t
-#define DIV 2 // Shift right to divide by sizeof(T)
+#define DIV 2			// Shift right to divide by sizeof(T)
 #elif INT64
 #define T int64_t
-#define DIV 3 // Shift right to divide by sizeof(T)
+#define DIV 3			// Shift right to divide by sizeof(T)
 #elif FLOAT
 #define T float
-#define DIV 2 // Shift right to divide by sizeof(T)
+#define DIV 2			// Shift right to divide by sizeof(T)
 #elif DOUBLE
 #define T double
-#define DIV 3 // Shift right to divide by sizeof(T)
+#define DIV 3			// Shift right to divide by sizeof(T)
 #elif CHAR
 #define T char
-#define DIV 0 // Shift right to divide by sizeof(T)
+#define DIV 0			// Shift right to divide by sizeof(T)
 #elif SHORT
 #define T short
-#define DIV 1 // Shift right to divide by sizeof(T)
+#define DIV 1			// Shift right to divide by sizeof(T)
 #endif
 
 #ifndef ENERGY
 #define ENERGY 0
 #endif
-#define PRINT 0 
+#define PRINT 0
 
 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
diff --git a/VA/support/params.h b/VA/support/params.h
index 8bd71a6..47c10ef 100644
--- a/VA/support/params.h
+++ b/VA/support/params.h
@@ -4,53 +4,62 @@
 #include "common.h"
 
 typedef struct Params {
-    unsigned int   input_size;
-    int   n_warmup;
-    int   n_reps;
-    int   exp;
-}Params;
+	unsigned int input_size;	/* -i: input size in elements */
+	int n_warmup;		/* -w: untimed warmup iterations */
+	int n_reps;		/* -e: timed repetition iterations */
+	int exp;		/* -x: weak (0) or strong (1) scaling */
+} Params;
 
-static void usage() {
-    fprintf(stderr,
-        "\nUsage:  ./program [options]"
-        "\n"
-        "\nGeneral options:"
-        "\n    -h        help"
-        "\n    -w <W>    # of untimed warmup iterations (default=1)"
-        "\n    -e <E>    # of timed repetition iterations (default=3)"
-        "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
-        "\n"
-        "\nBenchmark-specific options:"
-        "\n    -i <I>    input size (default=2621440 elements)"
-        "\n");
+/* Write the command-line option summary for this benchmark to stderr. */
+static void usage()
+{
+	fputs("\nUsage:  ./program [options]"
+	      "\n"
+	      "\nGeneral options:"
+	      "\n    -h        help"
+	      "\n    -w <W>    # of untimed warmup iterations (default=1)"
+	      "\n    -e <E>    # of timed repetition iterations (default=3)"
+	      "\n    -x <X>    Weak (0) or strong (1) scaling (default=0)"
+	      "\n"
+	      "\nBenchmark-specific options:"
+	      "\n    -i <I>    input size (default=2621440 elements)\n", stderr);
 }
 
-struct Params input_params(int argc, char **argv) {
-    struct Params p;
-    p.input_size    = 2621440;
-    p.n_warmup      = 1;
-    p.n_reps        = 3;
-    p.exp           = 0;
+/* Build the run configuration from argv; options not given keep defaults. */
+struct Params input_params(int argc, char **argv)
+{
+	struct Params cfg = {
+		.input_size = 2621440, .n_warmup = 1,
+		.n_reps = 3, .exp = 0,
+	};
 
-    int opt;
-    while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
-        switch(opt) {
-        case 'h':
-        usage();
-        exit(0);
-        break;
-        case 'i': p.input_size    = atoi(optarg); break;
-        case 'w': p.n_warmup      = atoi(optarg); break;
-        case 'e': p.n_reps        = atoi(optarg); break;
-        case 'x': p.exp           = atoi(optarg); break;
-        default:
-            fprintf(stderr, "\nUnrecognized option!\n");
-            usage();
-            exit(0);
-        }
-    }
-    assert(NR_DPUS > 0 && "Invalid # of dpus!");
+	/*
+	 * Walk the option list; getopt() leaves each argument in optarg and
+	 * returns a negative value once argv is exhausted.
+	 */
+	int flag;
+	while ((flag = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
+		if (flag == 'h') {
+			usage();
+			exit(0);
+		} else if (flag == 'i') {
+			cfg.input_size = atoi(optarg);
+		} else if (flag == 'w') {
+			cfg.n_warmup = atoi(optarg);
+		} else if (flag == 'e') {
+			cfg.n_reps = atoi(optarg);
+		} else if (flag == 'x') {
+			cfg.exp = atoi(optarg);
+		} else {
+			/* anything else is reported, but still exits with 0 */
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			exit(0);
+		}
+	}
+	/* NR_DPUS is not defined in this header; it must be positive. */
+	assert(NR_DPUS > 0 && "Invalid # of dpus!");
 
-    return p;
+	return cfg;
 }
 #endif
diff --git a/VA/support/timer.h b/VA/support/timer.h
index 4d597b9..df68334 100755
--- a/VA/support/timer.h
+++ b/VA/support/timer.h
@@ -1,66 +1,74 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by:    IMPACT Research Group
- *                  University of Cordoba and University of Illinois
- *                  http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the 
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- *      > Redistributions of source code must retain the above copyright notice,
- *        this list of conditions and the following disclaimers.
- *      > Redistributions in binary form must reproduce the above copyright
- *        notice, this list of conditions and the following disclaimers in the
- *        documentation and/or other materials provided with the distribution.
- *      > Neither the names of IMPACT Research Group, University of Cordoba, 
- *        University of Illinois nor the names of its contributors may be used 
- *        to endorse or promote products derived from this Software without 
- *        specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
-    struct timeval startTime[7];
-    struct timeval stopTime[7];
-    double         time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
-    if(rep == 0) {
-        timer->time[i] = 0.0;
-    }
-    gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
-    gettimeofday(&timer->stopTime[i], NULL);
-    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
-                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
-    for (int i = 0; i <= maxt; i++) {
-        printf(" timer%d_us=%f", i, timer->time[i]);
-    }
-    printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by:    IMPACT Research Group
+ *                  University of Cordoba and University of Illinois
+ *                  http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ *      > Redistributions of source code must retain the above copyright notice,
+ *        this list of conditions and the following disclaimers.
+ *      > Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimers in the
+ *        documentation and/or other materials provided with the distribution.
+ *      > Neither the names of IMPACT Research Group, University of Cordoba,
+ *        University of Illinois nor the names of its contributors may be used
+ *        to endorse or promote products derived from this Software without
+ *        specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer {
+	/* Seven slots, selected by the index i passed to start()/stop(). */
+	struct timeval startTime[7];
+	struct timeval stopTime[7];
+	double time[7];
+	/* time[i] sums elapsed microseconds; start() with rep == 0 clears it. */
+} Timer;
+
+/* Stamp the start of interval i; rep == 0 first zeroes the running total. */
+void start(Timer *timer, int i, int rep)
+{
+	if (!rep)
+		timer->time[i] = 0.0;
+	gettimeofday(&timer->startTime[i], NULL);
+}
+
+/* Close interval i and add its length in microseconds to time[i]. */
+void stop(Timer *timer, int i)
+{
+	struct timeval *t0 = &timer->startTime[i], *t1 = &timer->stopTime[i];
+	gettimeofday(t1, NULL);
+	timer->time[i] += (t1->tv_sec - t0->tv_sec) * 1000000.0 +
+	    (t1->tv_usec - t0->tv_usec);
+}
+
+void print(Timer *timer, int i, int REP)
+{				/* time[i] is in us: /1000 gives ms, /REP the mean */
+	printf("Time (ms): %f\t", timer->time[i] / (1000 * REP));
+}
+
+/* Print " timer<k>_us=<value>" for slots 0..maxt inclusive, then a newline. */
+void printall(Timer *timer, int maxt)
+{
+	for (int slot = 0; slot <= maxt; slot++)
+		printf(" timer%d_us=%f", slot, timer->time[slot]);
+	putchar('\n');
+}
author	Birte Kristina Friesel <birte.friesel@uos.de>	2025-01-16 08:25:34 +0100
committer	Birte Kristina Friesel <birte.friesel@uos.de>	2025-01-16 08:25:34 +0100
commit	c2bf48b415e8e51d59bbec59635a02ba4e1cb4c4 (patch)
tree	c06e2048de3d6dac944f48038389790fcd014e17
parent	0bebc23cf55adfc6a25c0d5f4fa9061ce093e0d7 (diff)