From 196683d2ec21da97764a17b5683539588715d1de Mon Sep 17 00:00:00 2001 From: Yun-Ze Li Date: Thu, 23 Dec 2021 07:50:01 +0100 Subject: bs_bug_fix: Make MRAM reads 8-byte aligned Current implementation does not guarantee that `current_mram_block_addr_A` is 8-byte aligned before using it as the start address of `mram_read`s. This commit makes `current_mram_block_addr_A` 8-byte aligned whenever we try to use it for a MRAM read by `current_mram_block_addr_A &= WORD_MASK`, which will clear the unaligned bytes. Signed-off-by: Yun-Ze Li --- BS/dpu/task.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/BS/dpu/task.c b/BS/dpu/task.c index 39a340d..45c48f5 100644 --- a/BS/dpu/task.c +++ b/BS/dpu/task.c @@ -12,6 +12,7 @@ #include #include "common.h" +#define WORD_MASK 0xfffffff8 __host dpu_arguments_t DPU_INPUT_ARGUMENTS; __host dpu_results_t DPU_RESULTS[NR_TASKLETS]; @@ -92,6 +93,7 @@ int main_kernel1() { mram_read((__mram_ptr void const *) (end_mram_block_addr_A - BLOCK_SIZE * sizeof(DTYPE)), cache_aux_B, BLOCK_SIZE); current_mram_block_addr_A = (start_mram_block_addr_A + end_mram_block_addr_A) / 2; + current_mram_block_addr_A &= WORD_MASK; while(!end) { // Load cache with current MRAM block @@ -104,6 +106,7 @@ int main_kernel1() { if(found > -1) { result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); + printf("Tasklet %d has found %lld\n", me(), result->found + 1); break; } @@ -112,6 +115,7 @@ int main_kernel1() { { end_mram_block_addr_A = current_mram_block_addr_A; current_mram_block_addr_A = (current_mram_block_addr_A + start_mram_block_addr_A) / 2; + current_mram_block_addr_A &= WORD_MASK; } // If found == -1, we need to discard left part of the input vector @@ -119,6 +123,7 @@ int main_kernel1() { { start_mram_block_addr_A = current_mram_block_addr_A; current_mram_block_addr_A = (current_mram_block_addr_A + end_mram_block_addr_A) / 2; + current_mram_block_addr_A &= WORD_MASK; } // Start boundary check @@ 
-132,7 +137,12 @@ int main_kernel1() { { end = true; result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); + printf("Tasklet %d has found %lld\n", me(), result->found + 1); } + else + { + printf("%lld NOT found\n", searching_for); + } } // End boundary check @@ -145,7 +155,12 @@ int main_kernel1() { if(found > -1) { result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); + printf("Tasklet %d has found %lld\n", me(), result->found + 1); } + else + { + printf("%lld NOT found\n", searching_for); + } } } } -- cgit v1.2.3 From 875271995b12c00b24f38124b2b19e3fc94d2057 Mon Sep 17 00:00:00 2001 From: Yun-Ze Li Date: Thu, 23 Dec 2021 09:05:17 +0100 Subject: bs_bug_fix: Modify boundary case handlings in BS Current boundary case handling may discard some numbers that must be compared with `searching_for`, which in my opinion can result in false negatives (BS fails to identify the number which is indeed in the input array). This commit changes the boundary check condition to `if(current_mram_block_addr_A < start_mram_block_addr_A + BLOCK_SIZE)`, where the expression returns true if and only if the length of the range [start_mram_block_addr_A, end_mram_block_addr_A) has become smaller than 2*BLOCK_SIZE. When this happens, we can then finalize the BS result by checking if `searching_for` exists within [start_mram_block_addr_A, end_mram_block_addr_A) without overlooking any number that should be checked. 
Signed-off-by: Yun-Ze Li --- BS/dpu/task.c | 79 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/BS/dpu/task.c b/BS/dpu/task.c index 45c48f5..ac4ca38 100644 --- a/BS/dpu/task.c +++ b/BS/dpu/task.c @@ -17,12 +17,12 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS; __host dpu_results_t DPU_RESULTS[NR_TASKLETS]; // Search -static DTYPE search(DTYPE *bufferA, DTYPE searching_for) { +static DTYPE search(DTYPE *bufferA, DTYPE searching_for, size_t search_size) { DTYPE found = -2; if(bufferA[0] <= searching_for) { found = -1; - for (uint32_t i = 0; i < BLOCK_SIZE / sizeof(DTYPE); i++){ + for (uint32_t i = 0; i < search_size / sizeof(DTYPE); i++){ if(bufferA[i] == searching_for) { found = i; @@ -94,13 +94,47 @@ int main_kernel1() { current_mram_block_addr_A = (start_mram_block_addr_A + end_mram_block_addr_A) / 2; current_mram_block_addr_A &= WORD_MASK; - while(!end) + + while(1) { + // Boundary check + if(current_mram_block_addr_A < (start_mram_block_addr_A + BLOCK_SIZE)) + { + //end = true; + // find (start_mram_block_addr_A, start_mram_block_addr_A + BLOCK_SIZE) + mram_read((__mram_ptr void const *) start_mram_block_addr_A, cache_A, BLOCK_SIZE); + found = search(cache_A, searching_for, BLOCK_SIZE); + + if(found > -1) + { + result->found = found + (start_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); + printf("Tasklet %d has found %lld\n", me(), result->found + 1); + } + // find (start_mram_block_addr_A + BLOCK_SIZE, end_mram_block_addr_A) + else + { + size_t remain_bytes_to_search = end_mram_block_addr_A - (start_mram_block_addr_A + BLOCK_SIZE); + mram_read((__mram_ptr void const *) start_mram_block_addr_A + BLOCK_SIZE, cache_A, remain_bytes_to_search); + found = search(cache_A, searching_for, remain_bytes_to_search); + + if(found > -1) + { + result->found = found + (start_mram_block_addr_A + BLOCK_SIZE - start_mram_block_addr_aux) / sizeof(DTYPE); + printf("Tasklet %d has found 
%lld\n", me(), result->found + 1); + } + else + { + printf("%lld NOT found\n", searching_for); + } + } + break; + } + // Load cache with current MRAM block mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_A, BLOCK_SIZE); // Search inside block - found = search(cache_A, searching_for); + found = search(cache_A, searching_for, BLOCK_SIZE); // If found > -1, we found the searching_for query if(found > -1) @@ -125,43 +159,6 @@ int main_kernel1() { current_mram_block_addr_A = (current_mram_block_addr_A + end_mram_block_addr_A) / 2; current_mram_block_addr_A &= WORD_MASK; } - - // Start boundary check - if(current_mram_block_addr_A < (start_mram_block_addr_aux + BLOCK_SIZE)) - { - end = true; - mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_A, BLOCK_SIZE); - found = search(cache_A, searching_for); - - if(found > -1) - { - end = true; - result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); - printf("Tasklet %d has found %lld\n", me(), result->found + 1); - } - else - { - printf("%lld NOT found\n", searching_for); - } - } - - // End boundary check - if(current_mram_block_addr_A > (end_mram_block_addr_A - BLOCK_SIZE)) - { - end = true; - mram_read((__mram_ptr void const *) end_mram_block_addr_A - BLOCK_SIZE, cache_A, BLOCK_SIZE); - found = search(cache_A, searching_for); - - if(found > -1) - { - result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); - printf("Tasklet %d has found %lld\n", me(), result->found + 1); - } - else - { - printf("%lld NOT found\n", searching_for); - } - } } } return 0; -- cgit v1.2.3 From eda4f83d73ab2737e7d0d245d87c1dd248a93ce5 Mon Sep 17 00:00:00 2001 From: Yun-Ze Li Date: Thu, 23 Dec 2021 09:17:06 +0100 Subject: bs_refactor: Remove redundant lines This commit removes redundant lines for calculating `current_mram_block_addr_A`. 
Signed-off-by: Yun-Ze Li --- BS/dpu/task.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/BS/dpu/task.c b/BS/dpu/task.c index ac4ca38..92a383f 100644 --- a/BS/dpu/task.c +++ b/BS/dpu/task.c @@ -92,11 +92,12 @@ int main_kernel1() { mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_aux_A, BLOCK_SIZE); mram_read((__mram_ptr void const *) (end_mram_block_addr_A - BLOCK_SIZE * sizeof(DTYPE)), cache_aux_B, BLOCK_SIZE); - current_mram_block_addr_A = (start_mram_block_addr_A + end_mram_block_addr_A) / 2; - current_mram_block_addr_A &= WORD_MASK; - while(1) { + // Locate the address of the mid mram block + current_mram_block_addr_A = (start_mram_block_addr_A + end_mram_block_addr_A) / 2; + current_mram_block_addr_A &= WORD_MASK; + // Boundary check if(current_mram_block_addr_A < (start_mram_block_addr_A + BLOCK_SIZE)) { @@ -148,16 +149,12 @@ int main_kernel1() { if(found == -2) { end_mram_block_addr_A = current_mram_block_addr_A; - current_mram_block_addr_A = (current_mram_block_addr_A + start_mram_block_addr_A) / 2; - current_mram_block_addr_A &= WORD_MASK; } // If found == -1, we need to discard left part of the input vector else if (found == -1) { start_mram_block_addr_A = current_mram_block_addr_A; - current_mram_block_addr_A = (current_mram_block_addr_A + end_mram_block_addr_A) / 2; - current_mram_block_addr_A &= WORD_MASK; } } } -- cgit v1.2.3 From bab6c1ba39b7a00eccf3ecdcc7bd4560d2581d18 Mon Sep 17 00:00:00 2001 From: Yun-Ze Li Date: Thu, 23 Dec 2021 09:31:35 +0100 Subject: bs_refactor: Remove printf for logging Signed-off-by: Yun-Ze Li --- BS/dpu/task.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/BS/dpu/task.c b/BS/dpu/task.c index 92a383f..acf66f2 100644 --- a/BS/dpu/task.c +++ b/BS/dpu/task.c @@ -79,8 +79,6 @@ int main_kernel1() { mram_read((__mram_ptr void const *) current_mram_block_addr_query, &searching_for, 8); current_mram_block_addr_query += 8; - bool end = false; - // 
Initialize input vector boundaries start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER; start_mram_block_addr_aux = start_mram_block_addr_A; @@ -101,17 +99,15 @@ int main_kernel1() { // Boundary check if(current_mram_block_addr_A < (start_mram_block_addr_A + BLOCK_SIZE)) { - //end = true; - // find (start_mram_block_addr_A, start_mram_block_addr_A + BLOCK_SIZE) + // Search inside (start_mram_block_addr_A, start_mram_block_addr_A + BLOCK_SIZE) mram_read((__mram_ptr void const *) start_mram_block_addr_A, cache_A, BLOCK_SIZE); found = search(cache_A, searching_for, BLOCK_SIZE); if(found > -1) { result->found = found + (start_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); - printf("Tasklet %d has found %lld\n", me(), result->found + 1); } - // find (start_mram_block_addr_A + BLOCK_SIZE, end_mram_block_addr_A) + // Search inside (start_mram_block_addr_A + BLOCK_SIZE, end_mram_block_addr_A) else { size_t remain_bytes_to_search = end_mram_block_addr_A - (start_mram_block_addr_A + BLOCK_SIZE); @@ -121,7 +117,6 @@ int main_kernel1() { if(found > -1) { result->found = found + (start_mram_block_addr_A + BLOCK_SIZE - start_mram_block_addr_aux) / sizeof(DTYPE); - printf("Tasklet %d has found %lld\n", me(), result->found + 1); } else { @@ -141,7 +136,6 @@ int main_kernel1() { if(found > -1) { result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); - printf("Tasklet %d has found %lld\n", me(), result->found + 1); break; } -- cgit v1.2.3 From 151a43fa1fbccb9af895cf6795af2c6ae58add6c Mon Sep 17 00:00:00 2001 From: Yun-Ze Li Date: Sun, 26 Dec 2021 13:11:01 +0100 Subject: documentation: Fix typo Signed-off-by: Yun-Ze Li --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3ce614e..66561ae 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ PrIM provides a common set of workloads to evaluate the UPMEM PIM architecture w The workloads have different 
characteristics, exhibiting heterogeneity in their memory access patterns, operations and data types, and communication patterns. This repository also contains baseline CPU and GPU implementations of PrIM benchmarks for comparison purposes. -PrIm also includes a set of microbenchmarks can be used to assess various architecture limits such as compute throughput and memory bandwidth. +PrIM also includes a set of microbenchmarks can be used to assess various architecture limits such as compute throughput and memory bandwidth. ## Citation @@ -149,7 +149,7 @@ Several benchmark folders (HST-S, HST-L, RED, SCAN-SSA, SCAN-RSS) contain a scri ### Microbenchmarks -Each microbenchmark folder contais a script (`run.sh`) that compiles and runs the microbenchmark for the experiments in the [paper](https://arxiv.org/pdf/2105.03814.pdf): +Each microbenchmark folder contains a script (`run.sh`) that compiles and runs the microbenchmark for the experiments in the [paper](https://arxiv.org/pdf/2105.03814.pdf): ```sh cd Microbenchmarks/Arithmetic-Throughput -- cgit v1.2.3