summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--BFS/Makefile5
-rw-r--r--BFS/baselines/cpu/Makefile26
-rw-r--r--BFS/baselines/cpu/app.c41
-rw-r--r--BFS/dpu/dpu-utils.h61
-rw-r--r--BFS/dpu/task.c274
-rw-r--r--BFS/host/app.c724
-rw-r--r--BFS/host/mram-management.h46
-rw-r--r--BFS/support/common.h23
-rw-r--r--BFS/support/graph.h195
-rw-r--r--BFS/support/params.h81
-rw-r--r--BFS/support/timer.h35
-rw-r--r--BFS/support/utils.h1
-rw-r--r--BS/baselines/cpu/Makefile26
-rw-r--r--BS/baselines/cpu/bs_omp.c353
-rwxr-xr-xBS/baselines/cpu/run-perf.sh6
-rwxr-xr-xBS/dimes-hetsim-hbm.sh2
-rwxr-xr-xBS/dimes-hetsim-nmc.sh8
-rw-r--r--BS/dpu/task.c278
-rw-r--r--BS/host/app.c324
-rwxr-xr-xBS/support/common.h2
-rw-r--r--BS/support/params.h91
-rwxr-xr-xBS/support/timer.h137
-rw-r--r--COUNT/baselines/cpu/Makefile27
-rw-r--r--COUNT/baselines/cpu/app_baseline.c297
-rw-r--r--COUNT/dpu/task.c138
-rw-r--r--COUNT/host/app.c529
-rwxr-xr-xCOUNT/support/common.h15
-rw-r--r--COUNT/support/params.h95
-rwxr-xr-xCOUNT/support/timer.h146
-rwxr-xr-xCOUNT/vamos25.sh43
-rw-r--r--GEMV/baselines/cpu/Makefile19
-rw-r--r--GEMV/baselines/cpu/gemv_openmp.c410
-rwxr-xr-xGEMV/baselines/cpu/run-perf.sh6
-rw-r--r--GEMV/dpu/task.c152
-rw-r--r--GEMV/host/app.c296
-rwxr-xr-xGEMV/support/common.h14
-rw-r--r--GEMV/support/params.h95
-rwxr-xr-xGEMV/support/timer.h143
-rw-r--r--HST-S/baselines/cpu/app_baseline.c588
-rw-r--r--HST-S/dpu/task.c183
-rw-r--r--HST-S/host/app.c667
-rwxr-xr-xHST-S/support/common.h16
-rw-r--r--HST-S/support/params.h124
-rwxr-xr-xHST-S/support/timer.h140
-rw-r--r--MLP/baselines/cpu/Makefile28
-rw-r--r--MLP/baselines/cpu/mlp_openmp.c340
-rw-r--r--MLP/dpu/task.c124
-rw-r--r--MLP/host/app.c176
-rwxr-xr-xMLP/support/common.h16
-rw-r--r--MLP/support/params.h95
-rwxr-xr-xMLP/support/timer.h131
-rwxr-xr-xMicrobenchmarks/CPU-DPU/nodmc25-alloc.sh34
-rwxr-xr-xMicrobenchmarks/CPU-DPU/nodmc25-transfer.sh37
-rw-r--r--SpMV/baselines/cpu/Makefile10
-rwxr-xr-xSpMV/baselines/cpu/run-perf.sh6
-rw-r--r--TRNS/baselines/cpu/Makefile19
-rwxr-xr-xTRNS/baselines/cpu/run-perf.sh6
-rwxr-xr-xTRNS/dimes-hetsim-hbm.sh7
-rwxr-xr-xTRNS/dimes-hetsim-nmc.sh4
-rw-r--r--TS/baselines/cpu/mprofile.h6
-rwxr-xr-xTS/baselines/cpu/run-perf.sh8
-rw-r--r--TS/dpu/task.c138
-rw-r--r--TS/host/app.c307
-rwxr-xr-xTS/support/common.h30
-rw-r--r--TS/support/params.h99
-rwxr-xr-xTS/support/timer.h140
-rw-r--r--UNI/baselines/cpu/Makefile2
-rw-r--r--VA/baselines/cpu/Makefile26
-rw-r--r--VA/baselines/cpu/app_baseline.c496
-rwxr-xr-xVA/baselines/cpu/run-perf.sh6
-rwxr-xr-xVA/baselines/cpu/run-ws.sh6
-rw-r--r--VA/dpu/task.c109
-rw-r--r--VA/host/app.c551
-rwxr-xr-xVA/support/common.h26
-rw-r--r--VA/support/params.h95
-rwxr-xr-xVA/support/timer.h140
-rw-r--r--perf-events.txt44
77 files changed, 5814 insertions, 4330 deletions
diff --git a/BFS/Makefile b/BFS/Makefile
index a4ea69d..d43202f 100644
--- a/BFS/Makefile
+++ b/BFS/Makefile
@@ -1,12 +1,15 @@
NR_DPUS ?= 1
NR_TASKLETS ?= 16
+WITH_ALLOC_OVERHEAD ?= 0
+WITH_LOAD_OVERHEAD ?= 0
+WITH_FREE_OVERHEAD ?= 0
COMMON_INCLUDES := support
HOST_SOURCES := $(wildcard host/*.c)
DPU_SOURCES := $(wildcard dpu/*.c)
COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
-HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS}
+HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
QUIET = @
diff --git a/BFS/baselines/cpu/Makefile b/BFS/baselines/cpu/Makefile
index 6f082b1..1efe457 100644
--- a/BFS/baselines/cpu/Makefile
+++ b/BFS/baselines/cpu/Makefile
@@ -1,8 +1,26 @@
-.PHONY: all
-all: bfs
+# Build-time switches; each one can be overridden on the make command line (e.g. `make numa=1`).
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+# The bfs rule passes -DNUMA_MEMCPY=${numa_memcpy}; without a default the macro would expand to nothing.
+numa_memcpy ?= 0
+
+LDFLAGS =
+CFLAGS =
+
+ifeq (${debug}, 1)
+ CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+ LDFLAGS += -lnuma
+endif
bfs: app.c
- gcc -O2 -o bfs -fopenmp app.c
+ gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -o bfs -fopenmp app.c ${LDFLAGS}
bfs_O0: app.c
gcc -o bfs_O0 -fopenmp app.c
@@ -27,3 +45,5 @@ run_O2: bfs_O2
.PHONY: clean
clean:
rm -f bfs bfs_O0 bfs_O2
+
+.PHONY: all
diff --git a/BFS/baselines/cpu/app.c b/BFS/baselines/cpu/app.c
index caf4cbc..390b1f9 100644
--- a/BFS/baselines/cpu/app.c
+++ b/BFS/baselines/cpu/app.c
@@ -8,12 +8,30 @@
#include <omp.h>
+#if NUMA
+#include <numaif.h>
+#include <numa.h>
+
+void* mp_pages[1];
+int mp_status[1];
+int mp_nodes[1];
+struct bitmask* bitmask_in;
+int numa_node_in = -1;
+int numa_node_cpu = -1;
+#endif
+
#include "../../support/common.h"
#include "../../support/graph.h"
#include "../../support/params.h"
-#include "../../support/timer.h"
#include "../../support/utils.h"
+#if WITH_BENCHMARK
+#include "../../support/timer.h"
+#else
+#define startTimer(...)
+#define stopTimer(...)
+#endif
+
int main(int argc, char** argv) {
// Process parameters
@@ -24,8 +42,9 @@ int main(int argc, char** argv) {
struct COOGraph cooGraph = readCOOGraph(p.fileName);
PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
-
+#if WITH_BENCHMARK
Timer timer;
+#endif
for(int rep = 0; rep < 100; rep++) {
struct CSRGraph csrGraph = coo2csr(cooGraph);
@@ -43,6 +62,12 @@ int main(int argc, char** argv) {
uint32_t* prevFrontier = buffer1;
uint32_t* currFrontier = buffer2;
+#if NOP_SYNC
+ for(int rep = 0; rep < 200000; rep++) {
+ asm volatile("nop" ::);
+ }
+#endif
+
// Calculating result on CPU
startTimer(&timer, 0, 0);
nodeLevel[srcNode] = 0;
@@ -86,6 +111,12 @@ int main(int argc, char** argv) {
}
stopTimer(&timer, 0);
+#if NOP_SYNC
+ for(int rep = 0; rep < 200000; rep++) {
+ asm volatile("nop" ::);
+ }
+#endif
+
freeCSRGraph(csrGraph);
free(buffer1);
free(buffer2);
@@ -135,6 +166,7 @@ int main(int argc, char** argv) {
}
stopTimer(&timer, 1);
+#if WITH_BENCHMARK
unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
@@ -158,8 +190,11 @@ int main(int argc, char** argv) {
printf(" throughput_seq_MOpps=%f throughput_MOpps=%f",
csrGraph.numNodes / timer.time[1],
csrGraph.numNodes / timer.time[0]);
- printAll(&timer, 1);
+ printf(" latency_us=%f latency_seq_us=%f\n",
+ timer.time[0],
+ timer.time[1]);
}
+#endif // WITH_BENCHMARK
freeCSRGraph(csrGraph);
free(nodeLevel);
diff --git a/BFS/dpu/dpu-utils.h b/BFS/dpu/dpu-utils.h
index b02c073..dc986d2 100644
--- a/BFS/dpu/dpu-utils.h
+++ b/BFS/dpu/dpu-utils.h
@@ -6,39 +6,46 @@
#define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m "fmt"\n", ##__VA_ARGS__)
-static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
- mram_read((__mram_ptr void const*)(ptr_m + idx*sizeof(uint64_t)), cache_w, 8);
- return cache_w[0];
+static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w)
+{ /* Reads 8 bytes from MRAM byte address ptr_m + idx * 8 into cache_w[0]
+   * and returns that value. cache_w is a caller-supplied buffer that is
+   * overwritten by the call. */
+ mram_read((__mram_ptr void const *)(ptr_m + idx * sizeof(uint64_t)),
+ cache_w, 8);
+ return cache_w[0];
+}
-static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
- cache_w[0] = val;
- mram_write(cache_w, (__mram_ptr void*)(ptr_m + idx*sizeof(uint64_t)), 8);
+static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx,
+ uint64_t *cache_w)
+{ /* Writes val as 8 bytes to MRAM byte address ptr_m + idx * 8.
+   * The value is staged in cache_w[0] first, so the caller-supplied
+   * buffer is overwritten with val. */
+ cache_w[0] = val;
+ mram_write(cache_w, (__mram_ptr void *)(ptr_m + idx * sizeof(uint64_t)),
+ 8);
+}
-static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
- // Load 8B
- uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t);
- uint32_t offset = ((uint32_t)ptr_idx_m)%8;
- uint32_t ptr_block_m = ptr_idx_m - offset;
- mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8);
- // Extract 4B
- uint32_t* cache_32_w = (uint32_t*) cache_w;
- return cache_32_w[offset/4];
+static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w)
+{ /* Returns the 4-byte element at MRAM byte address ptr_m + idx * 4.
+   * The transfer itself is always 8 bytes: the enclosing 8-byte-aligned
+   * block is read into cache_w and the requested half is picked out. */
+ // Load the 8-byte-aligned block that contains the requested element
+ uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t);
+ uint32_t offset = ((uint32_t) ptr_idx_m) % 8;  // byte offset inside the block; 0 or 4 assuming ptr_m is 4-byte aligned (TODO confirm)
+ uint32_t ptr_block_m = ptr_idx_m - offset;
+ mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8);
+ // Extract 4B: view the buffer as two 32-bit slots and select by offset
+ uint32_t *cache_32_w = (uint32_t *) cache_w;
+ return cache_32_w[offset / 4];
+}
-static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) {
- // Load 8B
- uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t);
- uint32_t offset = ((uint32_t)ptr_idx_m)%8;
- uint32_t ptr_block_m = ptr_idx_m - offset;
- mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8);
- // Modify 4B
- uint32_t* cache_32_w = (uint32_t*) cache_w;
- cache_32_w[offset/4] = val;
- // Write back 8B
- mram_write(cache_w, (__mram_ptr void*)ptr_block_m, 8);
+static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx,
+ uint64_t *cache_w)
+{ /* Writes val to the 4-byte element at MRAM byte address ptr_m + idx * 4.
+   * Implemented as read-modify-write of the enclosing 8-byte-aligned block
+   * through cache_w, so the neighboring 4 bytes are rewritten with the value
+   * that was just read.
+   * NOTE(review): nothing in this function serializes that sequence; callers
+   * presumably guarantee no other tasklet writes the same 8-byte block
+   * concurrently. Verify at each call site. */
+ // Load the 8-byte-aligned block that contains the target element
+ uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t);
+ uint32_t offset = ((uint32_t) ptr_idx_m) % 8;  // byte offset inside the block; 0 or 4 assuming ptr_m is 4-byte aligned (TODO confirm)
+ uint32_t ptr_block_m = ptr_idx_m - offset;
+ mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8);
+ // Modify 4B: overwrite only the selected 32-bit slot
+ uint32_t *cache_32_w = (uint32_t *) cache_w;
+ cache_32_w[offset / 4] = val;
+ // Write back the full 8B block
+ mram_write(cache_w, (__mram_ptr void *)ptr_block_m, 8);
+}
#endif
-
diff --git a/BFS/dpu/task.c b/BFS/dpu/task.c
index 43a2d0f..44ec214 100644
--- a/BFS/dpu/task.c
+++ b/BFS/dpu/task.c
@@ -20,127 +20,155 @@ BARRIER_INIT(bfsBarrier, NR_TASKLETS);
MUTEX_INIT(nextFrontierMutex);
// main
-int main() {
-
- if(me() == 0) {
- mem_reset(); // Reset the heap
- }
- // Barrier
- barrier_wait(&my_barrier);
-
- // Load parameters
- uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
- struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
- mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
-
- // Extract parameters
- uint32_t numGlobalNodes = params_w->numNodes;
- uint32_t startNodeIdx = params_w->dpuStartNodeIdx;
- uint32_t numNodes = params_w->dpuNumNodes;
- uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset;
- uint32_t level = params_w->level;
- uint32_t nodePtrs_m = params_w->dpuNodePtrs_m;
- uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m;
- uint32_t nodeLevel_m = params_w->dpuNodeLevel_m;
- uint32_t visited_m = params_w->dpuVisited_m;
- uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m;
- uint32_t nextFrontier_m = params_w->dpuNextFrontier_m;
-
- if(numNodes > 0) {
-
- // Sanity check
- if(me() == 0) {
- if(numGlobalNodes%64 != 0) {
- //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!");
- }
- if(startNodeIdx%64 != 0 || numNodes%64 != 0) {
- //PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!");
- }
- }
-
- // Allocate WRAM cache for each tasklet to use throughout
- uint64_t* cache_w = mem_alloc(sizeof(uint64_t));
-
- // Update current frontier and visited list based on the next frontier from the previous iteration
- for(uint32_t nodeTileIdx = me(); nodeTileIdx < numGlobalNodes/64; nodeTileIdx += NR_TASKLETS) {
-
- // Get the next frontier tile from MRAM
- uint64_t nextFrontierTile = load8B(nextFrontier_m, nodeTileIdx, cache_w);
-
- // Process next frontier tile if it is not empty
- if(nextFrontierTile) {
-
- // Mark everything that was previously added to the next frontier as visited
- uint64_t visitedTile = load8B(visited_m, nodeTileIdx, cache_w);
- visitedTile |= nextFrontierTile;
- store8B(visitedTile, visited_m, nodeTileIdx, cache_w);
-
- // Clear the next frontier
- store8B(0, nextFrontier_m, nodeTileIdx, cache_w);
-
- }
-
- // Extract the current frontier from the previous next frontier and update node levels
- uint32_t startTileIdx = startNodeIdx/64;
- uint32_t numTiles = numNodes/64;
- if(startTileIdx <= nodeTileIdx && nodeTileIdx < startTileIdx + numTiles) {
-
- // Update current frontier
- store8B(nextFrontierTile, currentFrontier_m, nodeTileIdx - startTileIdx, cache_w);
-
- // Update node levels
- if(nextFrontierTile) {
- for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
- if(isSet(nextFrontierTile, node%64)) {
- store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks
- }
- }
- }
- }
-
- }
-
- // Wait until all tasklets have updated the current frontier
- barrier_wait(&bfsBarrier);
-
- // Identify tasklet's nodes
- uint32_t numNodesPerTasklet = (numNodes + NR_TASKLETS - 1)/NR_TASKLETS;
- uint32_t taskletNodesStart = me()*numNodesPerTasklet;
- uint32_t taskletNumNodes;
- if(taskletNodesStart > numNodes) {
- taskletNumNodes = 0;
- } else if(taskletNodesStart + numNodesPerTasklet > numNodes) {
- taskletNumNodes = numNodes - taskletNodesStart;
- } else {
- taskletNumNodes = numNodesPerTasklet;
- }
-
- // Visit neighbors of the current frontier
- mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex);
- for(uint32_t node = taskletNodesStart; node < taskletNodesStart + taskletNumNodes; ++node) {
- uint32_t nodeTileIdx = node/64;
- uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile
- if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier
- // Visit its neighbors
- uint32_t nodePtr = load4B(nodePtrs_m, node, cache_w) - nodePtrsOffset;
- uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr
- for(uint32_t i = nodePtr; i < nextNodePtr; ++i) {
- uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader
- uint32_t neighborTileIdx = neighbor/64;
- uint64_t visitedTile = load8B(visited_m, neighborTileIdx, cache_w);
- if(!isSet(visitedTile, neighbor%64)) { // Neighbor not previously visited
- // Add neighbor to next frontier
- mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention
- uint64_t nextFrontierTile = load8B(nextFrontier_m, neighborTileIdx, cache_w);
- setBit(nextFrontierTile, neighbor%64);
- store8B(nextFrontierTile, nextFrontier_m, neighborTileIdx, cache_w);
- mutex_unlock(mutexID);
- }
- }
- }
- }
-
- }
-
- return 0;
+int main()
+{ /* DPU-side BFS kernel; every tasklet executes this function and it
+   * performs one frontier expansion per launch.
+   * Step 1: merge the incoming next-frontier bitmap into the visited bitmap,
+   *         clear it, and derive this DPU's current frontier and node levels.
+   * Step 2: after a barrier, scan this DPU's nodes and mark the unvisited
+   *         neighbors of current-frontier nodes in the next-frontier bitmap.
+   * Always returns 0. */
+
+ if (me() == 0) {
+ mem_reset(); // Reset the heap
+ }
+ // Barrier: no tasklet allocates before tasklet 0 has reset the heap
+ barrier_wait(&my_barrier);  // NOTE(review): my_barrier's definition is not visible in this hunk (only bfsBarrier is) -- confirm it is declared
+
+ // Load parameters (every tasklet reads its own copy from the start of the MRAM heap)
+ uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER;
+ struct DPUParams *params_w =
+ (struct DPUParams *)
+ mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+ mram_read((__mram_ptr void const *)params_m, params_w,
+ ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)));
+
+ // Extract parameters
+ uint32_t numGlobalNodes = params_w->numNodes;
+ uint32_t startNodeIdx = params_w->dpuStartNodeIdx;
+ uint32_t numNodes = params_w->dpuNumNodes;
+ uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset;
+ uint32_t level = params_w->level;
+ uint32_t nodePtrs_m = params_w->dpuNodePtrs_m;
+ uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m;
+ uint32_t nodeLevel_m = params_w->dpuNodeLevel_m;
+ uint32_t visited_m = params_w->dpuVisited_m;
+ uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m;
+ uint32_t nextFrontier_m = params_w->dpuNextFrontier_m;
+
+ if (numNodes > 0) {
+
+ // Sanity check (both branches are currently empty: the PRINT_ERROR calls are commented out)
+ if (me() == 0) {
+ if (numGlobalNodes % 64 != 0) {
+ //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!");
+ }
+ if (startNodeIdx % 64 != 0 || numNodes % 64 != 0) {
+ //PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!");
+ }
+ }
+ // Allocate WRAM cache for each tasklet to use throughout
+ uint64_t *cache_w = mem_alloc(sizeof(uint64_t));
+
+ // Update current frontier and visited list based on the next frontier from the previous iteration
+ // Tiles are 64-node bitmap words; tasklet t handles tiles t, t + NR_TASKLETS, ...
+ for (uint32_t nodeTileIdx = me();
+ nodeTileIdx < numGlobalNodes / 64;
+ nodeTileIdx += NR_TASKLETS) {
+
+ // Get the next frontier tile from MRAM
+ uint64_t nextFrontierTile =
+ load8B(nextFrontier_m, nodeTileIdx, cache_w);
+
+ // Process next frontier tile if it is not empty
+ if (nextFrontierTile) {
+
+ // Mark everything that was previously added to the next frontier as visited
+ uint64_t visitedTile =
+ load8B(visited_m, nodeTileIdx, cache_w);
+ visitedTile |= nextFrontierTile;
+ store8B(visitedTile, visited_m, nodeTileIdx,
+ cache_w);
+
+ // Clear the next frontier
+ store8B(0, nextFrontier_m, nodeTileIdx,
+ cache_w);
+
+ }
+ // Extract the current frontier from the previous next frontier and update node levels
+ // Only tiles inside this DPU's node range are copied; currentFrontier is indexed relative to startTileIdx
+ uint32_t startTileIdx = startNodeIdx / 64;
+ uint32_t numTiles = numNodes / 64;
+ if (startTileIdx <= nodeTileIdx
+ && nodeTileIdx < startTileIdx + numTiles) {
+
+ // Update current frontier
+ store8B(nextFrontierTile, currentFrontier_m,
+ nodeTileIdx - startTileIdx, cache_w);
+
+ // Update node levels
+ if (nextFrontierTile) {
+ for (uint32_t node = nodeTileIdx * 64;
+ node < (nodeTileIdx + 1) * 64;
+ ++node) {
+ if (isSet
+ (nextFrontierTile,
+ node % 64)) {
+ store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks
+ }
+ }
+ }
+ }
+
+ }
+
+ // Wait until all tasklets have updated the current frontier
+ barrier_wait(&bfsBarrier);
+
+ // Identify tasklet's nodes: contiguous chunks of ceil(numNodes / NR_TASKLETS), the last chunk clipped to numNodes
+ uint32_t numNodesPerTasklet =
+ (numNodes + NR_TASKLETS - 1) / NR_TASKLETS;
+ uint32_t taskletNodesStart = me() * numNodesPerTasklet;
+ uint32_t taskletNumNodes;
+ if (taskletNodesStart > numNodes) {
+ taskletNumNodes = 0;
+ } else if (taskletNodesStart + numNodesPerTasklet > numNodes) {
+ taskletNumNodes = numNodes - taskletNodesStart;
+ } else {
+ taskletNumNodes = numNodesPerTasklet;
+ }
+
+ // Visit neighbors of the current frontier
+ mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex);
+ for (uint32_t node = taskletNodesStart;
+ node < taskletNodesStart + taskletNumNodes; ++node) {
+ uint32_t nodeTileIdx = node / 64;  // node is DPU-local here, matching the relative indexing of currentFrontier
+ uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile
+ if (isSet(currentFrontierTile, node % 64)) { // If the node is in the current frontier
+ // Visit its neighbors (node pointers are rebased by nodePtrsOffset to index this DPU's neighbor slice)
+ uint32_t nodePtr =
+ load4B(nodePtrs_m, node,
+ cache_w) - nodePtrsOffset;
+ uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr
+ for (uint32_t i = nodePtr; i < nextNodePtr; ++i) {
+ uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader
+ uint32_t neighborTileIdx =
+ neighbor / 64;
+ uint64_t visitedTile =
+ load8B(visited_m, neighborTileIdx,
+ cache_w);
+ if (!isSet(visitedTile, neighbor % 64)) { // Neighbor not previously visited
+ // Add neighbor to next frontier; the lock covers the load/set/store of the shared tile
+ mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention
+ uint64_t nextFrontierTile =
+ load8B(nextFrontier_m,
+ neighborTileIdx,
+ cache_w);
+ setBit(nextFrontierTile,
+ neighbor % 64);
+ store8B(nextFrontierTile,
+ nextFrontier_m,
+ neighborTileIdx,
+ cache_w);
+ mutex_unlock(mutexID);
+ }
+ }
+ }
+ }
+
+ }
+
+ return 0;
+}
diff --git a/BFS/host/app.c b/BFS/host/app.c
index 54b9cdc..9ba7ffb 100644
--- a/BFS/host/app.c
+++ b/BFS/host/app.c
@@ -30,305 +30,429 @@
#define DPU_BINARY "./bin/dpu_code"
// Main of the Host Application
-int main(int argc, char** argv) {
-
- // Process parameters
- struct Params p = input_params(argc, argv);
-
- // Timer and profiling
- Timer timer;
- #if ENERGY
- struct dpu_probe_t probe;
- DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
- double tenergy=0;
- #endif
-
- // Allocate DPUs and load binary
- struct dpu_set_t dpu_set, dpu;
- uint32_t numDPUs;
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
- PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
- // Initialize BFS data structures
- PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
- struct COOGraph cooGraph = readCOOGraph(p.fileName);
- PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
- struct CSRGraph csrGraph = coo2csr(cooGraph);
- uint32_t numNodes = csrGraph.numNodes;
- uint32_t* nodePtrs = csrGraph.nodePtrs;
- uint32_t* neighborIdxs = csrGraph.neighborIdxs;
- uint32_t* nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
- uint64_t* visited = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
- uint64_t* currentFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
- uint64_t* nextFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
- setBit(nextFrontier[0], 0); // Initialize frontier to first node
- uint32_t level = 1;
-
- // Partition data structure across DPUs
- uint32_t numNodesPerDPU = ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1)/numDPUs + 1);
- PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU", numNodesPerDPU);
- struct DPUParams dpuParams[numDPUs];
- uint32_t dpuParams_m[numDPUs];
- unsigned int dpuIdx = 0;
- unsigned int t0ini = 0;
- unsigned int t1ini = 0;
- unsigned int t2ini = 0;
- unsigned int t3ini = 0;
- DPU_FOREACH (dpu_set, dpu) {
-
- // Allocate parameters
- struct mram_heap_allocator_t allocator;
- init_allocator(&allocator);
- dpuParams_m[dpuIdx] = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
- // Find DPU's nodes
- uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
- uint32_t dpuNumNodes;
- if(dpuStartNodeIdx > numNodes) {
- dpuNumNodes = 0;
- } else if(dpuStartNodeIdx + numNodesPerDPU > numNodes) {
- dpuNumNodes = numNodes - dpuStartNodeIdx;
- } else {
- dpuNumNodes = numNodesPerDPU;
- }
- dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
- PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
- PRINT_INFO(p.verbosity >= 2, " Receives %u nodes", dpuNumNodes);
-
- // Partition edges and copy data
- if(dpuNumNodes > 0) {
-
- // Find DPU's CSR graph partition
- uint32_t* dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
- uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
- uint32_t* dpuNeighborIdxs_h = neighborIdxs + dpuNodePtrsOffset;
- uint32_t dpuNumNeighbors = dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
- uint32_t* dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
-
- // Allocate MRAM
- uint32_t dpuNodePtrs_m = mram_heap_alloc(&allocator, (dpuNumNodes + 1)*sizeof(uint32_t));
- uint32_t dpuNeighborIdxs_m = mram_heap_alloc(&allocator, dpuNumNeighbors*sizeof(uint32_t));
- uint32_t dpuNodeLevel_m = mram_heap_alloc(&allocator, dpuNumNodes*sizeof(uint32_t));
- uint32_t dpuVisited_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
- uint32_t dpuCurrentFrontier_m = mram_heap_alloc(&allocator, dpuNumNodes/64*sizeof(uint64_t));
- uint32_t dpuNextFrontier_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
- PRINT_INFO(p.verbosity >= 2, " Total memory allocated is %d bytes", allocator.totalAllocated);
-
- // Set up DPU parameters
- dpuParams[dpuIdx].numNodes = numNodes;
- dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
- dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
- dpuParams[dpuIdx].level = level;
- dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
- dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
- dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
- dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
- dpuParams[dpuIdx].dpuCurrentFrontier_m = dpuCurrentFrontier_m;
- dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
-
- // Send data to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying data to DPU");
- startTimer(&timer, 0, t0ini++);
- copyToDPU(dpu, (uint8_t*)dpuNodePtrs_h, dpuNodePtrs_m, (dpuNumNodes + 1)*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)dpuNeighborIdxs_h, dpuNeighborIdxs_m, dpuNumNeighbors*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)dpuNodeLevel_h, dpuNodeLevel_m, dpuNumNodes*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)visited, dpuVisited_m, numNodes/64*sizeof(uint64_t));
- copyToDPU(dpu, (uint8_t*)nextFrontier, dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
- // NOTE: No need to copy current frontier because it is written before being read
- stopTimer(&timer, 0);
- //loadTime += getElapsedTime(timer);
-
- }
-
- // Send parameters to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU");
- startTimer(&timer, 1, t1ini++);
- copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
- stopTimer(&timer, 1);
- //loadTime += getElapsedTime(timer);
-
- ++dpuIdx;
-
- }
-
- // Iterate until next frontier is empty
- uint32_t nextFrontierEmpty = 0;
- while(!nextFrontierEmpty) {
-
- PRINT_INFO(p.verbosity >= 1, "Processing current frontier for level %u", level);
-
- #if ENERGY
- DPU_ASSERT(dpu_probe_start(&probe));
- #endif
- // Run all DPUs
- PRINT_INFO(p.verbosity >= 1, " Booting DPUs");
- startTimer(&timer, 2, t2ini++);
- DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- stopTimer(&timer, 2);
- //dpuTime += getElapsedTime(timer);
- #if ENERGY
- DPU_ASSERT(dpu_probe_stop(&probe));
- double energy;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
- tenergy += energy;
- #endif
-
-
-
- // Copy back next frontier from all DPUs and compute their union as the current frontier
- startTimer(&timer, 3, t3ini++);
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
- if(dpuNumNodes > 0) {
- if(dpuIdx == 0) {
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)currentFrontier, numNodes/64*sizeof(uint64_t));
- } else {
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)nextFrontier, numNodes/64*sizeof(uint64_t));
- for(uint32_t i = 0; i < numNodes/64; ++i) {
- currentFrontier[i] |= nextFrontier[i];
- }
- }
- ++dpuIdx;
- }
- }
-
- // Check if the next frontier is empty, and copy data to DPU if not empty
- nextFrontierEmpty = 1;
- for(uint32_t i = 0; i < numNodes/64; ++i) {
- if(currentFrontier[i]) {
- nextFrontierEmpty = 0;
- break;
- }
- }
- if(!nextFrontierEmpty) {
- ++level;
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
- if(dpuNumNodes > 0) {
- // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
- copyToDPU(dpu, (uint8_t*)currentFrontier, dpuParams[dpuIdx].dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
- // Copy new level to DPU
- dpuParams[dpuIdx].level = level;
- copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
- ++dpuIdx;
- }
- }
- }
- stopTimer(&timer, 3);
- //hostTime += getElapsedTime(timer);
-
- }
-
- // Copy back node levels
- PRINT_INFO(p.verbosity >= 1, "Copying back the result");
- startTimer(&timer, 4, 0);
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
- if(dpuNumNodes > 0) {
- uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m, (uint8_t*)(nodeLevel + dpuStartNodeIdx), dpuNumNodes*sizeof(float));
- }
- ++dpuIdx;
- }
- stopTimer(&timer, 4);
- //retrieveTime += getElapsedTime(timer);
- //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
-
- // Calculating result on CPU
- PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
- uint32_t* nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
- memset(nextFrontier, 0, numNodes/64*sizeof(uint64_t));
- setBit(nextFrontier[0], 0); // Initialize frontier to first node
- nextFrontierEmpty = 0;
- level = 1;
- while(!nextFrontierEmpty) {
- // Update current frontier and visited list based on the next frontier from the previous iteration
- for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
- uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
- currentFrontier[nodeTileIdx] = nextFrontierTile;
- if(nextFrontierTile) {
- visited[nodeTileIdx] |= nextFrontierTile;
- nextFrontier[nodeTileIdx] = 0;
- for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
- if(isSet(nextFrontierTile, node%64)) {
- nodeLevelReference[node] = level;
- }
- }
- }
- }
- // Visit neighbors of the current frontier
- nextFrontierEmpty = 1;
- for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
- uint64_t currentFrontierTile = currentFrontier[nodeTileIdx];
- if(currentFrontierTile) {
- for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
- if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier
- // Visit its neighbors
- uint32_t nodePtr = nodePtrs[node];
- uint32_t nextNodePtr = nodePtrs[node + 1];
- for(uint32_t i = nodePtr; i < nextNodePtr; ++i) {
- uint32_t neighbor = neighborIdxs[i];
- if(!isSet(visited[neighbor/64], neighbor%64)) { // Neighbor not previously visited
- // Add neighbor to next frontier
- setBit(nextFrontier[neighbor/64], neighbor%64);
- nextFrontierEmpty = 0;
- }
- }
- }
- }
- }
- }
- ++level;
- }
-
- // Verify the result
- PRINT_INFO(p.verbosity >= 1, "Verifying the result");
- int status = 1;
- for(uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
- if(nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
- PRINT_ERROR("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", nodeIdx, nodeLevelReference[nodeIdx], nodeLevel[nodeIdx]);
- status = 0;
- }
- }
-
- if (status) {
- printf("[::] BFS NMC | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d "
- "| throughput_pim_MBps=%f throughput_MBps=%f",
- numDPUs, NR_TASKLETS, "uint32_t", numNodes,
- numNodes * sizeof(uint32_t) / (timer.time[2] + timer.time[3]),
- numNodes * sizeof(uint32_t) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
- printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
- numNodes / (timer.time[2] + timer.time[3]),
- numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
- printAll(&timer, 4);
- }
-
- // Display DPU Logs
- if(p.verbosity >= 2) {
- PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- PRINT("DPU %u:", dpuIdx);
- DPU_ASSERT(dpu_log_read(dpu, stdout));
- ++dpuIdx;
- }
- }
-
- // Deallocate data structures
- freeCOOGraph(cooGraph);
- freeCSRGraph(csrGraph);
- free(nodeLevel);
- free(visited);
- free(currentFrontier);
- free(nextFrontier);
- free(nodeLevelReference);
-
- return 0;
+int main(int argc, char **argv)
+{
-}
+ // Process parameters
+ struct Params p = input_params(argc, argv);
+
+ // Timer and profiling
+ Timer timer;
+#if ENERGY
+ struct dpu_probe_t probe;
+ DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+ double tenergy = 0;
+#endif
+
+ printf
+ ("WITH_ALLOC_OVERHEAD=%d WITH_LOAD_OVERHEAD=%d WITH_FREE_OVERHEAD=%d\n",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD);
+
+ // Allocate DPUs and load binary
+ struct dpu_set_t dpu_set, dpu;
+ uint32_t numDPUs, numRanks;
+
+#if WITH_ALLOC_OVERHEAD
+ startTimer(&timer, 0, 0);
+#endif
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+#if WITH_ALLOC_OVERHEAD
+ stopTimer(&timer, 0);
+#else
+ timer.time[0] = 0;
+#endif
+
+#if WITH_LOAD_OVERHEAD
+ startTimer(&timer, 1, 0);  // timer slot 1 measures loading the DPU binary
+#endif
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+#if WITH_LOAD_OVERHEAD
+ stopTimer(&timer, 1);  // must stop slot 1 (started above); slot 0 is the dpu_alloc measurement
+#else
+ timer.time[1] = 0;  // load overhead excluded from the reported totals
+#endif
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+ assert(NR_DPUS == numDPUs);
+ PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+ // Initialize BFS data structures
+ PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
+ struct COOGraph cooGraph = readCOOGraph(p.fileName);
+ PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges",
+ cooGraph.numNodes, cooGraph.numEdges);
+ struct CSRGraph csrGraph = coo2csr(cooGraph);
+ uint32_t numNodes = csrGraph.numNodes;
+ uint32_t *nodePtrs = csrGraph.nodePtrs;
+ uint32_t *neighborIdxs = csrGraph.neighborIdxs;
+ uint32_t *nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
+ uint64_t *visited = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node
+ uint64_t *currentFrontier = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node
+ uint64_t *nextFrontier = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node
+ setBit(nextFrontier[0], 0); // Initialize frontier to first node
+ uint32_t level = 1;
+
+ // Partition data structure across DPUs
+ uint32_t numNodesPerDPU =
+ ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1) / numDPUs + 1);
+ PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU",
+ numNodesPerDPU);
+ struct DPUParams dpuParams[numDPUs];
+ uint32_t dpuParams_m[numDPUs];
+ unsigned int dpuIdx = 0;
+ unsigned int t0ini = 0;
+ unsigned int t1ini = 0;
+ unsigned int t2ini = 0;
+ unsigned int t3ini = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+
+ // Allocate parameters
+ struct mram_heap_allocator_t allocator;
+ init_allocator(&allocator);
+ dpuParams_m[dpuIdx] =
+ mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+ // Find DPU's nodes
+ uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+ uint32_t dpuNumNodes;
+ if (dpuStartNodeIdx > numNodes) {
+ dpuNumNodes = 0;
+ } else if (dpuStartNodeIdx + numNodesPerDPU > numNodes) {
+ dpuNumNodes = numNodes - dpuStartNodeIdx;
+ } else {
+ dpuNumNodes = numNodesPerDPU;
+ }
+ dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
+ PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
+ PRINT_INFO(p.verbosity >= 2, " Receives %u nodes",
+ dpuNumNodes);
+
+ // Partition edges and copy data
+ if (dpuNumNodes > 0) {
+
+ // Find DPU's CSR graph partition
+ uint32_t *dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
+ uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
+ uint32_t *dpuNeighborIdxs_h =
+ neighborIdxs + dpuNodePtrsOffset;
+ uint32_t dpuNumNeighbors =
+ dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
+ uint32_t *dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
+
+ // Allocate MRAM
+ uint32_t dpuNodePtrs_m =
+ mram_heap_alloc(&allocator,
+ (dpuNumNodes +
+ 1) * sizeof(uint32_t));
+ uint32_t dpuNeighborIdxs_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNeighbors * sizeof(uint32_t));
+ uint32_t dpuNodeLevel_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNodes * sizeof(uint32_t));
+ uint32_t dpuVisited_m =
+ mram_heap_alloc(&allocator,
+ numNodes / 64 * sizeof(uint64_t));
+ uint32_t dpuCurrentFrontier_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNodes / 64 *
+ sizeof(uint64_t));
+ uint32_t dpuNextFrontier_m =
+ mram_heap_alloc(&allocator,
+ numNodes / 64 * sizeof(uint64_t));
+ PRINT_INFO(p.verbosity >= 2,
+ " Total memory allocated is %d bytes",
+ allocator.totalAllocated);
+
+ // Set up DPU parameters
+ dpuParams[dpuIdx].numNodes = numNodes;
+ dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
+ dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
+ dpuParams[dpuIdx].level = level;
+ dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
+ dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
+ dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
+ dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
+ dpuParams[dpuIdx].dpuCurrentFrontier_m =
+ dpuCurrentFrontier_m;
+ dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
+
+ // Send data to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying data to DPU");
+ startTimer(&timer, 2, t0ini++);
+ copyToDPU(dpu, (uint8_t *) dpuNodePtrs_h, dpuNodePtrs_m,
+ (dpuNumNodes + 1) * sizeof(uint32_t));
+ copyToDPU(dpu, (uint8_t *) dpuNeighborIdxs_h,
+ dpuNeighborIdxs_m,
+ dpuNumNeighbors * sizeof(uint32_t));
+ copyToDPU(dpu, (uint8_t *) dpuNodeLevel_h,
+ dpuNodeLevel_m,
+ dpuNumNodes * sizeof(uint32_t));
+ copyToDPU(dpu, (uint8_t *) visited, dpuVisited_m,
+ numNodes / 64 * sizeof(uint64_t));
+ copyToDPU(dpu, (uint8_t *) nextFrontier,
+ dpuNextFrontier_m,
+ numNodes / 64 * sizeof(uint64_t));
+ // NOTE: No need to copy current frontier because it is written before being read
+ stopTimer(&timer, 2);
+ //loadTime += getElapsedTime(timer);
+
+ }
+ // Send parameters to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying parameters to DPU");
+ startTimer(&timer, 2, t1ini++);
+ copyToDPU(dpu, (uint8_t *) & dpuParams[dpuIdx],
+ dpuParams_m[dpuIdx], sizeof(struct DPUParams));
+ stopTimer(&timer, 2);
+ //loadTime += getElapsedTime(timer);
+
+ ++dpuIdx;
+
+ }
+
+ // Iterate until next frontier is empty
+ uint32_t nextFrontierEmpty = 0;
+ while (!nextFrontierEmpty) {
+
+ PRINT_INFO(p.verbosity >= 1,
+ "Processing current frontier for level %u", level);
+
+#if ENERGY
+ DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+ // Run all DPUs
+ PRINT_INFO(p.verbosity >= 1, " Booting DPUs");
+ startTimer(&timer, 3, t2ini++);
+ DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+ stopTimer(&timer, 3);
+ //dpuTime += getElapsedTime(timer);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_stop(&probe));
+ double energy;
+ DPU_ASSERT(dpu_probe_get
+ (&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+ tenergy += energy;
+#endif
+
+ // Copy back next frontier from all DPUs and compute their union as the current frontier
+ startTimer(&timer, 4, t3ini++);
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+ if (dpuNumNodes > 0) {
+ if (dpuIdx == 0) {
+ copyFromDPU(dpu,
+ dpuParams[dpuIdx].
+ dpuNextFrontier_m,
+ (uint8_t *) currentFrontier,
+ numNodes / 64 *
+ sizeof(uint64_t));
+ } else {
+ copyFromDPU(dpu,
+ dpuParams[dpuIdx].
+ dpuNextFrontier_m,
+ (uint8_t *) nextFrontier,
+ numNodes / 64 *
+ sizeof(uint64_t));
+ for (uint32_t i = 0; i < numNodes / 64;
+ ++i) {
+ currentFrontier[i] |=
+ nextFrontier[i];
+ }
+ }
+ ++dpuIdx;
+ }
+ }
+
+ // Check if the next frontier is empty, and copy data to DPU if not empty
+ nextFrontierEmpty = 1;
+ for (uint32_t i = 0; i < numNodes / 64; ++i) {
+ if (currentFrontier[i]) {
+ nextFrontierEmpty = 0;
+ break;
+ }
+ }
+ if (!nextFrontierEmpty) {
+ ++level;
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ uint32_t dpuNumNodes =
+ dpuParams[dpuIdx].dpuNumNodes;
+ if (dpuNumNodes > 0) {
+ // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
+ copyToDPU(dpu,
+ (uint8_t *) currentFrontier,
+ dpuParams[dpuIdx].
+ dpuNextFrontier_m,
+ numNodes / 64 *
+ sizeof(uint64_t));
+ // Copy new level to DPU
+ dpuParams[dpuIdx].level = level;
+ copyToDPU(dpu,
+ (uint8_t *) &
+ dpuParams[dpuIdx],
+ dpuParams_m[dpuIdx],
+ sizeof(struct DPUParams));
+ ++dpuIdx;
+ }
+ }
+ }
+ stopTimer(&timer, 4);
+ //hostTime += getElapsedTime(timer);
+
+ }
+
+ // Copy back node levels
+ // Each DPU that received nodes returns its slice of the level array,
+ // which is written at that DPU's start offset in nodeLevel
+ PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+ startTimer(&timer, 5, 0);
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+ if (dpuNumNodes > 0) {
+ uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+ // Size the transfer by the element type of nodeLevel (uint32_t),
+ // matching the size used when the array was sent to the DPU
+ copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m,
+ (uint8_t *) (nodeLevel + dpuStartNodeIdx),
+ dpuNumNodes * sizeof(*nodeLevel));
+ }
+ ++dpuIdx;
+ }
+ stopTimer(&timer, 5);
+ //retrieveTime += getElapsedTime(timer);
+ //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
+
+ // Calculating result on CPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+ uint32_t *nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
+ memset(nextFrontier, 0, numNodes / 64 * sizeof(uint64_t));
+ setBit(nextFrontier[0], 0); // Initialize frontier to first node
+ nextFrontierEmpty = 0;
+ level = 1;
+ startTimer(&timer, 6, 0);
+ while (!nextFrontierEmpty) {
+ // Update current frontier and visited list based on the next frontier from the previous iteration
+ for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+ ++nodeTileIdx) {
+ uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
+ currentFrontier[nodeTileIdx] = nextFrontierTile;
+ if (nextFrontierTile) {
+ visited[nodeTileIdx] |= nextFrontierTile;
+ nextFrontier[nodeTileIdx] = 0;
+ for (uint32_t node = nodeTileIdx * 64;
+ node < (nodeTileIdx + 1) * 64; ++node) {
+ if (isSet(nextFrontierTile, node % 64)) {
+ nodeLevelReference[node] =
+ level;
+ }
+ }
+ }
+ }
+ // Visit neighbors of the current frontier
+ nextFrontierEmpty = 1;
+ for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+ ++nodeTileIdx) {
+ uint64_t currentFrontierTile =
+ currentFrontier[nodeTileIdx];
+ if (currentFrontierTile) {
+ for (uint32_t node = nodeTileIdx * 64;
+ node < (nodeTileIdx + 1) * 64; ++node) {
+ if (isSet(currentFrontierTile, node % 64)) { // If the node is in the current frontier
+ // Visit its neighbors
+ uint32_t nodePtr =
+ nodePtrs[node];
+ uint32_t nextNodePtr =
+ nodePtrs[node + 1];
+ for (uint32_t i = nodePtr;
+ i < nextNodePtr; ++i) {
+ uint32_t neighbor =
+ neighborIdxs[i];
+ if (!isSet(visited[neighbor / 64], neighbor % 64)) { // Neighbor not previously visited
+ // Add neighbor to next frontier
+ setBit
+ (nextFrontier
+ [neighbor /
+ 64],
+ neighbor %
+ 64);
+ nextFrontierEmpty
+ = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ ++level;
+ }
+ stopTimer(&timer, 6);
+
+ // Display DPU Logs
+ // NOTE: the logs are read before dpu_free() below, because dpu_set must
+ // not be iterated once it has been released
+ if (p.verbosity >= 2) {
+ PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ PRINT("DPU %u:", dpuIdx);
+ DPU_ASSERT(dpu_log_read(dpu, stdout));
+ ++dpuIdx;
+ }
+ }
+
+ // Release the DPUs; timer slot 7 holds the deallocation latency
+ // (reported as latency_free_us) when WITH_FREE_OVERHEAD is enabled
+#if WITH_FREE_OVERHEAD
+ startTimer(&timer, 7, 0);
+#endif
+ DPU_ASSERT(dpu_free(dpu_set));
+#if WITH_FREE_OVERHEAD
+ stopTimer(&timer, 7);
+#else
+ timer.time[7] = 0;
+#endif
+
+ // Verify the result
+ // Compare every node's DPU-computed level against the CPU reference
+ PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+ int status = 1;
+ for (uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
+ if (nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
+ PRINT_ERROR
+ ("Mismatch at node %u (CPU result = level %u, DPU result = level %u)",
+ nodeIdx, nodeLevelReference[nodeIdx],
+ nodeLevel[nodeIdx]);
+ status = 0;
+ }
+ }
+
+ // Print the result line only when the DPU result matches the reference
+ if (status) {
+ // One argument per conversion, in order: n_dpus, n_ranks,
+ // n_tasklets, e_type, n_elements, then the two throughput values
+ printf
+ ("[::] BFS-UMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d "
+ "| throughput_pim_MBps=%f throughput_MBps=%f", numDPUs,
+ numRanks, NR_TASKLETS, "uint32_t", numNodes,
+ numNodes * sizeof(uint32_t) / (timer.time[2] +
+ timer.time[3]),
+ numNodes * sizeof(uint32_t) / (timer.time[0] +
+ timer.time[1] +
+ timer.time[2] +
+ timer.time[3] +
+ timer.time[4]));
+ printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
+ numNodes / (timer.time[2] + timer.time[3]),
+ numNodes / (timer.time[0] + timer.time[1] +
+ timer.time[2] + timer.time[3] +
+ timer.time[4]));
+ printf
+ (" latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_cpu_us=%f latency_free_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[4], timer.time[5], timer.time[6],
+ timer.time[7]);
+ }
+ // Deallocate data structures
+ freeCOOGraph(cooGraph);
+ freeCSRGraph(csrGraph);
+ free(nodeLevel);
+ free(visited);
+ free(currentFrontier);
+ free(nextFrontier);
+ free(nodeLevelReference);
+
+ return 0;
+
+}
diff --git a/BFS/host/mram-management.h b/BFS/host/mram-management.h
index 627dfde..f2ee031 100644
--- a/BFS/host/mram-management.h
+++ b/BFS/host/mram-management.h
@@ -5,33 +5,45 @@
#include "../support/common.h"
#include "../support/utils.h"
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
struct mram_heap_allocator_t {
- uint32_t totalAllocated;
+ uint32_t totalAllocated;
};
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
- allocator->totalAllocated = 0;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+ allocator->totalAllocated = 0;
}
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
- uint32_t ret = allocator->totalAllocated;
- allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
- if(allocator->totalAllocated > DPU_CAPACITY) {
- PRINT_ERROR(" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
- exit(0);
- }
- return ret;
+/*
+ * Reserve size bytes (rounded up to a multiple of 8) from the allocator and
+ * return the offset at which the reservation starts.
+ * The allocator only tracks a running total; nothing is ever released.
+ * If the total would exceed DPU_CAPACITY, an error is printed and the
+ * program terminates with a failure status.
+ */
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
+ uint32_t size)
+{
+ uint32_t ret = allocator->totalAllocated;
+ // Form the new total in 64 bits: a very large request would otherwise
+ // wrap around uint32_t and pass the capacity check
+ uint64_t newTotal =
+ (uint64_t) ret + ROUND_UP_TO_MULTIPLE_OF_8((uint64_t) size);
+ if (newTotal > DPU_CAPACITY) {
+ PRINT_ERROR
+ (" Total memory allocated is %llu bytes which exceeds the DPU capacity (%d bytes)!",
+ (unsigned long long) newTotal, DPU_CAPACITY);
+ // Exceeding the capacity is an error, so do not report success
+ exit(EXIT_FAILURE);
+ }
+ allocator->totalAllocated = (uint32_t) newTotal;
+ return ret;
+}
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
- DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyToDPU(struct dpu_set_t dpu, uint8_t *hostPtr, uint32_t mramIdx,
+ uint32_t size)
+{
+ DPU_ASSERT(dpu_copy_to
+ (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+ ROUND_UP_TO_MULTIPLE_OF_8(size)));
}
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
- DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx,
+ uint8_t *hostPtr, uint32_t size)
+{
+ DPU_ASSERT(dpu_copy_from
+ (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+ ROUND_UP_TO_MULTIPLE_OF_8(size)));
}
#endif
-
diff --git a/BFS/support/common.h b/BFS/support/common.h
index ced324c..5f2aa0d 100644
--- a/BFS/support/common.h
+++ b/BFS/support/common.h
@@ -9,18 +9,17 @@
#define isSet(val, idx) ((val) & (1 << (idx)))
struct DPUParams {
- uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */
- uint32_t numNodes; /* Total number of nodes in the graph */
- uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU */
- uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */
- uint32_t level; /* The current BFS level */
- uint32_t dpuNodePtrs_m;
- uint32_t dpuNeighborIdxs_m;
- uint32_t dpuNodeLevel_m;
- uint32_t dpuVisited_m;
- uint32_t dpuCurrentFrontier_m;
- uint32_t dpuNextFrontier_m;
+ uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */
+ uint32_t numNodes; /* Total number of nodes in the graph */
+ uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU */
+ uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */
+ uint32_t level; /* The current BFS level */
+ uint32_t dpuNodePtrs_m;
+ uint32_t dpuNeighborIdxs_m;
+ uint32_t dpuNodeLevel_m;
+ uint32_t dpuVisited_m;
+ uint32_t dpuCurrentFrontier_m;
+ uint32_t dpuNextFrontier_m;
};
#endif
-
diff --git a/BFS/support/graph.h b/BFS/support/graph.h
index f89ff5c..2a19f67 100644
--- a/BFS/support/graph.h
+++ b/BFS/support/graph.h
@@ -9,108 +9,125 @@
#include "utils.h"
struct COOGraph {
- uint32_t numNodes;
- uint32_t numEdges;
- uint32_t* nodeIdxs;
- uint32_t* neighborIdxs;
+ uint32_t numNodes;
+ uint32_t numEdges;
+ uint32_t *nodeIdxs;
+ uint32_t *neighborIdxs;
};
struct CSRGraph {
- uint32_t numNodes;
- uint32_t numEdges;
- uint32_t* nodePtrs;
- uint32_t* neighborIdxs;
+ uint32_t numNodes;
+ uint32_t numEdges;
+ uint32_t *nodePtrs;
+ uint32_t *neighborIdxs;
};
-static struct COOGraph readCOOGraph(const char* fileName) {
-
- struct COOGraph cooGraph;
-
- // Initialize fields
- FILE* fp = fopen(fileName, "r");
- uint32_t numNodes, numCols;
- assert(fscanf(fp, "%u", &numNodes));
- assert(fscanf(fp, "%u", &numCols));
- if(numNodes == numCols) {
- cooGraph.numNodes = numNodes;
- } else {
- PRINT_WARNING(" Adjacency matrix is not square. Padding matrix to be square.");
- cooGraph.numNodes = (numNodes > numCols)? numNodes : numCols;
- }
- if(cooGraph.numNodes%64 != 0) {
- PRINT_WARNING(" Adjacency matrix dimension is %u which is not a multiple of 64 nodes.", cooGraph.numNodes);
- cooGraph.numNodes += (64 - cooGraph.numNodes%64);
- PRINT_WARNING(" Padding to %u which is a multiple of 64 nodes.", cooGraph.numNodes);
- }
- assert(fscanf(fp, "%u", &cooGraph.numEdges));
- cooGraph.nodeIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t));
- cooGraph.neighborIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t));
-
- // Read the neighborIdxs
- for(uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) {
- uint32_t nodeIdx;
- assert(fscanf(fp, "%u", &nodeIdx));
- cooGraph.nodeIdxs[edgeIdx] = nodeIdx;
- uint32_t neighborIdx;
- assert(fscanf(fp, "%u", &neighborIdx));
- cooGraph.neighborIdxs[edgeIdx] = neighborIdx;
- }
-
- return cooGraph;
+/*
+ * Read one unsigned integer from the graph file, or report the problem and
+ * terminate if the file is truncated or malformed.
+ */
+static uint32_t readGraphValue(FILE *fp, const char *fileName)
+{
+ unsigned int value;
+ // Check the conversion count explicitly: fscanf returns EOF (non-zero) at
+ // end of file, and a call placed inside assert() is dropped under NDEBUG
+ if (fscanf(fp, "%u", &value) != 1) {
+ PRINT_ERROR
+ (" Could not read graph file %s (truncated or malformed)",
+ fileName);
+ exit(EXIT_FAILURE);
+ }
+ return value;
+}
+
+/*
+ * Load a graph in COO form from the text file fileName.
+ * The file starts with three integers (rows, columns, edge count) followed
+ * by one "node neighbor" pair per edge. The node count is the larger of rows
+ * and columns, rounded up to a multiple of 64.
+ * Terminates the program if the file cannot be opened or parsed, or if the
+ * edge arrays cannot be allocated.
+ * The returned arrays are released with freeCOOGraph().
+ */
+static struct COOGraph readCOOGraph(const char *fileName)
+{
+
+ struct COOGraph cooGraph;
+
+ // Initialize fields
+ FILE *fp = fopen(fileName, "r");
+ if (fp == NULL) {
+ PRINT_ERROR(" Could not open graph file %s", fileName);
+ exit(EXIT_FAILURE);
+ }
+ uint32_t numNodes = readGraphValue(fp, fileName);
+ uint32_t numCols = readGraphValue(fp, fileName);
+ if (numNodes == numCols) {
+ cooGraph.numNodes = numNodes;
+ } else {
+ PRINT_WARNING
+ (" Adjacency matrix is not square. Padding matrix to be square.");
+ cooGraph.numNodes = (numNodes > numCols) ? numNodes : numCols;
+ }
+ if (cooGraph.numNodes % 64 != 0) {
+ PRINT_WARNING
+ (" Adjacency matrix dimension is %u which is not a multiple of 64 nodes.",
+ cooGraph.numNodes);
+ cooGraph.numNodes += (64 - cooGraph.numNodes % 64);
+ PRINT_WARNING
+ (" Padding to %u which is a multiple of 64 nodes.",
+ cooGraph.numNodes);
+ }
+ cooGraph.numEdges = readGraphValue(fp, fileName);
+ cooGraph.nodeIdxs =
+ (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t));
+ cooGraph.neighborIdxs =
+ (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t));
+ // malloc(0) may legitimately return NULL, so only treat NULL as a
+ // failure when there are edges to store
+ if (cooGraph.numEdges > 0
+ && (cooGraph.nodeIdxs == NULL || cooGraph.neighborIdxs == NULL)) {
+ PRINT_ERROR(" Could not allocate memory for %u edges",
+ cooGraph.numEdges);
+ exit(EXIT_FAILURE);
+ }
+
+ // Read the neighborIdxs
+ for (uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) {
+ cooGraph.nodeIdxs[edgeIdx] = readGraphValue(fp, fileName);
+ cooGraph.neighborIdxs[edgeIdx] = readGraphValue(fp, fileName);
+ }
+
+ // The whole file has been consumed; release the stream
+ fclose(fp);
+
+ return cooGraph;
+}
-static void freeCOOGraph(struct COOGraph cooGraph) {
- free(cooGraph.nodeIdxs);
- free(cooGraph.neighborIdxs);
+static void freeCOOGraph(struct COOGraph cooGraph)
+{
+ free(cooGraph.nodeIdxs);
+ free(cooGraph.neighborIdxs);
}
-static struct CSRGraph coo2csr(struct COOGraph cooGraph) {
-
- struct CSRGraph csrGraph;
-
- // Initialize fields
- csrGraph.numNodes = cooGraph.numNodes;
- csrGraph.numEdges = cooGraph.numEdges;
- csrGraph.nodePtrs = (uint32_t*) calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1), sizeof(uint32_t));
- csrGraph.neighborIdxs = (uint32_t*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrGraph.numEdges*sizeof(uint32_t)));
-
- // Histogram nodeIdxs
- for(uint32_t i = 0; i < cooGraph.numEdges; ++i) {
- uint32_t nodeIdx = cooGraph.nodeIdxs[i];
- csrGraph.nodePtrs[nodeIdx]++;
- }
-
- // Prefix sum nodePtrs
- uint32_t sumBeforeNextNode = 0;
- for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) {
- uint32_t sumBeforeNode = sumBeforeNextNode;
- sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx];
- csrGraph.nodePtrs[nodeIdx] = sumBeforeNode;
- }
- csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode;
-
- // Bin the neighborIdxs
- for(uint32_t i = 0; i < cooGraph.numEdges; ++i) {
- uint32_t nodeIdx = cooGraph.nodeIdxs[i];
- uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++;
- csrGraph.neighborIdxs[neighborListIdx] = cooGraph.neighborIdxs[i];
- }
-
- // Restore nodePtrs
- for(uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) {
- csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1];
- }
- csrGraph.nodePtrs[0] = 0;
-
- return csrGraph;
+static struct CSRGraph coo2csr(struct COOGraph cooGraph)
+{
+
+ struct CSRGraph csrGraph;
+
+ // Initialize fields
+ csrGraph.numNodes = cooGraph.numNodes;
+ csrGraph.numEdges = cooGraph.numEdges;
+ csrGraph.nodePtrs =
+ (uint32_t *)
+ calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1),
+ sizeof(uint32_t));
+ csrGraph.neighborIdxs =
+ (uint32_t *)
+ malloc(ROUND_UP_TO_MULTIPLE_OF_8
+ (csrGraph.numEdges * sizeof(uint32_t)));
+
+ // Histogram nodeIdxs
+ for (uint32_t i = 0; i < cooGraph.numEdges; ++i) {
+ uint32_t nodeIdx = cooGraph.nodeIdxs[i];
+ csrGraph.nodePtrs[nodeIdx]++;
+ }
+
+ // Prefix sum nodePtrs
+ uint32_t sumBeforeNextNode = 0;
+ for (uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) {
+ uint32_t sumBeforeNode = sumBeforeNextNode;
+ sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx];
+ csrGraph.nodePtrs[nodeIdx] = sumBeforeNode;
+ }
+ csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode;
+
+ // Bin the neighborIdxs
+ for (uint32_t i = 0; i < cooGraph.numEdges; ++i) {
+ uint32_t nodeIdx = cooGraph.nodeIdxs[i];
+ uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++;
+ csrGraph.neighborIdxs[neighborListIdx] =
+ cooGraph.neighborIdxs[i];
+ }
+
+ // Restore nodePtrs
+ for (uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) {
+ csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1];
+ }
+ csrGraph.nodePtrs[0] = 0;
+
+ return csrGraph;
}
-static void freeCSRGraph(struct CSRGraph csrGraph) {
- free(csrGraph.nodePtrs);
- free(csrGraph.neighborIdxs);
+static void freeCSRGraph(struct CSRGraph csrGraph)
+{
+ free(csrGraph.nodePtrs);
+ free(csrGraph.neighborIdxs);
}
#endif
-
diff --git a/BFS/support/params.h b/BFS/support/params.h
index f4f12e7..f9169bc 100644
--- a/BFS/support/params.h
+++ b/BFS/support/params.h
@@ -5,42 +5,63 @@
#include "common.h"
#include "utils.h"
-static void usage() {
- PRINT( "\nUsage: ./program [options]"
- "\n"
- "\nBenchmark-specific options:"
- "\n -f <F> input matrix file name (default=data/roadNet-CA.txt)"
- "\n"
- "\nGeneral options:"
- "\n -v <V> verbosity"
- "\n -h help"
- "\n\n");
+static void usage()
+{
+ PRINT("\nUsage: ./program [options]"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -f <F> input matrix file name (default=data/roadNet-CA.txt)"
+ "\n"
+ "\nGeneral options:"
+ "\n -v <V> verbosity" "\n -h help" "\n\n");
}
typedef struct Params {
- const char* fileName;
- unsigned int verbosity;
+ const char *fileName;
+ unsigned int verbosity;
+#if NUMA
+ struct bitmask *bitmask_in;
+ int numa_node_cpu;
+#endif
} Params;
-static struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.fileName = "data/roadNet-CA.txt";
- p.verbosity = 0;
- int opt;
- while((opt = getopt(argc, argv, "f:v:h")) >= 0) {
- switch(opt) {
- case 'f': p.fileName = optarg; break;
- case 'v': p.verbosity = atoi(optarg); break;
- case 'h': usage(); exit(0);
- default:
- PRINT_ERROR("Unrecognized option!");
- usage();
- exit(0);
- }
- }
+static struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.fileName = "data/roadNet-CA.txt";
+ p.verbosity = 0;
+#if NUMA
+ p.bitmask_in = NULL;
+ p.numa_node_cpu = -1;
+#endif
+ int opt;
+ while ((opt = getopt(argc, argv, "f:v:hA:C:")) >= 0) {
+ switch (opt) {
+ case 'f':
+ p.fileName = optarg;
+ break;
+ case 'v':
+ p.verbosity = atoi(optarg);
+ break;
+#if NUMA
+ case 'A':
+ p.bitmask_in = numa_parse_nodestring(optarg);
+ break;
+ case 'C':
+ p.numa_node_cpu = atoi(optarg);
+ break;
+#endif
+ case 'h':
+ usage();
+ exit(0);
+ default:
+ PRINT_ERROR("Unrecognized option!");
+ usage();
+ exit(0);
+ }
+ }
- return p;
+ return p;
}
#endif
-
diff --git a/BFS/support/timer.h b/BFS/support/timer.h
index 80719cf..63b5567 100644
--- a/BFS/support/timer.h
+++ b/BFS/support/timer.h
@@ -6,29 +6,26 @@
#include <sys/time.h>
typedef struct Timer {
- struct timeval startTime[5];
- struct timeval stopTime[5];
- double time[5];
+ struct timeval startTime[8];
+ struct timeval stopTime[8];
+ double time[8];
} Timer;
-static void startTimer(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
+static void startTimer(Timer *timer, int i, int rep)
+{
+ if (rep == 0) {
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
}
-static void stopTimer(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-static void printAll(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
+static void stopTimer(Timer *timer, int i)
+{
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] +=
+ (timer->stopTime[i].tv_sec -
+ timer->startTime[i].tv_sec) * 1000000.0 +
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
}
#endif
diff --git a/BFS/support/utils.h b/BFS/support/utils.h
index ddb1e2c..ccd8fbd 100644
--- a/BFS/support/utils.h
+++ b/BFS/support/utils.h
@@ -8,4 +8,3 @@
#define PRINT(fmt, ...) printf(fmt "\n", ##__VA_ARGS__)
#endif
-
diff --git a/BS/baselines/cpu/Makefile b/BS/baselines/cpu/Makefile
index b67602f..4c30f65 100644
--- a/BS/baselines/cpu/Makefile
+++ b/BS/baselines/cpu/Makefile
@@ -1,16 +1,30 @@
-NUMA ?= 0
-NUMA_MEMCPY ?= 0
-FLAGS =
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+numa_memcpy ?= 0
+
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+ CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
-ifeq (${NUMA}, 1)
- FLAGS += -lnuma
+ifeq (${numa}, 1)
+ LDFLAGS += -lnuma
endif
.PHONY: all
all: bs_omp
bs_omp: bs_omp.c
- gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} bs_omp.c -o bs_omp -fopenmp ${FLAGS}
+ gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} bs_omp.c -o bs_omp -fopenmp ${LDFLAGS}
bs_omp_O0: bs_omp.c
gcc bs_omp.c -o bs_omp_O0 -fopenmp
diff --git a/BS/baselines/cpu/bs_omp.c b/BS/baselines/cpu/bs_omp.c
index 874299b..2e4c300 100644
--- a/BS/baselines/cpu/bs_omp.c
+++ b/BS/baselines/cpu/bs_omp.c
@@ -7,265 +7,286 @@
#include <assert.h>
#include <time.h>
#include <stdint.h>
+
+#if WITH_BENCHMARK
#include "timer.h"
+#else
+#define start(...)
+#define stop(...)
+#endif
#if NUMA
#include <numaif.h>
#include <numa.h>
-void* mp_pages[1];
+void *mp_pages[1];
int mp_status[1];
int mp_nodes[1];
-struct bitmask* bitmask_in;
+struct bitmask *bitmask_in;
int numa_node_in = -1;
int numa_node_cpu = -1;
#endif
-
#if NUMA_MEMCPY
-struct bitmask* bitmask_cpu;
+struct bitmask *bitmask_cpu;
int numa_node_cpu_memcpy = -1;
int numa_node_local = -1;
int numa_node_in_is_local = 0;
#endif
-
#define DTYPE uint64_t
/*
* @brief creates a "test file" by filling a buffer with values
*/
-void create_test_file(DTYPE * input, uint64_t nr_elements, DTYPE * querys, uint64_t n_querys) {
+void create_test_file(DTYPE *input, uint64_t nr_elements, DTYPE *querys,
+ uint64_t n_querys)
+{
- srand(time(NULL));
+ srand(time(NULL));
- input[0] = 1;
- for (uint64_t i = 1; i < nr_elements; i++) {
- input[i] = input[i - 1] + (rand() % 10) + 1;
- }
+ input[0] = 1;
+ for (uint64_t i = 1; i < nr_elements; i++) {
+ input[i] = input[i - 1] + (rand() % 10) + 1;
+ }
- for(uint64_t i = 0; i < n_querys; i++)
- {
- querys[i] = input[rand() % (nr_elements - 2)];
- }
+ for (uint64_t i = 0; i < n_querys; i++) {
+ querys[i] = input[rand() % nr_elements];
+ }
}
/**
* @brief compute output in the host
*/
-uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigned n_querys)
+uint64_t binarySearch(DTYPE *input, uint64_t input_size, DTYPE *querys,
+ unsigned n_querys)
{
uint64_t found = -1;
uint64_t q, r, l, m;
-
- #pragma omp parallel for private(q,r,l,m)
- for(q = 0; q < n_querys; q++)
- {
+
+ // Every thread adds the index of its match to `found`, so it has to be a
+ // reduction variable; as a plain shared variable the concurrent += is a
+ // data race and updates can be lost
+ // NOTE(review): r starts at input_size (one past the last element) and
+ // r = m - 1 wraps for m == 0; this looks safe only as long as every query
+ // is present in input, as create_test_file() arranges - confirm
+#pragma omp parallel for private(q,r,l,m) reduction(+:found)
+ for (q = 0; q < n_querys; q++) {
l = 0;
r = input_size;
- while (l <= r)
- {
- m = l + (r - l) / 2;
-
- // Check if x is present at mid
- if (input[m] == querys[q])
- {
- found += m;
+ while (l <= r) {
+ m = l + (r - l) / 2;
+
+ // Check if x is present at mid
+ if (input[m] == querys[q]) {
+ found += m;
break;
}
- // If x greater, ignore left half
- if (input[m] < querys[q])
- l = m + 1;
+ // If x greater, ignore left half
+ if (input[m] < querys[q])
+ l = m + 1;
- // If x is smaller, ignore right half
+ // If x is smaller, ignore right half
else
- r = m - 1;
-
+ r = m - 1;
+
}
- }
+ }
- return found;
+ return found;
}
/**
* @brief Main of the Host Application.
*/
- int main(int argc, char **argv) {
- (void)argc;
- Timer timer;
- uint64_t input_size = atol(argv[1]);
- uint64_t n_querys = atol(argv[2]);
+int main(int argc, char **argv)
+{
+ (void)argc;
+#if WITH_BENCHMARK
+ Timer timer;
+#endif
+ uint64_t input_size = atol(argv[1]);
+ uint64_t n_querys = atol(argv[2]);
#if NUMA
- bitmask_in = numa_parse_nodestring(argv[3]);
- numa_node_cpu = atoi(argv[4]);
+ bitmask_in = numa_parse_nodestring(argv[3]);
+ numa_node_cpu = atoi(argv[4]);
#endif
#if NUMA_MEMCPY
- bitmask_cpu = numa_parse_nodestring(argv[5]);
- numa_node_cpu_memcpy = atoi(argv[6]);
+ bitmask_cpu = numa_parse_nodestring(argv[5]);
+ numa_node_cpu_memcpy = atoi(argv[6]);
#endif
- printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys);
+ printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys);
#if NUMA
- if (bitmask_in) {
- numa_set_membind(bitmask_in);
- numa_free_nodemask(bitmask_in);
- }
- DTYPE * input = numa_alloc((input_size) * sizeof(DTYPE));
- DTYPE * querys = numa_alloc((n_querys) * sizeof(DTYPE));
+ if (bitmask_in) {
+ numa_set_membind(bitmask_in);
+ numa_free_nodemask(bitmask_in);
+ }
+ DTYPE *input = numa_alloc((input_size) * sizeof(DTYPE));
+ DTYPE *querys = numa_alloc((n_querys) * sizeof(DTYPE));
#else
- DTYPE * input = malloc((input_size) * sizeof(DTYPE));
- DTYPE * querys = malloc((n_querys) * sizeof(DTYPE));
+ DTYPE *input = malloc((input_size) * sizeof(DTYPE));
+ DTYPE *querys = malloc((n_querys) * sizeof(DTYPE));
#endif
#if NUMA
#if NUMA_MEMCPY
- if (bitmask_cpu) {
- numa_set_membind(bitmask_cpu);
- numa_free_nodemask(bitmask_cpu);
- }
+ if (bitmask_cpu) {
+ numa_set_membind(bitmask_cpu);
+ numa_free_nodemask(bitmask_cpu);
+ }
#else
- struct bitmask *bitmask_all = numa_allocate_nodemask();
- numa_bitmask_setall(bitmask_all);
- numa_set_membind(bitmask_all);
- numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
+ struct bitmask *bitmask_all = numa_allocate_nodemask();
+ numa_bitmask_setall(bitmask_all);
+ numa_set_membind(bitmask_all);
+ numa_free_nodemask(bitmask_all);
+#endif // NUMA_MEMCPY
#endif
- DTYPE result_host = -1;
+ DTYPE result_host = -1;
- // Create an input file with arbitrary data.
- create_test_file(input, input_size, querys, n_querys);
+ // Create an input file with arbitrary data.
+ create_test_file(input, input_size, querys, n_querys);
#if NUMA
- mp_pages[0] = input;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_in = mp_status[0];
- }
-
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
+ mp_pages[0] = input;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_in = mp_status[0];
+ }
+
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
#endif
#if NUMA_MEMCPY
- numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+ || (numa_node_cpu + 8 == numa_node_in)) * 1;
#endif
#if NUMA_MEMCPY
- DTYPE *input_local = input;
- DTYPE *querys_local = querys;
- start(&timer, 1, 0);
- if (!numa_node_in_is_local) {
- input_local = numa_alloc((input_size) * sizeof(DTYPE));
- querys_local = numa_alloc((n_querys) * sizeof(DTYPE));
- }
- stop(&timer, 1);
- if (!numa_node_in_is_local) {
- if (numa_node_cpu_memcpy != -1) {
- if (numa_run_on_node(numa_node_cpu_memcpy) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu_memcpy = -1;
- }
- }
- }
- start(&timer, 2, 0);
- if (!numa_node_in_is_local) {
- memcpy(input_local, input, input_size * sizeof(DTYPE));
- memcpy(querys_local, querys, n_querys * sizeof(DTYPE));
- } else {
- input_local = input;
- querys_local = querys;
- }
- stop(&timer, 2);
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
- mp_pages[0] = input_local;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(input_local)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_local = mp_status[0];
- }
+ DTYPE *input_local = input;
+ DTYPE *querys_local = querys;
+ start(&timer, 1, 0);
+ if (!numa_node_in_is_local) {
+ input_local = numa_alloc((input_size) * sizeof(DTYPE));
+ querys_local = numa_alloc((n_querys) * sizeof(DTYPE));
+ }
+ stop(&timer, 1);
+ if (!numa_node_in_is_local) {
+ if (numa_node_cpu_memcpy != -1) {
+ if (numa_run_on_node(numa_node_cpu_memcpy) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
+ start(&timer, 2, 0);
+ if (!numa_node_in_is_local) {
+ memcpy(input_local, input, input_size * sizeof(DTYPE));
+ memcpy(querys_local, querys, n_querys * sizeof(DTYPE));
+ } else {
+ input_local = input;
+ querys_local = querys;
+ }
+ stop(&timer, 2);
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+ mp_pages[0] = input_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(input_local)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_local = mp_status[0];
+ }
+#endif
+
+#if NOP_SYNC
+ for (int rep = 0; rep < 200000; rep++) {
+ asm volatile ("nop"::);
+ }
#endif
- start(&timer, 0, 0);
+ start(&timer, 0, 0);
#if NUMA_MEMCPY
- result_host = binarySearch(input_local, input_size - 1, querys_local, n_querys);
+ result_host =
+ binarySearch(input_local, input_size - 1, querys_local, n_querys);
#else
- result_host = binarySearch(input, input_size - 1, querys, n_querys);
+ result_host = binarySearch(input, input_size - 1, querys, n_querys);
+#endif
+ stop(&timer, 0);
+
+#if NOP_SYNC
+ for (int rep = 0; rep < 200000; rep++) {
+ asm volatile ("nop"::);
+ }
#endif
- stop(&timer, 0);
#if NUMA_MEMCPY
- start(&timer, 3, 0);
- if (!numa_node_in_is_local) {
- numa_free(input_local, input_size * sizeof(DTYPE));
- numa_free(querys_local, n_querys * sizeof(DTYPE));
- }
- stop(&timer, 3);
+ start(&timer, 3, 0);
+ if (!numa_node_in_is_local) {
+ numa_free(input_local, input_size * sizeof(DTYPE));
+ numa_free(querys_local, n_querys * sizeof(DTYPE));
+ }
+ stop(&timer, 3);
#endif
- unsigned int nr_threads = 0;
+ int status = (result_host);
+#if WITH_BENCHMARK
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
+ nr_threads++;
- int status = (result_host);
- if (status) {
+ if (status) {
#if NUMA_MEMCPY
- printf("[::] BS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%lu"
- " numa_node_in=%d numa_node_local=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d"
- " | throughput_MBps=%f throughput_MOpps=%f"
- " latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
- nr_threads, "uint64_t", input_size,
- numa_node_in, numa_node_local, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu),
- n_querys * sizeof(DTYPE) / timer.time[0], n_querys / timer.time[0],
- timer.time[0], timer.time[1], timer.time[2], timer.time[3],
- timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+ printf
+ ("[::] BS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%lu"
+ " numa_node_in=%d numa_node_local=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d"
+ " | throughput_MBps=%f throughput_MOpps=%f"
+ " latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ nr_threads, "uint64_t", input_size, numa_node_in,
+ numa_node_local, numa_node_cpu, numa_node_cpu_memcpy,
+ numa_distance(numa_node_in, numa_node_cpu),
+ n_querys * sizeof(DTYPE) / timer.time[0],
+ n_querys / timer.time[0], timer.time[0], timer.time[1],
+ timer.time[2], timer.time[3],
+ timer.time[0] + timer.time[1] + timer.time[2] +
+ timer.time[3]);
#else
- printf("[::] BS-CPU | n_threads=%d e_type=%s n_elements=%lu"
+ printf("[::] BS-CPU | n_threads=%d e_type=%s n_elements=%lu"
#if NUMA
- " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d"
+ " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d"
#endif
- " | throughput_MBps=%f",
- nr_threads, "uint64_t", input_size,
+ " | throughput_MBps=%f",
+ nr_threads, "uint64_t", input_size,
#if NUMA
- numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu),
+ numa_node_in, numa_node_cpu, numa_distance(numa_node_in,
+ numa_node_cpu),
#endif
- n_querys * sizeof(DTYPE) / timer.time[0]);
- printf(" throughput_MOpps=%f latency_us=%f\n",
- n_querys / timer.time[0], timer.time[0]);
+ n_querys * sizeof(DTYPE) / timer.time[0]);
+ printf(" throughput_MOpps=%f latency_us=%f\n",
+ n_querys / timer.time[0], timer.time[0]);
#endif
- } else {
- printf("[ERROR]\n");
- }
+ } else {
+ printf("[ERROR]\n");
+ }
+#endif // WITH_BENCHMARK
#if NUMA
- numa_free(input, input_size * sizeof(DTYPE));
- numa_free(querys, n_querys * sizeof(DTYPE));
+ numa_free(input, input_size * sizeof(DTYPE));
+ numa_free(querys, n_querys * sizeof(DTYPE));
#else
- free(input);
- free(querys);
+ free(input);
+ free(querys);
#endif
-
- return status ? 0 : 1;
+ return status ? 0 : 1;
}
-
diff --git a/BS/baselines/cpu/run-perf.sh b/BS/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..5b671e0
--- /dev/null
+++ b/BS/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+
+make -B numa=1
+
+OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./bs_omp $((2**29)) 16777216 4 4
+OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./bs_omp $((2**29)) 16777216 4 4
diff --git a/BS/dimes-hetsim-hbm.sh b/BS/dimes-hetsim-hbm.sh
index 4e1500d..4a775ae 100755
--- a/BS/dimes-hetsim-hbm.sh
+++ b/BS/dimes-hetsim-hbm.sh
@@ -1,7 +1,7 @@
#!/bin/bash
cd baselines/cpu
-make -B NUMA=1
+make -B numa=1
mkdir -p log/$(hostname)
fn=log/$(hostname)/dimes-hetsim-hbm
diff --git a/BS/dimes-hetsim-nmc.sh b/BS/dimes-hetsim-nmc.sh
index 195334b..fa697bf 100755
--- a/BS/dimes-hetsim-nmc.sh
+++ b/BS/dimes-hetsim-nmc.sh
@@ -3,6 +3,8 @@
mkdir -p log/$(hostname) baselines/cpu/log/$(hostname)
fn=log/$(hostname)/dimes-hetsim-nmc
+source /opt/upmem/upmem-2024.1.0-Linux-x86_64/upmem_env.sh
+
# upstream DPU version uses 2048576 * uint64 ≈ 16 MiB (DPU max: 64 MiB)
# upstream DPU version uses 2 queries
input_size_upstream=2048576
@@ -11,6 +13,8 @@ num_queries_upstream=2
input_size_dpu=$(perl -E 'say 2 ** 22')
num_queries_dpu=1048576
+# Make sure that num_queries > input_size!
+
run_benchmark_nmc() {
local "$@"
set -e
@@ -69,7 +73,7 @@ cd baselines/cpu
(
-make -B NUMA=1 NUMA_MEMCPY=1
+make -B numa=1 numa_memcpy=1
echo "CPU single-node upstream-ref with memcpy, copy node == input node (1/6)" >&2
@@ -97,7 +101,7 @@ parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
:::+ cpu 0 1 \
::: nr_threads 1 2 4 8 12 16
-make -B NUMA=1
+make -B numa=1
echo "CPU single-node upstream-ref (3/6)" >&2
diff --git a/BS/dpu/task.c b/BS/dpu/task.c
index acf66f2..5881dd1 100644
--- a/BS/dpu/task.c
+++ b/BS/dpu/task.c
@@ -17,140 +17,168 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
__host dpu_results_t DPU_RESULTS[NR_TASKLETS];
// Search
-static DTYPE search(DTYPE *bufferA, DTYPE searching_for, size_t search_size) {
- DTYPE found = -2;
- if(bufferA[0] <= searching_for)
- {
- found = -1;
- for (uint32_t i = 0; i < search_size / sizeof(DTYPE); i++){
- if(bufferA[i] == searching_for)
- {
- found = i;
- break;
- }
- }
- }
- return found;
+static DTYPE search(DTYPE *bufferA, DTYPE searching_for, size_t search_size)
+{
+ DTYPE found = -2;
+ if (bufferA[0] <= searching_for) {
+ found = -1;
+ for (uint32_t i = 0; i < search_size / sizeof(DTYPE); i++) {
+ if (bufferA[i] == searching_for) {
+ found = i;
+ break;
+ }
+ }
+ }
+ return found;
}
BARRIER_INIT(my_barrier, NR_TASKLETS);
extern int main_kernel1(void);
-int(*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
-int main(void){
- // Kernel
- return kernels[DPU_INPUT_ARGUMENTS.kernel]();
+int main(void)
+{
+ // Kernel
+ return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
}
// main_kernel1
-int main_kernel1() {
- unsigned int tasklet_id = me();
- #if PRINT
- printf("tasklet_id = %u\n", tasklet_id);
- #endif
- if(tasklet_id == 0){
- mem_reset(); // Reset the heap
- }
- // Barrier
- barrier_wait(&my_barrier);
-
- DTYPE searching_for, found;
- uint64_t input_size = DPU_INPUT_ARGUMENTS.input_size;
-
- // Address of the current processing block in MRAM
- uint32_t start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
- uint32_t start_mram_block_addr_aux = start_mram_block_addr_A;
- uint32_t end_mram_block_addr_A = start_mram_block_addr_A + sizeof(DTYPE) * input_size;
- uint32_t current_mram_block_addr_query = end_mram_block_addr_A + tasklet_id * (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS) * sizeof(DTYPE);
-
- // Initialize a local cache to store the MRAM block
- DTYPE *cache_A = (DTYPE *) mem_alloc(BLOCK_SIZE);
- DTYPE *cache_aux_A = (DTYPE *) mem_alloc(BLOCK_SIZE);
- DTYPE *cache_aux_B = (DTYPE *) mem_alloc(BLOCK_SIZE);
-
- dpu_results_t *result = &DPU_RESULTS[tasklet_id];
-
- for(uint64_t targets = 0; targets < (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS); targets++)
- {
- found = -1;
-
- mram_read((__mram_ptr void const *) current_mram_block_addr_query, &searching_for, 8);
- current_mram_block_addr_query += 8;
-
- // Initialize input vector boundaries
- start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
- start_mram_block_addr_aux = start_mram_block_addr_A;
- end_mram_block_addr_A = start_mram_block_addr_A + sizeof(DTYPE) * input_size;
-
- uint32_t current_mram_block_addr_A = start_mram_block_addr_A;
-
- // Bring first and last values to WRAM
- mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_aux_A, BLOCK_SIZE);
- mram_read((__mram_ptr void const *) (end_mram_block_addr_A - BLOCK_SIZE * sizeof(DTYPE)), cache_aux_B, BLOCK_SIZE);
-
- while(1)
- {
- // Locate the address of the mid mram block
- current_mram_block_addr_A = (start_mram_block_addr_A + end_mram_block_addr_A) / 2;
- current_mram_block_addr_A &= WORD_MASK;
-
- // Boundary check
- if(current_mram_block_addr_A < (start_mram_block_addr_A + BLOCK_SIZE))
- {
- // Search inside (start_mram_block_addr_A, start_mram_block_addr_A + BLOCK_SIZE)
- mram_read((__mram_ptr void const *) start_mram_block_addr_A, cache_A, BLOCK_SIZE);
- found = search(cache_A, searching_for, BLOCK_SIZE);
-
- if(found > -1)
- {
- result->found = found + (start_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE);
- }
- // Search inside (start_mram_block_addr_A + BLOCK_SIZE, end_mram_block_addr_A)
- else
- {
- size_t remain_bytes_to_search = end_mram_block_addr_A - (start_mram_block_addr_A + BLOCK_SIZE);
- mram_read((__mram_ptr void const *) start_mram_block_addr_A + BLOCK_SIZE, cache_A, remain_bytes_to_search);
- found = search(cache_A, searching_for, remain_bytes_to_search);
-
- if(found > -1)
- {
- result->found = found + (start_mram_block_addr_A + BLOCK_SIZE - start_mram_block_addr_aux) / sizeof(DTYPE);
- }
- else
- {
- printf("%lld NOT found\n", searching_for);
- }
+int main_kernel1()
+{
+ unsigned int tasklet_id = me();
+#if PRINT
+ printf("tasklet_id = %u\n", tasklet_id);
+#endif
+ if (tasklet_id == 0) {
+ mem_reset(); // Reset the heap
+ }
+ // Barrier
+ barrier_wait(&my_barrier);
+
+ DTYPE searching_for, found;
+ uint64_t input_size = DPU_INPUT_ARGUMENTS.input_size;
+
+ // Address of the current processing block in MRAM
+ uint32_t start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
+ uint32_t start_mram_block_addr_aux = start_mram_block_addr_A;
+ uint32_t end_mram_block_addr_A =
+ start_mram_block_addr_A + sizeof(DTYPE) * input_size;
+ uint32_t current_mram_block_addr_query =
+ end_mram_block_addr_A +
+ tasklet_id * (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS) *
+ sizeof(DTYPE);
+
+ // Initialize a local cache to store the MRAM block
+ DTYPE *cache_A = (DTYPE *) mem_alloc(BLOCK_SIZE);
+ DTYPE *cache_aux_A = (DTYPE *) mem_alloc(BLOCK_SIZE);
+ DTYPE *cache_aux_B = (DTYPE *) mem_alloc(BLOCK_SIZE);
+
+ dpu_results_t *result = &DPU_RESULTS[tasklet_id];
+
+ for (uint64_t targets = 0;
+ targets < (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS);
+ targets++) {
+ found = -1;
+
+ mram_read((__mram_ptr void const *)
+ current_mram_block_addr_query, &searching_for, 8);
+ current_mram_block_addr_query += 8;
+
+ // Initialize input vector boundaries
+ start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
+ start_mram_block_addr_aux = start_mram_block_addr_A;
+ end_mram_block_addr_A =
+ start_mram_block_addr_A + sizeof(DTYPE) * input_size;
+
+ uint32_t current_mram_block_addr_A = start_mram_block_addr_A;
+
+ // Bring first and last values to WRAM
+ mram_read((__mram_ptr void const *)current_mram_block_addr_A,
+ cache_aux_A, BLOCK_SIZE);
+ mram_read((__mram_ptr void const *)(end_mram_block_addr_A -
+ BLOCK_SIZE * sizeof(DTYPE)),
+ cache_aux_B, BLOCK_SIZE);
+
+ while (1) {
+ // Locate the address of the mid mram block
+ current_mram_block_addr_A =
+ (start_mram_block_addr_A +
+ end_mram_block_addr_A) / 2;
+ current_mram_block_addr_A &= WORD_MASK;
+
+ // Boundary check
+ if (current_mram_block_addr_A <
+ (start_mram_block_addr_A + BLOCK_SIZE)) {
+ // Search inside (start_mram_block_addr_A, start_mram_block_addr_A + BLOCK_SIZE)
+ mram_read((__mram_ptr void const *)
+ start_mram_block_addr_A, cache_A,
+ BLOCK_SIZE);
+ found =
+ search(cache_A, searching_for, BLOCK_SIZE);
+
+ if (found > -1) {
+ result->found =
+ found + (start_mram_block_addr_A -
+ start_mram_block_addr_aux)
+ / sizeof(DTYPE);
+ }
+ // Search inside (start_mram_block_addr_A + BLOCK_SIZE, end_mram_block_addr_A)
+ else {
+ size_t remain_bytes_to_search =
+ end_mram_block_addr_A -
+ (start_mram_block_addr_A +
+ BLOCK_SIZE);
+ mram_read((__mram_ptr void const *)
+ start_mram_block_addr_A +
+ BLOCK_SIZE, cache_A,
+ remain_bytes_to_search);
+ found =
+ search(cache_A, searching_for,
+ remain_bytes_to_search);
+
+ if (found > -1) {
+ result->found =
+ found +
+ (start_mram_block_addr_A +
+ BLOCK_SIZE -
+ start_mram_block_addr_aux)
+ / sizeof(DTYPE);
+ } else {
+ printf("%lld NOT found\n",
+ searching_for);
+ }
+ }
+ break;
+ }
+ // Load cache with current MRAM block
+ mram_read((__mram_ptr void const *)
+ current_mram_block_addr_A, cache_A,
+ BLOCK_SIZE);
+
+ // Search inside block
+ found = search(cache_A, searching_for, BLOCK_SIZE);
+
+ // If found > -1, we found the searching_for query
+ if (found > -1) {
+ result->found =
+ found + (current_mram_block_addr_A -
+ start_mram_block_addr_aux) /
+ sizeof(DTYPE);
+ break;
+ }
+ // If found == -2, we need to discard right part of the input vector
+ if (found == -2) {
+ end_mram_block_addr_A =
+ current_mram_block_addr_A;
+ }
+ // If found == -1, we need to discard left part of the input vector
+ else if (found == -1) {
+ start_mram_block_addr_A =
+ current_mram_block_addr_A;
+ }
+ }
}
- break;
- }
-
- // Load cache with current MRAM block
- mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_A, BLOCK_SIZE);
-
- // Search inside block
- found = search(cache_A, searching_for, BLOCK_SIZE);
-
- // If found > -1, we found the searching_for query
- if(found > -1)
- {
- result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE);
- break;
- }
-
- // If found == -2, we need to discard right part of the input vector
- if(found == -2)
- {
- end_mram_block_addr_A = current_mram_block_addr_A;
- }
-
- // If found == -1, we need to discard left part of the input vector
- else if (found == -1)
- {
- start_mram_block_addr_A = current_mram_block_addr_A;
- }
- }
- }
- return 0;
+ return 0;
}
diff --git a/BS/host/app.c b/BS/host/app.c
index 10d76f1..217ea99 100644
--- a/BS/host/app.c
+++ b/BS/host/app.c
@@ -31,24 +31,28 @@
#define DPU_BINARY "./bin/bs_dpu"
// Create input arrays
-void create_test_file(DTYPE * input, DTYPE * querys, uint64_t nr_elements, uint64_t nr_querys) {
+void create_test_file(DTYPE *input, DTYPE *querys, uint64_t nr_elements,
+ uint64_t nr_querys)
+{
+
+ srand(time(NULL));
input[0] = 1;
for (uint64_t i = 1; i < nr_elements; i++) {
- input[i] = input[i - 1] + 1;
+ input[i] = input[i - 1] + (rand() % 10) + 1;
}
for (uint64_t i = 0; i < nr_querys; i++) {
- querys[i] = i;
+ querys[i] = input[rand() % nr_elements];
}
}
// Compute output in the host
-int64_t binarySearch(DTYPE * input, DTYPE * querys, DTYPE input_size, uint64_t num_querys)
+int64_t binarySearch(DTYPE *input, DTYPE *querys, DTYPE input_size,
+ uint64_t num_querys)
{
uint64_t result = -1;
DTYPE r;
- for(uint64_t q = 0; q < num_querys; q++)
- {
+ for (uint64_t q = 0; q < num_querys; q++) {
DTYPE l = 0;
r = input_size;
while (l <= r) {
@@ -57,92 +61,96 @@ int64_t binarySearch(DTYPE * input, DTYPE * querys, DTYPE input_size, uint64_t n
// XXX shouldn't this short-circuit?
// Check if x is present at mid
if (input[m] == querys[q])
- result = m;
+ result = m;
// If x greater, ignore left half
if (input[m] < querys[q])
- l = m + 1;
+ l = m + 1;
// If x is smaller, ignore right half
else
- r = m - 1;
+ r = m - 1;
}
}
return result;
}
-
// Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
struct Params p = input_params(argc, argv);
struct dpu_set_t dpu_set, dpu;
uint32_t nr_of_dpus;
- uint32_t nr_of_ranks;
+ uint32_t nr_of_ranks;
uint64_t input_size = INPUT_SIZE;
uint64_t num_querys = p.num_querys;
DTYPE result_host = -1;
- DTYPE result_dpu = -1;
+ DTYPE result_dpu = -1;
- // Timer declaration
- Timer timer;
+ // Timer declaration
+ Timer timer;
- int numa_node_rank = -2;
+ int numa_node_rank = -2;
- // Allocate DPUs and load binary
+ // Allocate DPUs and load binary
#if !WITH_ALLOC_OVERHEAD
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- timer.time[0] = 0; // alloc
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ timer.time[0] = 0; // alloc
#endif
#if !WITH_LOAD_OVERHEAD
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
- timer.time[1] = 0; // load
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
+ timer.time[1] = 0; // load
#endif
#if !WITH_FREE_OVERHEAD
- timer.time[6] = 0; // free
+ timer.time[6] = 0; // free
#endif
- #if ENERGY
+#if ENERGY
struct dpu_probe_t probe;
DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
- #endif
+#endif
// Query number adjustement for proper partitioning
- if(num_querys % (NR_DPUS * NR_TASKLETS))
- num_querys = num_querys + (NR_DPUS * NR_TASKLETS - num_querys % (NR_DPUS * NR_TASKLETS));
+ if (num_querys % (NR_DPUS * NR_TASKLETS))
+ num_querys =
+ num_querys + (NR_DPUS * NR_TASKLETS -
+ num_querys % (NR_DPUS * NR_TASKLETS));
- assert(num_querys % (NR_DPUS * NR_TASKLETS) == 0 && "Input dimension"); // Allocate input and querys vectors
+ assert(num_querys % (NR_DPUS * NR_TASKLETS) == 0 && "Input dimension"); // Allocate input and querys vectors
- DTYPE * input = malloc((input_size) * sizeof(DTYPE));
- DTYPE * querys = malloc((num_querys) * sizeof(DTYPE));
+ DTYPE *input = malloc((input_size) * sizeof(DTYPE));
+ DTYPE *querys = malloc((num_querys) * sizeof(DTYPE));
// Create an input file with arbitrary data
create_test_file(input, querys, input_size, num_querys);
// Create kernel arguments
- uint64_t slice_per_dpu = num_querys / NR_DPUS;
- dpu_arguments_t input_arguments = {input_size, slice_per_dpu, 0};
+ uint64_t slice_per_dpu = num_querys / NR_DPUS;
+ dpu_arguments_t input_arguments = { input_size, slice_per_dpu, 0 };
for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
// Perform input transfers
uint64_t i = 0;
#if WITH_ALLOC_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 0, 0);
}
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 0);
}
#endif
#if WITH_DPUINFO
printf("DPUs:");
- DPU_FOREACH (dpu_set, dpu) {
- int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
+ DPU_FOREACH(dpu_set, dpu) {
+ int rank =
+ dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+ DPU_TARGET_MASK;
int slice = dpu_get_slice_id(dpu_from_set(dpu));
int member = dpu_get_member_id(dpu_from_set(dpu));
printf(" %d(%d.%d)", rank, slice, member);
@@ -150,11 +158,11 @@ int main(int argc, char **argv) {
printf("\n");
#endif
#if WITH_LOAD_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 1, 0);
}
DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 1);
}
DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
@@ -164,27 +172,35 @@ int main(int argc, char **argv) {
// int prev_rank_id = -1;
int rank_id = -1;
- DPU_FOREACH (dpu_set, dpu) {
- rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
- if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
+ DPU_FOREACH(dpu_set, dpu) {
+ rank_id =
+ dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+ DPU_TARGET_MASK;
+ if ((numa_node_rank != -2)
+ && numa_node_rank !=
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)))) {
numa_node_rank = -1;
} else {
- numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
+ numa_node_rank =
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)));
}
/*
- if (rank_id != prev_rank_id) {
- printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
- prev_rank_id = rank_id;
- }
- */
+ if (rank_id != prev_rank_id) {
+ printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+ prev_rank_id = rank_id;
+ }
+ */
}
// Compute host solution
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 2, 0);
}
- result_host = binarySearch(input, querys, input_size - 1, num_querys);
- if(rep >= p.n_warmup) {
+ result_host =
+ binarySearch(input, querys, input_size - 1, num_querys);
+ if (rep >= p.n_warmup) {
stop(&timer, 2);
}
@@ -192,103 +208,110 @@ int main(int argc, char **argv) {
start(&timer, 3, 0);
}
- DPU_FOREACH(dpu_set, dpu, i)
- {
+ DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+ sizeof(input_arguments), DPU_XFER_DEFAULT));
i = 0;
- DPU_FOREACH(dpu_set, dpu, i)
- {
+ DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, input));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size * sizeof(DTYPE), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ input_size * sizeof(DTYPE), DPU_XFER_DEFAULT));
i = 0;
- DPU_FOREACH(dpu_set, dpu, i)
- {
- DPU_ASSERT(dpu_prepare_xfer(dpu, querys + slice_per_dpu * i));
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, querys + slice_per_dpu * i));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size * sizeof(DTYPE), slice_per_dpu * sizeof(DTYPE), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ input_size * sizeof(DTYPE),
+ slice_per_dpu * sizeof(DTYPE), DPU_XFER_DEFAULT));
if (rep >= p.n_warmup) {
stop(&timer, 3);
}
-
// Run kernel on DPUs
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
start(&timer, 4, 0);
- #if ENERGY
+#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
- #endif
+#endif
}
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
stop(&timer, 4);
- #if ENERGY
+#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
- #endif
+#endif
}
// Print logs if required
- #if PRINT
+#if PRINT
unsigned int each_dpu = 0;
printf("Display DPU Logs\n");
- DPU_FOREACH(dpu_set, dpu)
- {
+ DPU_FOREACH(dpu_set, dpu) {
printf("DPU#%d:\n", each_dpu);
DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
each_dpu++;
}
- #endif
+#endif
// Retrieve results
- dpu_results_t* results_retrieve[NR_DPUS];
+ dpu_results_t *results_retrieve[NR_DPUS];
if (rep >= p.n_warmup) {
start(&timer, 5, 0);
}
i = 0;
- DPU_FOREACH(dpu_set, dpu, i)
- {
- results_retrieve[i] = (dpu_results_t*)malloc(NR_TASKLETS * sizeof(dpu_results_t));
+ DPU_FOREACH(dpu_set, dpu, i) {
+ results_retrieve[i] =
+ (dpu_results_t *) malloc(NR_TASKLETS *
+ sizeof(dpu_results_t));
DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i]));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_results_t), DPU_XFER_DEFAULT));
-
- DPU_FOREACH(dpu_set, dpu, i)
- {
- for(unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++)
- {
- if(results_retrieve[i][each_tasklet].found > result_dpu)
- {
- result_dpu = results_retrieve[i][each_tasklet].found;
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0,
+ NR_TASKLETS * sizeof(dpu_results_t),
+ DPU_XFER_DEFAULT));
+
+ DPU_FOREACH(dpu_set, dpu, i) {
+ for (unsigned int each_tasklet = 0;
+ each_tasklet < NR_TASKLETS; each_tasklet++) {
+ if (results_retrieve[i][each_tasklet].found >
+ result_dpu) {
+ result_dpu =
+ results_retrieve[i][each_tasklet].
+ found;
}
}
free(results_retrieve[i]);
}
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 5);
}
-
#if WITH_ALLOC_OVERHEAD
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 6, 0);
}
#endif
DPU_ASSERT(dpu_free(dpu_set));
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 6);
}
#endif
@@ -296,58 +319,91 @@ int main(int argc, char **argv) {
int status = (result_dpu == result_host);
if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n");
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+ "] results are equal\n");
if (rep >= p.n_warmup) {
- printf("[::] BS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu",
- NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(DTYPE), BLOCK_SIZE, input_size);
- printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
- WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
- printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
- timer.time[0],
- timer.time[1],
- timer.time[2],
- timer.time[3],
- timer.time[4],
- timer.time[5],
- timer.time[6]);
- printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
- num_querys * sizeof(DTYPE) / timer.time[2],
- num_querys * sizeof(DTYPE) / (timer.time[4]),
- num_querys * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
- printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
- num_querys * sizeof(DTYPE) / (timer.time[3] + timer.time[4] + timer.time[5]),
- num_querys * sizeof(DTYPE) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
- num_querys * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
- printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
- num_querys / timer.time[2],
- num_querys / (timer.time[4]),
- num_querys / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
- printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
- num_querys / (timer.time[3] + timer.time[4] + timer.time[5]),
- num_querys / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
- num_querys / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
+ printf
+ ("[::] BS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu",
+ NR_DPUS, nr_of_ranks, NR_TASKLETS,
+ XSTR(DTYPE), BLOCK_SIZE, input_size);
+ printf
+ (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+ WITH_FREE_OVERHEAD, numa_node_rank);
+ printf
+ ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+ timer.time[0], timer.time[1],
+ timer.time[2], timer.time[3],
+ timer.time[4], timer.time[5],
+ timer.time[6]);
+ printf
+ (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+ num_querys * sizeof(DTYPE) / timer.time[2],
+ num_querys * sizeof(DTYPE) /
+ (timer.time[4]),
+ num_querys * sizeof(DTYPE) /
+ (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[4] +
+ timer.time[5] + timer.time[6]));
+ printf
+ (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+ num_querys * sizeof(DTYPE) /
+ (timer.time[3] + timer.time[4] +
+ timer.time[5]),
+ num_querys * sizeof(DTYPE) /
+ (timer.time[1] + timer.time[3] +
+ timer.time[4] + timer.time[5]),
+ num_querys * sizeof(DTYPE) /
+ (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[4] +
+ timer.time[5]));
+ printf
+ (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+ num_querys / timer.time[2],
+ num_querys / (timer.time[4]),
+ num_querys / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5] +
+ timer.time[6]));
+ printf
+ (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+ num_querys / (timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ num_querys / (timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ num_querys / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]));
}
} else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] results differ!\n");
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+ "] results differ!\n");
}
}
// Print timing results
/*
- printf("CPU Version Time (ms): ");
- print(&timer, 0, p.n_reps);
- printf("CPU-DPU Time (ms): ");
- print(&timer, 1, p.n_reps);
- printf("DPU Kernel Time (ms): ");
- print(&timer, 2, p.n_reps);
- printf("DPU-CPU Time (ms): ");
- print(&timer, 3, p.n_reps);
- */
-
- #if ENERGY
+ printf("CPU Version Time (ms): ");
+ print(&timer, 0, p.n_reps);
+ printf("CPU-DPU Time (ms): ");
+ print(&timer, 1, p.n_reps);
+ printf("DPU Kernel Time (ms): ");
+ print(&timer, 2, p.n_reps);
+ printf("DPU-CPU Time (ms): ");
+ print(&timer, 3, p.n_reps);
+ */
+
+#if ENERGY
double energy;
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
printf("DPU Energy (J): %f\t", energy * num_iterations);
- #endif
+#endif
free(input);
#if !WITH_ALLOC_OVERHEAD
diff --git a/BS/support/common.h b/BS/support/common.h
index dbd050c..54adc39 100755
--- a/BS/support/common.h
+++ b/BS/support/common.h
@@ -38,7 +38,7 @@ typedef struct {
// Structures used by both the host and the dpu to communicate information
typedef struct {
- DTYPE found;
+ DTYPE found;
} dpu_results_t;
#ifndef ENERGY
diff --git a/BS/support/params.h b/BS/support/params.h
index 02bd750..c91202f 100644
--- a/BS/support/params.h
+++ b/BS/support/params.h
@@ -4,49 +4,56 @@
#include "common.h"
typedef struct Params {
- long num_querys;
- unsigned n_warmup;
- unsigned n_reps;
-}Params;
+ long num_querys;
+ unsigned n_warmup;
+ unsigned n_reps;
+} Params;
-void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -i <I> problem size (default=2 queries)"
- "\n");
- }
+void usage()
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -w <W> # of untimed warmup iterations (default=1)"
+ "\n -e <E> # of timed repetition iterations (default=3)"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -i <I> problem size (default=2 queries)" "\n");
+}
- struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.num_querys = PROBLEM_SIZE;
- p.n_warmup = 1;
- p.n_reps = 3;
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.num_querys = PROBLEM_SIZE;
+ p.n_warmup = 1;
+ p.n_reps = 3;
- int opt;
- while((opt = getopt(argc, argv, "h:i:w:e:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'i': p.num_querys = atol(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(NR_DPUS > 0 && "Invalid # of dpus!");
+ int opt;
+ while ((opt = getopt(argc, argv, "h:i:w:e:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'i':
+ p.num_querys = atol(optarg);
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+ assert(NR_DPUS > 0 && "Invalid # of dpus!");
- return p;
- }
- #endif
+ return p;
+}
+#endif
diff --git a/BS/support/timer.h b/BS/support/timer.h
index ff1ae1b..256447a 100755
--- a/BS/support/timer.h
+++ b/BS/support/timer.h
@@ -1,66 +1,71 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+typedef struct Timer {
+ struct timeval startTime[7];
+ struct timeval stopTime[7];
+ double time[7];
+} Timer;
+
+void start(Timer *timer, int i, int rep)
+{
+ if (rep == 0) {
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i)
+{
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] +=
+ (timer->stopTime[i].tv_sec -
+ timer->startTime[i].tv_sec) * 1000000.0 +
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+}
+
+void print(Timer *timer, int i, int REP)
+{
+ printf("%f\t", timer->time[i] / (1000 * REP));
+}
+
+void printall(Timer *timer, int maxt)
+{
+ for (int i = 0; i <= maxt; i++) {
+ printf(" timer%d_us=%f", i, timer->time[i]);
+ }
+ printf("\n");
+}
diff --git a/COUNT/baselines/cpu/Makefile b/COUNT/baselines/cpu/Makefile
index 4608944..ede0498 100644
--- a/COUNT/baselines/cpu/Makefile
+++ b/COUNT/baselines/cpu/Makefile
@@ -1,8 +1,23 @@
-NUMA ?= 0
-FLAGS =
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+numa_memcpy ?= 0
-ifeq (${NUMA}, 1)
- FLAGS += -lnuma
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+ CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+ LDFLAGS += -lnuma
endif
.PHONY: all
@@ -11,7 +26,7 @@ all: count
TYPE ?= uint64_t
count: app_baseline.c
- gcc -Wall -Wextra -pedantic -march=native -O2 -o count -fopenmp -DT=${TYPE} -DNUMA=${NUMA} app_baseline.c ${FLAGS}
+ gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -o count -DT=${TYPE} -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} app_baseline.c -fopenmp ${LDFLAGS}
.PHONY: run
run: count
@@ -19,4 +34,4 @@ run: count
.PHONY: clean
clean:
- rm -f count count_O0 count_O2
+ rm -f count
diff --git a/COUNT/baselines/cpu/app_baseline.c b/COUNT/baselines/cpu/app_baseline.c
index d52257a..13e3f51 100644
--- a/COUNT/baselines/cpu/app_baseline.c
+++ b/COUNT/baselines/cpu/app_baseline.c
@@ -12,13 +12,19 @@
#include <assert.h>
#include <stdint.h>
#include <omp.h>
+
+#if WITH_BENCHMARK
#include "../../support/timer.h"
+#else
+#define start(...)
+#define stop(...)
+#endif
#if NUMA
#include <numaif.h>
#include <numa.h>
-void* mp_pages[1];
+void *mp_pages[1];
int mp_status[1];
int mp_nodes[1];
int numa_node_in = -1;
@@ -37,71 +43,70 @@ volatile int total_count;
// Params ---------------------------------------------------------------------
typedef struct Params {
- char* dpu_type;
- int input_size;
- int n_warmup;
- int n_reps;
- int n_threads;
+ char *dpu_type;
+ int input_size;
+ int n_warmup;
+ int n_reps;
+ int n_threads;
#if NUMA
- struct bitmask* bitmask_in;
- struct bitmask* bitmask_out;
- int numa_node_cpu;
+ struct bitmask *bitmask_in;
+ struct bitmask *bitmask_out;
+ int numa_node_cpu;
#endif
-}Params;
+} Params;
struct Params p;
static T *A;
-bool pred(const T x){
- return (x % 2) == 0;
+bool pred(const T x)
+{
+ return (x % 2) == 0;
}
-
-void create_test_file(unsigned int nr_elements) {
- //srand(0);
+void create_test_file(unsigned int nr_elements)
+{
+ //srand(0);
#if NUMA
- if (p.bitmask_in) {
- numa_set_membind(p.bitmask_in);
- numa_free_nodemask(p.bitmask_in);
- }
- A = (T*) numa_alloc(nr_elements * sizeof(T));
+ if (p.bitmask_in) {
+ numa_set_membind(p.bitmask_in);
+ numa_free_nodemask(p.bitmask_in);
+ }
+ A = (T *) numa_alloc(nr_elements * sizeof(T));
#else
- A = (T*) malloc(nr_elements * sizeof(T));
+ A = (T *) malloc(nr_elements * sizeof(T));
#endif
#if NUMA
- struct bitmask *bitmask_all = numa_allocate_nodemask();
- numa_bitmask_setall(bitmask_all);
- numa_set_membind(bitmask_all);
- numa_free_nodemask(bitmask_all);
+ struct bitmask *bitmask_all = numa_allocate_nodemask();
+ numa_bitmask_setall(bitmask_all);
+ numa_set_membind(bitmask_all);
+ numa_free_nodemask(bitmask_all);
#endif
- for (unsigned int i = 0; i < nr_elements; i++) {
- //A[i] = (unsigned int) (rand());
- A[i] = i+1;
- }
+ for (unsigned int i = 0; i < nr_elements; i++) {
+ //A[i] = (unsigned int) (rand());
+ A[i] = i + 1;
+ }
#if NUMA
- mp_pages[0] = A;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_in = mp_status[0];
- }
-
- numa_node_cpu = p.numa_node_cpu;
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_in = mp_status[0];
+ }
+
+ numa_node_cpu = p.numa_node_cpu;
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
#endif
}
@@ -109,116 +114,152 @@ void create_test_file(unsigned int nr_elements) {
/**
* @brief compute output in the host
*/
-static int count_host(int size, int t) {
- int count = 0;
-
- omp_set_num_threads(t);
- #pragma omp parallel for reduction(+:count)
- for(int my = 0; my < size; my++) {
- if(!pred(A[my])) {
- count++;
- }
- }
- return count;
+static int count_host(int size, int t)
+{
+ int count = 0;
+
+ omp_set_num_threads(t);
+#pragma omp parallel for reduction(+:count)
+ for (int my = 0; my < size; my++) {
+ if (!pred(A[my])) {
+ count++;
+ }
+ }
+ return count;
}
-void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -d <D> DPU type (default=fsim)"
- "\n -t <T> # of threads (default=8)"
- "\n -w <W> # of untimed warmup iterations (default=2)"
- "\n -e <E> # of timed repetition iterations (default=5)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -i <I> input size (default=8M elements)"
- "\n");
+void usage()
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -d <D> DPU type (default=fsim)"
+ "\n -t <T> # of threads (default=8)"
+ "\n -w <W> # of untimed warmup iterations (default=2)"
+ "\n -e <E> # of timed repetition iterations (default=5)"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -i <I> input size (default=8M elements)" "\n");
}
-void input_params(int argc, char **argv) {
- p.input_size = 16 << 20;
- p.n_warmup = 1;
- p.n_reps = 3;
- p.n_threads = 5;
+void input_params(int argc, char **argv)
+{
+ p.input_size = 16 << 20;
+ p.n_warmup = 1;
+ p.n_reps = 3;
+ p.n_threads = 5;
#if NUMA
- p.bitmask_in = NULL;
- p.bitmask_out = NULL;
- p.numa_node_cpu = -1;
+ p.bitmask_in = NULL;
+ p.bitmask_out = NULL;
+ p.numa_node_cpu = -1;
#endif
- int opt;
- while((opt = getopt(argc, argv, "hi:w:e:t:a:b:c:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'i': p.input_size = atoi(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- case 't': p.n_threads = atoi(optarg); break;
+ int opt;
+ while ((opt = getopt(argc, argv, "hi:w:e:t:a:b:c:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'i':
+ p.input_size = atoi(optarg);
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ case 't':
+ p.n_threads = atoi(optarg);
+ break;
#if NUMA
- case 'a': p.bitmask_in = numa_parse_nodestring(optarg); break;
- case 'b': p.bitmask_out = numa_parse_nodestring(optarg); break;
- case 'c': p.numa_node_cpu = atoi(optarg); break;
+ case 'a':
+ p.bitmask_in = numa_parse_nodestring(optarg);
+ break;
+ case 'b':
+ p.bitmask_out = numa_parse_nodestring(optarg);
+ break;
+ case 'c':
+ p.numa_node_cpu = atoi(optarg);
+ break;
#endif
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(p.n_threads > 0 && "Invalid # of ranks!");
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+ assert(p.n_threads > 0 && "Invalid # of ranks!");
}
/**
* @brief Main of the Host Application.
*/
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
- input_params(argc, argv);
+ input_params(argc, argv);
- const unsigned int file_size = p.input_size;
+ const unsigned int file_size = p.input_size;
- // Create an input file with arbitrary data.
- create_test_file(file_size);
+ // Allocate the input array in memory and fill it with test data.
+ create_test_file(file_size);
- Timer timer;
+#if WITH_BENCHMARK
+ Timer timer;
+#endif
+
+#if NOP_SYNC
+ for (int rep = 0; rep < 200000; rep++) {
+ asm volatile ("nop"::);
+ }
+#endif
- for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
- start(&timer, 0, 0);
- total_count = count_host(file_size, p.n_threads);
- stop(&timer, 0);
+ for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+ start(&timer, 0, 0);
+ total_count = count_host(file_size, p.n_threads);
+ stop(&timer, 0);
- unsigned int nr_threads = 0;
+#if WITH_BENCHMARK
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
+ nr_threads++;
- if (rep >= p.n_warmup) {
- printf("[::] COUNT-CPU | n_threads=%d e_type=%s n_elements=%d"
+ if (rep >= p.n_warmup) {
+ printf
+ ("[::] COUNT-CPU | n_threads=%d e_type=%s n_elements=%d"
#if NUMA
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
#endif
- " | throughput_MBps=%f",
- nr_threads, XSTR(T), file_size,
+ " | throughput_MBps=%f",
+ nr_threads, XSTR(T), file_size,
#if NUMA
- numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ numa_node_in, numa_node_out, numa_node_cpu,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
+#endif
+ file_size * 2 * sizeof(T) / timer.time[0]);
+ printf(" throughput_MOpps=%f",
+ file_size / timer.time[0]);
+ printall(&timer, 0);
+ }
+#endif // WITH_BENCHMARK
+ }
+
+#if NOP_SYNC
+ for (int rep = 0; rep < 200000; rep++) {
+ asm volatile ("nop"::);
+ }
#endif
- file_size * 2 * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f",
- file_size / timer.time[0]);
- printall(&timer, 0);
- }
- }
#if NUMA
- numa_free(A, file_size * sizeof(T));
+ numa_free(A, file_size * sizeof(T));
#else
- free(A);
+ free(A);
#endif
- return 0;
+ return 0;
}
diff --git a/COUNT/dpu/task.c b/COUNT/dpu/task.c
index b2ed79b..8ba6aaf 100644
--- a/COUNT/dpu/task.c
+++ b/COUNT/dpu/task.c
@@ -21,33 +21,36 @@ uint32_t message[NR_TASKLETS];
uint32_t message_partial_count;
// COUNT in each tasklet
-static unsigned int count(T *input){
- unsigned int cnt = 0;
- #pragma unroll
- for(unsigned int j = 0; j < REGS; j++) {
- if(!pred(input[j])) {
- cnt++;
- }
- }
- return cnt;
+static unsigned int count(T *input)
+{
+ unsigned int cnt = 0;
+#pragma unroll
+ for (unsigned int j = 0; j < REGS; j++) {
+ if (!pred(input[j])) {
+ cnt++;
+ }
+ }
+ return cnt;
}
// Handshake with adjacent tasklets
-static unsigned int handshake_sync(unsigned int l_count, unsigned int tasklet_id){
- unsigned int p_count;
- // Wait and read message
- if(tasklet_id != 0){
- handshake_wait_for(tasklet_id - 1);
- p_count = message[tasklet_id];
- } else {
- p_count = 0;
- }
- // Write message and notify
- if(tasklet_id < NR_TASKLETS - 1){
- message[tasklet_id + 1] = p_count + l_count;
- handshake_notify();
- }
- return p_count;
+static unsigned int handshake_sync(unsigned int l_count,
+ unsigned int tasklet_id)
+{
+ unsigned int p_count;
+ // Wait and read message
+ if (tasklet_id != 0) {
+ handshake_wait_for(tasklet_id - 1);
+ p_count = message[tasklet_id];
+ } else {
+ p_count = 0;
+ }
+ // Write message and notify
+ if (tasklet_id < NR_TASKLETS - 1) {
+ message[tasklet_id + 1] = p_count + l_count;
+ handshake_notify();
+ }
+ return p_count;
}
// Barrier
@@ -55,63 +58,70 @@ BARRIER_INIT(my_barrier, NR_TASKLETS);
extern int main_kernel1(void);
-int (*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
-int main(void) {
- // Kernel
- return kernels[DPU_INPUT_ARGUMENTS.kernel]();
+int main(void)
+{
+ // Kernel
+ return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
}
// main_kernel1
-int main_kernel1() {
- unsigned int tasklet_id = me();
+int main_kernel1()
+{
+ unsigned int tasklet_id = me();
#if PRINT
- printf("tasklet_id = %u\n", tasklet_id);
+ printf("tasklet_id = %u\n", tasklet_id);
#endif
- if (tasklet_id == 0){ // Initialize once the cycle counter
- mem_reset(); // Reset the heap
- }
- // Barrier
- barrier_wait(&my_barrier);
+ if (tasklet_id == 0) { // Done once, by tasklet 0 only
+ mem_reset(); // Reset the heap
+ }
+ // Barrier
+ barrier_wait(&my_barrier);
- dpu_results_t *result = &DPU_RESULTS[tasklet_id];
+ dpu_results_t *result = &DPU_RESULTS[tasklet_id];
- uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
+ uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
- // Address of the current processing block in MRAM
- uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
- uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
+ // Address of the current processing block in MRAM
+ uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
+ uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
- // Initialize a local cache to store the MRAM block
- T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
+ // Initialize a local cache to store the MRAM block
+ T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
- // Initialize shared variable
- if(tasklet_id == NR_TASKLETS - 1)
- message_partial_count = 0;
- // Barrier
- barrier_wait(&my_barrier);
+ // Initialize shared variable
+ if (tasklet_id == NR_TASKLETS - 1)
+ message_partial_count = 0;
+ // Barrier
+ barrier_wait(&my_barrier);
- for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){
+ for (unsigned int byte_index = base_tasklet;
+ byte_index < input_size_dpu_bytes;
+ byte_index += BLOCK_SIZE * NR_TASKLETS) {
- // Load cache with current MRAM block
- mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, BLOCK_SIZE);
+ // Load cache with current MRAM block
+ mram_read((__mram_ptr void const *)(mram_base_addr_A +
+ byte_index), cache_A,
+ BLOCK_SIZE);
- // COUNT in each tasklet
- uint32_t l_count = count(cache_A);
+ // COUNT in each tasklet
+ uint32_t l_count = count(cache_A);
- // Sync with adjacent tasklets
- uint32_t p_count = handshake_sync(l_count, tasklet_id);
+ // Sync with adjacent tasklets
+ uint32_t p_count = handshake_sync(l_count, tasklet_id);
- // Barrier
- barrier_wait(&my_barrier);
+ // Barrier
+ barrier_wait(&my_barrier);
- // Total count in this DPU
- if(tasklet_id == NR_TASKLETS - 1){
- result->t_count = message_partial_count + p_count + l_count;
- message_partial_count = result->t_count;
- }
+ // Total count in this DPU
+ if (tasklet_id == NR_TASKLETS - 1) {
+ result->t_count =
+ message_partial_count + p_count + l_count;
+ message_partial_count = result->t_count;
+ }
- }
+ }
- return 0;
+ return 0;
}
diff --git a/COUNT/host/app.c b/COUNT/host/app.c
index 7708f6d..dad674f 100644
--- a/COUNT/host/app.c
+++ b/COUNT/host/app.c
@@ -33,287 +33,350 @@
#include <dpu_target_macros.h>
// Pointer declaration
-static T* A;
+static T *A;
// Create input arrays
-static void read_input(T* A, unsigned int nr_elements, unsigned int nr_elements_round) {
- //srand(0);
- printf("nr_elements\t%u\t", nr_elements);
- for (unsigned int i = 0; i < nr_elements; i++) {
- //A[i] = (T) (rand());
- A[i] = i + 1;
- }
- for (unsigned int i = nr_elements; i < nr_elements_round; i++) { // Complete with removable elements
- A[i] = 0;
- }
+static void read_input(T *A, unsigned int nr_elements,
+ unsigned int nr_elements_round)
+{
+ //srand(0);
+ printf("nr_elements\t%u\t", nr_elements);
+ for (unsigned int i = 0; i < nr_elements; i++) {
+ //A[i] = (T) (rand());
+ A[i] = i + 1;
+ }
+ for (unsigned int i = nr_elements; i < nr_elements_round; i++) { // Complete with removable elements
+ A[i] = 0;
+ }
}
// Compute output in the host
-static unsigned int count_host(T* A, unsigned int nr_elements) {
- unsigned int count = 0;
- for (unsigned int i = 0; i < nr_elements; i++) {
- if(!pred(A[i])) {
- count++;
- }
- }
- return count;
+static unsigned int count_host(T *A, unsigned int nr_elements)
+{
+ unsigned int count = 0;
+ for (unsigned int i = 0; i < nr_elements; i++) {
+ if (!pred(A[i])) {
+ count++;
+ }
+ }
+ return count;
}
// Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
- struct Params p = input_params(argc, argv);
+ struct Params p = input_params(argc, argv);
- struct dpu_set_t dpu_set, dpu;
- uint32_t nr_of_dpus;
- uint32_t nr_of_ranks;
+ struct dpu_set_t dpu_set, dpu;
+ uint32_t nr_of_dpus;
+ uint32_t nr_of_ranks;
- // Timer declaration
- Timer timer;
+ // Timer declaration
+ Timer timer;
- int numa_node_rank = -2;
+ int numa_node_rank = -2;
- // Allocate DPUs and load binary
+ // Allocate DPUs and load binary
#if !WITH_ALLOC_OVERHEAD
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- timer.time[0] = 0; // alloc
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ timer.time[TMR_ALLOC] = 0; // alloc
#endif
#if !WITH_LOAD_OVERHEAD
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
- timer.time[1] = 0; // load
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
+ timer.time[TMR_LOAD] = 0; // load
#endif
#if !WITH_FREE_OVERHEAD
- timer.time[6] = 0; // free
+ timer.time[TMR_FREE] = 0; // free
#endif
#if ENERGY
- struct dpu_probe_t probe;
- DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+ struct dpu_probe_t probe;
+ DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif
- unsigned int i = 0;
- uint32_t accum = 0;
- uint32_t total_count = 0;
+ unsigned int i = 0;
+ uint32_t accum = 0;
+ uint32_t total_count = 0;
- const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size; // Total input size (weak or strong scaling)
- const unsigned int input_size_dpu_ = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
- const unsigned int input_size_dpu_round =
- (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_; // Input size per DPU (max.), 8-byte aligned
+ const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size; // Total input size (weak or strong scaling)
+ const unsigned int input_size_dpu_ = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
+ const unsigned int input_size_dpu_round = (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_; // Input size per DPU (max.), 8-byte aligned
- // Input allocation
- A = malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
- T *bufferA = A;
+ // Input allocation
+ A = malloc(input_size_dpu_round * NR_DPUS * sizeof(T));
+ T *bufferA = A;
- dpu_results_t* results_retrieve[NR_DPUS];
- for (i = 0; i < NR_DPUS; i++) {
- results_retrieve[i] = (dpu_results_t*)malloc(NR_TASKLETS * sizeof(dpu_results_t));
- }
+ dpu_results_t *results_retrieve[NR_DPUS];
+ for (i = 0; i < NR_DPUS; i++) {
+ results_retrieve[i] =
+ (dpu_results_t *) malloc(NR_TASKLETS *
+ sizeof(dpu_results_t));
+ }
- // Create an input file with arbitrary data
- read_input(A, input_size, input_size_dpu_round * NR_DPUS);
+ // Fill the input array (including the zero padding) with test data
+ read_input(A, input_size, input_size_dpu_round * NR_DPUS);
- printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);
+ printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);
- // Loop over main kernel
- for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+ // Loop over main kernel
+ for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
#if WITH_ALLOC_OVERHEAD
- if(rep >= p.n_warmup) {
- start(&timer, 0, 0);
- }
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- if(rep >= p.n_warmup) {
- stop(&timer, 0);
- }
+ if (rep >= p.n_warmup) {
+ start(&timer, TMR_ALLOC, 0);
+ }
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ if (rep >= p.n_warmup) {
+ stop(&timer, TMR_ALLOC);
+ }
#endif
#if WITH_LOAD_OVERHEAD
- if(rep >= p.n_warmup) {
- start(&timer, 1, 0);
- }
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- if(rep >= p.n_warmup) {
- stop(&timer, 1);
- }
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
+ if (rep >= p.n_warmup) {
+ start(&timer, TMR_LOAD, 0);
+ }
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ if (rep >= p.n_warmup) {
+ stop(&timer, TMR_LOAD);
+ }
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
#endif
- // int prev_rank_id = -1;
- int rank_id = -1;
- DPU_FOREACH (dpu_set, dpu) {
- rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
- if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
- numa_node_rank = -1;
- } else {
- numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
- }
- /*
- if (rank_id != prev_rank_id) {
- printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
- prev_rank_id = rank_id;
- }
- */
- }
-
- // Compute output on CPU (performance comparison and verification purposes)
- if(rep >= p.n_warmup)
- start(&timer, 2, 0);
- total_count = count_host(A, input_size);
- if(rep >= p.n_warmup)
- stop(&timer, 2);
-
- printf("Load input data\n");
- if(rep >= p.n_warmup)
- start(&timer, 3, 0);
- // Input arguments
- const unsigned int input_size_dpu = input_size_dpu_round;
- unsigned int kernel = 0;
- dpu_arguments_t input_arguments = {input_size_dpu * sizeof(T), kernel};
- // Copy input arrays
- i = 0;
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT));
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu * i));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu * sizeof(T), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup)
- stop(&timer, 3);
-
- printf("Run program on DPU(s) \n");
- // Run DPU kernel
- if(rep >= p.n_warmup) {
- start(&timer, 4, 0);
- #if ENERGY
- DPU_ASSERT(dpu_probe_start(&probe));
- #endif
- }
- DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- if(rep >= p.n_warmup) {
- stop(&timer, 4);
- #if ENERGY
- DPU_ASSERT(dpu_probe_stop(&probe));
- #endif
- }
-
+ // int prev_rank_id = -1;
+ int rank_id = -1;
+ DPU_FOREACH(dpu_set, dpu) {
+ rank_id =
+ dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+ DPU_TARGET_MASK;
+ if ((numa_node_rank != -2)
+ && numa_node_rank !=
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)))) {
+ numa_node_rank = -1;
+ } else {
+ numa_node_rank =
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)));
+ }
+ /*
+ if (rank_id != prev_rank_id) {
+ printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+ prev_rank_id = rank_id;
+ }
+ */
+ }
+
+ // Compute output on CPU (performance comparison and verification purposes)
+ if (rep >= p.n_warmup)
+ start(&timer, TMR_CPU, 0);
+ total_count = count_host(A, input_size);
+ if (rep >= p.n_warmup)
+ stop(&timer, TMR_CPU);
+
+ printf("Load input data\n");
+ if (rep >= p.n_warmup)
+ start(&timer, TMR_WRITE, 0);
+ // Input arguments
+ const unsigned int input_size_dpu = input_size_dpu_round;
+ unsigned int kernel = 0;
+ dpu_arguments_t input_arguments =
+ { input_size_dpu * sizeof(T), kernel };
+ // Copy input arrays
+ i = 0;
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+ sizeof(input_arguments), DPU_XFER_DEFAULT));
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, bufferA + input_size_dpu * i));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ input_size_dpu * sizeof(T), DPU_XFER_DEFAULT));
+ if (rep >= p.n_warmup)
+ stop(&timer, TMR_WRITE);
+
+ printf("Run program on DPU(s) \n");
+ // Run DPU kernel
+ if (rep >= p.n_warmup) {
+ start(&timer, TMR_KERNEL, 0);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+ }
+ DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+ if (rep >= p.n_warmup) {
+ stop(&timer, TMR_KERNEL);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_stop(&probe));
+#endif
+ }
#if PRINT
- {
- unsigned int each_dpu = 0;
- printf("Display DPU Logs\n");
- DPU_FOREACH (dpu_set, dpu) {
- printf("DPU#%d:\n", each_dpu);
- DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
- each_dpu++;
- }
- }
+ {
+ unsigned int each_dpu = 0;
+ printf("Display DPU Logs\n");
+ DPU_FOREACH(dpu_set, dpu) {
+ printf("DPU#%d:\n", each_dpu);
+ DPU_ASSERT(dpulog_read_for_dpu
+ (dpu.dpu, stdout));
+ each_dpu++;
+ }
+ }
#endif
- printf("Retrieve results\n");
- dpu_results_t results[NR_DPUS];
- i = 0;
- accum = 0;
-
- if(rep >= p.n_warmup)
- start(&timer, 5, 0);
- // PARALLEL RETRIEVE TRANSFER
-
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i]));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_results_t), DPU_XFER_DEFAULT));
-
- DPU_FOREACH(dpu_set, dpu, i) {
- // Retrieve tasklet timings
- for (unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++) {
- // Count of this DPU
- if(each_tasklet == NR_TASKLETS - 1){
- results[i].t_count = results_retrieve[i][each_tasklet].t_count;
- }
- }
- // Sequential scan
- accum += results[i].t_count;
- }
- if(rep >= p.n_warmup)
- stop(&timer, 5);
-
- i = 0;
+ printf("Retrieve results\n");
+ dpu_results_t results[NR_DPUS];
+ i = 0;
+ accum = 0;
+
+ if (rep >= p.n_warmup)
+ start(&timer, TMR_READ, 0);
+ // PARALLEL RETRIEVE TRANSFER
+
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i]));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0,
+ NR_TASKLETS * sizeof(dpu_results_t),
+ DPU_XFER_DEFAULT));
+
+ DPU_FOREACH(dpu_set, dpu, i) {
+ // Retrieve tasklet timings
+ for (unsigned int each_tasklet = 0;
+ each_tasklet < NR_TASKLETS; each_tasklet++) {
+ // Count of this DPU
+ if (each_tasklet == NR_TASKLETS - 1) {
+ results[i].t_count =
+ results_retrieve[i][each_tasklet].
+ t_count;
+ }
+ }
+ // Sequential scan
+ accum += results[i].t_count;
+ }
+ if (rep >= p.n_warmup)
+ stop(&timer, TMR_READ);
+
+ i = 0;
#if WITH_ALLOC_OVERHEAD
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
- start(&timer, 8, 0);
- }
+ if (rep >= p.n_warmup) {
+ start(&timer, TMR_FREE, 0);
+ }
#endif
- DPU_ASSERT(dpu_free(dpu_set));
+ DPU_ASSERT(dpu_free(dpu_set));
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
- stop(&timer, 8);
- }
+ if (rep >= p.n_warmup) {
+ stop(&timer, TMR_FREE);
+ }
#endif
#endif
- // Check output
- bool status = true;
- if(accum != total_count) status = false;
- if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
- if (rep >= p.n_warmup) {
- printf("[::] COUNT-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d",
- NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size, input_size_dpu_round);
- printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
- WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
- printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
- timer.time[0],
- timer.time[1],
- timer.time[2],
- timer.time[3], // write
- timer.time[4], // kernel
- timer.time[5], // read
- timer.time[8]);
- printf(" latency_total_us=%f",
- timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[8]);
- printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
- input_size * sizeof(T) / timer.time[2],
- input_size * sizeof(T) / timer.time[4],
- input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[8])
- );
- printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
- input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]),
- input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
- input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
- printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
- input_size / timer.time[2],
- input_size / timer.time[4],
- input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[8])
- );
- printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
- input_size / (timer.time[3] + timer.time[4] + timer.time[5]),
- input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
- input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
- }
- } else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
- }
- }
-
- #if ENERGY
- double energy;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
- printf("DPU Energy (J): %f\t", energy);
- #endif
-
- // Deallocation
- free(A);
+ // Check output
+ bool status = true;
+ if (accum != total_count)
+ status = false;
+ if (status) {
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+ "] Outputs are equal\n");
+ if (rep >= p.n_warmup) {
+ printf
+ ("[::] COUNT-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d",
+ NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T),
+ BLOCK_SIZE, input_size,
+ input_size_dpu_round);
+ printf
+ (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+ WITH_FREE_OVERHEAD, numa_node_rank);
+ printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", timer.time[0], timer.time[1], timer.time[2], timer.time[3], // write
+ timer.time[4], // kernel
+ timer.time[5], // read
+ timer.time[8]);
+ printf(" latency_total_us=%f",
+ timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[4] +
+ timer.time[5] + timer.time[8]);
+ printf
+ (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+ input_size * sizeof(T) / timer.time[2],
+ input_size * sizeof(T) / timer.time[4],
+ input_size * sizeof(T) / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5] +
+ timer.time[8])
+ );
+ printf
+ (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+ input_size * sizeof(T) / (timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ input_size * sizeof(T) / (timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ input_size * sizeof(T) / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]));
+ printf
+ (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+ input_size / timer.time[2],
+ input_size / timer.time[4],
+ input_size / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5] +
+ timer.time[8])
+ );
+ printf
+ (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+ input_size / (timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ input_size / (timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ input_size / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]));
+ }
+ } else {
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+ "] Outputs differ!\n");
+ }
+ }
+
+#if ENERGY
+ double energy;
+ DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+ printf("DPU Energy (J): %f\t", energy);
+#endif
+
+ // Deallocation
+ free(A);
#if !WITH_ALLOC_OVERHEAD
- DPU_ASSERT(dpu_free(dpu_set));
+ DPU_ASSERT(dpu_free(dpu_set));
#endif
- return 0;
+ return 0;
}
diff --git a/COUNT/support/common.h b/COUNT/support/common.h
index 72270b0..afd5b2d 100755
--- a/COUNT/support/common.h
+++ b/COUNT/support/common.h
@@ -3,15 +3,15 @@
// Structures used by both the host and the dpu to communicate information
typedef struct {
- uint32_t size;
+ uint32_t size;
enum kernels {
- kernel1 = 0,
- nr_kernels = 1,
+ kernel1 = 0,
+ nr_kernels = 1,
} kernel;
} dpu_arguments_t;
typedef struct {
- uint32_t t_count;
+ uint32_t t_count;
} dpu_results_t;
// Transfer size between MRAM and WRAM
@@ -26,11 +26,12 @@ typedef struct {
// Data type
#define T uint64_t
-#define REGS (BLOCK_SIZE >> 3) // 64 bits
+#define REGS (BLOCK_SIZE >> 3) // 64 bits
// Sample predicate
-bool pred(const T x){
- return (x % 2) == 0;
+bool pred(const T x)
+{
+	// An element satisfies the predicate when its lowest bit is clear (even value).
+	return !(x & 1);
}
#ifndef ENERGY
diff --git a/COUNT/support/params.h b/COUNT/support/params.h
index bb86211..dd1505e 100644
--- a/COUNT/support/params.h
+++ b/COUNT/support/params.h
@@ -4,53 +4,62 @@
#include "common.h"
typedef struct Params {
- unsigned int input_size;
- int n_warmup;
- int n_reps;
- int exp;
-}Params;
+ unsigned int input_size;  // input size in elements, set with -i
+ int n_warmup;  // number of untimed warmup iterations, set with -w
+ int n_reps;  // number of timed repetition iterations, set with -e
+ int exp;  // scaling mode, set with -x: weak (0) or strong (1)
+} Params;  // command-line parameters filled in by input_params()
-static void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n -x <X> Weak (0) or strong (1) scaling (default=0)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -i <I> input size (default=3932160 elements)"
- "\n");
+/*
+ * Print the command-line help text to stderr.
+ */
+static void usage(void)
+{
+	/* The text contains no conversions, so it is written as a plain string. */
+	fputs("\nUsage: ./program [options]"
+	      "\n"
+	      "\nGeneral options:"
+	      "\n -h help"
+	      "\n -w <W> # of untimed warmup iterations (default=1)"
+	      "\n -e <E> # of timed repetition iterations (default=3)"
+	      "\n -x <X> Weak (0) or strong (1) scaling (default=0)"
+	      "\n"
+	      "\nBenchmark-specific options:"
+	      "\n -i <I> input size (default=3932160 elements)"
+	      "\n", stderr);
}
-struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.input_size = 3932160;
- p.n_warmup = 1;
- p.n_reps = 3;
- p.exp = 0;
+/*
+ * Parse the benchmark's command-line options.
+ *
+ * Starts from the defaults (3932160 elements, 1 warmup iteration, 3 timed
+ * repetitions, exp = 0) and overrides them with -i, -w, -e and -x.
+ * -h prints the usage text and exits with status 0; an unrecognized option
+ * prints the usage text and exits with a non-zero status.
+ *
+ * Returns the populated parameter set.
+ */
+struct Params input_params(int argc, char **argv)
+{
+	struct Params p = {
+		.input_size = 3932160,
+		.n_warmup = 1,
+		.n_reps = 3,
+		.exp = 0,
+	};
- int opt;
- while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'i': p.input_size = atoi(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- case 'x': p.exp = atoi(optarg); break;
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(NR_DPUS > 0 && "Invalid # of dpus!");
+	int opt;
+	while ((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
+		switch (opt) {
+		case 'h':
+			usage();
+			exit(0);
+			break;
+		case 'i':
+			p.input_size = atoi(optarg);
+			break;
+		case 'w':
+			p.n_warmup = atoi(optarg);
+			break;
+		case 'e':
+			p.n_reps = atoi(optarg);
+			break;
+		case 'x':
+			p.exp = atoi(optarg);
+			break;
+		default:
+			fprintf(stderr, "\nUnrecognized option!\n");
+			usage();
+			/* Invalid invocation: report failure instead of success. */
+			exit(1);
+		}
+	}
+	assert(NR_DPUS > 0 && "Invalid # of dpus!");
- return p;
+	return p;
}
#endif
diff --git a/COUNT/support/timer.h b/COUNT/support/timer.h
index 3ec6d87..76fbcff 100755
--- a/COUNT/support/timer.h
+++ b/COUNT/support/timer.h
@@ -1,66 +1,80 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[9];
- struct timeval stopTime[9];
- double time[9];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+/* Indices of the timer slots passed to start() and stop(). */
+#define TMR_ALLOC 0
+#define TMR_LOAD 1
+#define TMR_CPU 2
+#define TMR_WRITE 3
+#define TMR_KERNEL 4
+#define TMR_READ 5
+#define TMR_FREE 6
+/* Number of slots; any index used with a Timer must be smaller than this. */
+#define TMR_COUNT 7
+
+/*
+ * Per-slot stopwatch state: the most recent start and stop timestamps and
+ * the accumulated elapsed time in microseconds.
+ */
+typedef struct Timer {
+	struct timeval startTime[TMR_COUNT];
+	struct timeval stopTime[TMR_COUNT];
+	double time[TMR_COUNT];
+} Timer;
+
+/*
+ * Begin timing slot i. When rep is 0 the slot's accumulated time is cleared
+ * first; for any other rep the previous total is kept.
+ */
+void start(Timer *timer, int i, int rep)
+{
+	if (!rep)
+		timer->time[i] = 0.0;
+
+	gettimeofday(&timer->startTime[i], NULL);
+}
+
+/*
+ * Finish timing slot i and add the time elapsed since the slot's recorded
+ * start timestamp, in microseconds, to its accumulated total.
+ */
+void stop(Timer *timer, int i)
+{
+	const struct timeval *begin = &timer->startTime[i];
+	struct timeval *end = &timer->stopTime[i];
+
+	gettimeofday(end, NULL);
+
+	/* Whole seconds converted to microseconds, then the sub-second part. */
+	const double whole_sec_us = (end->tv_sec - begin->tv_sec) * 1000000.0;
+
+	timer->time[i] += whole_sec_us + (end->tv_usec - begin->tv_usec);
+}
+
+/*
+ * Print the accumulated time of slot i divided by REP repetitions, converted
+ * from microseconds to milliseconds.
+ */
+void print(Timer *timer, int i, int REP)
+{
+	const double avg_ms = timer->time[i] / (1000 * REP);
+
+	printf("Time (ms): %f\t", avg_ms);
+}
+
+/*
+ * Print the accumulated time of slots 0 through maxt (inclusive) on a single
+ * line, followed by a newline.
+ */
+void printall(Timer *timer, int maxt)
+{
+	int slot = 0;
+
+	while (slot <= maxt) {
+		printf(" timer%d_us=%f", slot, timer->time[slot]);
+		slot++;
+	}
+	printf("\n");
+}
diff --git a/COUNT/vamos25.sh b/COUNT/vamos25.sh
new file mode 100755
index 0000000..a518c67
--- /dev/null
+++ b/COUNT/vamos25.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname) baselines/cpu/log/$(hostname)
+fn=log/$(hostname)/dimes-hetsim-nmc
+
+# 2^24 elem == 128 MiB
+# 2^28 elem == 2 GiB
+# (upstream version uses 1.875 GiB)
+# upstream DPU and upstream CPU use uint64_t
+
+source /opt/upmem/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+
+# Build the benchmark for one configuration and run it.
+# Arguments are name=value pairs: nr_dpus, nr_tasklets, input_size.
+# Returns the exit status of make if the build fails, otherwise the exit
+# status of bin/host_code.
+run_benchmark_nmc() {
+	# Turn every name=value argument into a function-local variable.
+	local "$@"
+	# A failed build must not be reported as success (an "if" without a
+	# matching branch evaluates to 0), so hand make's status back directly.
+	make -B NR_DPUS="${nr_dpus}" NR_TASKLETS="${nr_tasklets}" BL=10 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1 PARALLEL_READ=1 || return $?
+	bin/host_code -w 0 -e 40 -i "${input_size}" -x 1
+}
+
+export -f run_benchmark_nmc
+
+(
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} \
+ ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \
+ ::: input_size $(( 2 ** 24 )) $(( 2 ** 25 )) $(( 2 ** 26 )) $(( 2 ** 27 )) $(( 2 ** 28 ))
+
+) >> ${fn}.txt
+
+cd baselines/cpu
+
+make -B NUMA=1
+
+(
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ ./count -i {input_size} -a {ram} -c {cpu} -t {nr_threads} -w 0 -e 20 \
+ ::: ram 0 1 \
+ ::: cpu 0 1 \
+ ::: nr_threads 1 2 4 8 12 16 \
+ ::: input_size $(( 2 ** 24 )) $(( 2 ** 25 )) $(( 2 ** 26 )) $(( 2 ** 27 )) $(( 2 ** 28 ))
+) >> ${fn}.txt
diff --git a/GEMV/baselines/cpu/Makefile b/GEMV/baselines/cpu/Makefile
index 016d561..60c662c 100644
--- a/GEMV/baselines/cpu/Makefile
+++ b/GEMV/baselines/cpu/Makefile
@@ -1,17 +1,24 @@
-NUMA ?= 0
-NUMA_MEMCPY ?= 0
-FLAGS =
+native ?= 1
+numa ?= 0
+numa_memcpy ?= 0
+
+CFLAGS =
+LDFLAGS =
TYPE ?= double
-ifeq (${NUMA}, 1)
- FLAGS += -lnuma
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+ LDFLAGS += -lnuma
endif
.PHONY: all
all: gemv
gemv: gemv_openmp.c
- gcc -ggdb -Wall -Wextra -pedantic -march=native -O2 -o gemv -fopenmp -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} -DT=${TYPE} -DTYPE_${TYPE} gemv_openmp.c ${FLAGS}
+ gcc -ggdb -Wall -Wextra -pedantic ${CFLAGS} -O3 -o gemv -fopenmp -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DT=${TYPE} -DTYPE_${TYPE} gemv_openmp.c ${LDFLAGS}
gemv_O0: gemv_openmp.c
gcc -o gemv_O0 -fopenmp gemv_openmp.c
diff --git a/GEMV/baselines/cpu/gemv_openmp.c b/GEMV/baselines/cpu/gemv_openmp.c
index 21e24cb..99bba55 100644
--- a/GEMV/baselines/cpu/gemv_openmp.c
+++ b/GEMV/baselines/cpu/gemv_openmp.c
@@ -10,10 +10,10 @@
#include <numaif.h>
#include <numa.h>
-struct bitmask* bitmask_in;
-struct bitmask* bitmask_out;
+struct bitmask *bitmask_in;
+struct bitmask *bitmask_out;
-void* mp_pages[1];
+void *mp_pages[1];
int mp_status[1];
int mp_nodes[1];
int numa_node_in = -1;
@@ -22,7 +22,7 @@ int numa_node_cpu = -1;
#endif
#if NUMA_MEMCPY
-struct bitmask* bitmask_cpu;
+struct bitmask *bitmask_cpu;
int numa_node_cpu_memcpy = -1;
int numa_node_local = -1;
int numa_node_in_is_local = 0;
@@ -35,284 +35,292 @@ int numa_node_in_is_local = 0;
int main(int argc, char *argv[])
{
- (void) argc;
+ (void)argc;
/* // upstream config:
const size_t rows = 20480;
const size_t cols = 8192;
*/
- // DPU config: 163840 -n 4096
- const size_t rows = 163840;
- const size_t cols = 4096;
+ // DPU config: 163840 -n 4096
+ const size_t rows = 163840;
+ const size_t cols = 4096;
- T **A, *b, *x;
+ T **A, *b, *x;
- T **A_local, *x_local;
+ T **A_local, *x_local;
#if NUMA
- bitmask_in = numa_parse_nodestring(argv[1]);
- bitmask_out = numa_parse_nodestring(argv[2]);
- numa_node_cpu = atoi(argv[3]);
+ bitmask_in = numa_parse_nodestring(argv[1]);
+ bitmask_out = numa_parse_nodestring(argv[2]);
+ numa_node_cpu = atoi(argv[3]);
#if NUMA_MEMCPY
- bitmask_cpu = numa_parse_nodestring(argv[4]);
- numa_node_cpu_memcpy = atoi(argv[5]);
-#endif // NUMA_MEMCPY
+ bitmask_cpu = numa_parse_nodestring(argv[4]);
+ numa_node_cpu_memcpy = atoi(argv[5]);
+#endif // NUMA_MEMCPY
#else
- (void) argv;
-#endif // NUMA
+ (void)argv;
+#endif // NUMA
#if NUMA
- if (bitmask_out) {
- numa_set_membind(bitmask_out);
- numa_free_nodemask(bitmask_out);
- }
- b = (T*) numa_alloc(sizeof(T)*rows);
+ if (bitmask_out) {
+ numa_set_membind(bitmask_out);
+ numa_free_nodemask(bitmask_out);
+ }
+ b = (T *) numa_alloc(sizeof(T) * rows);
#else
- b = (T*) malloc(sizeof(T)*rows);
+ b = (T *) malloc(sizeof(T) * rows);
#endif
#if NUMA
- if (bitmask_in) {
- numa_set_membind(bitmask_in);
- // no free yet, re-used in allocate_dense
- }
- x = (T*) numa_alloc(sizeof(T)*cols);
+ if (bitmask_in) {
+ numa_set_membind(bitmask_in);
+ // no free yet, re-used in allocate_dense
+ }
+ x = (T *) numa_alloc(sizeof(T) * cols);
#else
- x = (T*) malloc(sizeof(T)*cols);
+ x = (T *) malloc(sizeof(T) * cols);
#endif
- allocate_dense(rows, cols, &A);
+ allocate_dense(rows, cols, &A);
#if NUMA
- if (bitmask_in) {
- numa_free_nodemask(bitmask_in);
- }
+ if (bitmask_in) {
+ numa_free_nodemask(bitmask_in);
+ }
#endif
- make_hilbert_mat(rows,cols, &A);
+ make_hilbert_mat(rows, cols, &A);
#if NUMA
#if NUMA_MEMCPY
- if (bitmask_cpu) {
- numa_set_membind(bitmask_cpu);
- numa_free_nodemask(bitmask_cpu);
- }
+ if (bitmask_cpu) {
+ numa_set_membind(bitmask_cpu);
+ numa_free_nodemask(bitmask_cpu);
+ }
#else
- struct bitmask *bitmask_all = numa_allocate_nodemask();
- numa_bitmask_setall(bitmask_all);
- numa_set_membind(bitmask_all);
- numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
-#endif // NUMA
+ struct bitmask *bitmask_all = numa_allocate_nodemask();
+ numa_bitmask_setall(bitmask_all);
+ numa_set_membind(bitmask_all);
+ numa_free_nodemask(bitmask_all);
+#endif // NUMA_MEMCPY
+#endif // NUMA
- A_local = A;
- x_local = x;
+ A_local = A;
+ x_local = x;
#if NUMA
- mp_pages[0] = A;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages(A) error: %d", mp_status[0]);
- }
- else {
- numa_node_in = mp_status[0];
- }
-
- mp_pages[0] = b;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(b)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages(b) error: %d", mp_status[0]);
- }
- else {
- numa_node_out = mp_status[0];
- }
-
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages(A) error: %d", mp_status[0]);
+ } else {
+ numa_node_in = mp_status[0];
+ }
+
+ mp_pages[0] = b;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(b)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages(b) error: %d", mp_status[0]);
+ } else {
+ numa_node_out = mp_status[0];
+ }
+
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
#endif
#if NUMA_MEMCPY
- numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+ || (numa_node_cpu + 8 == numa_node_in)) * 1;
#endif
- Timer timer;
- for (int i = 0; i < 20; i++) {
+ Timer timer;
+ for (int i = 0; i < 20; i++) {
#pragma omp parallel
- {
+ {
#pragma omp for
- for (size_t i = 0; i < cols; i++) {
- x[i] = (T) i+1 ;
- }
+ for (size_t i = 0; i < cols; i++) {
+ x[i] = (T) i + 1;
+ }
#pragma omp for
- for (size_t i = 0; i < rows; i++) {
- b[i] = (T) 0;
- }
- }
+ for (size_t i = 0; i < rows; i++) {
+ b[i] = (T) 0;
+ }
+ }
#if NUMA_MEMCPY
- start(&timer, 1, 0);
- if (!numa_node_in_is_local) {
- x_local = (T*) numa_alloc(sizeof(T)*cols);
- allocate_dense(rows, cols, &A_local);
- }
- stop(&timer, 1);
-
- if (x_local == NULL) {
- return 1;
- }
- if (A_local == NULL) {
- return 1;
- }
-
- if (!numa_node_in_is_local) {
- if (numa_node_cpu_memcpy != -1) {
- if (numa_run_on_node(numa_node_cpu_memcpy) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu_memcpy = -1;
- }
- }
- }
-
- start(&timer, 2, 0);
- if (!numa_node_in_is_local) {
- //for (size_t i=0; i < rows; i++ ) {
- // memcpy(A_local[i], A[i], cols * sizeof(T));
- //}
- memcpy(*A_local, *A, rows * cols * sizeof(T));
- memcpy(x_local, x, cols * sizeof(T));
- } else {
- A_local = A;
- x_local = x;
- }
- stop(&timer, 2);
-
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
-
- mp_pages[0] = A_local;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A_local)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_local = mp_status[0];
- }
+ start(&timer, 1, 0);
+ if (!numa_node_in_is_local) {
+ x_local = (T *) numa_alloc(sizeof(T) * cols);
+ allocate_dense(rows, cols, &A_local);
+ }
+ stop(&timer, 1);
+
+ if (x_local == NULL) {
+ return 1;
+ }
+ if (A_local == NULL) {
+ return 1;
+ }
+
+ if (!numa_node_in_is_local) {
+ if (numa_node_cpu_memcpy != -1) {
+ if (numa_run_on_node(numa_node_cpu_memcpy) ==
+ -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
+
+ start(&timer, 2, 0);
+ if (!numa_node_in_is_local) {
+ //for (size_t i=0; i < rows; i++ ) {
+ // memcpy(A_local[i], A[i], cols * sizeof(T));
+ //}
+ memcpy(*A_local, *A, rows * cols * sizeof(T));
+ memcpy(x_local, x, cols * sizeof(T));
+ } else {
+ A_local = A;
+ x_local = x;
+ }
+ stop(&timer, 2);
+
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+
+ mp_pages[0] = A_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A_local)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_local = mp_status[0];
+ }
#endif
- unsigned int nr_threads = 0;
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
+ nr_threads++;
- start(&timer, 0, 0);
- gemv(A_local, x_local, rows, cols, &b);
- stop(&timer, 0);
+ start(&timer, 0, 0);
+ gemv(A_local, x_local, rows, cols, &b);
+ stop(&timer, 0);
#if NUMA_MEMCPY
- start(&timer, 3, 0);
- if (!numa_node_in_is_local) {
- numa_free(x_local, sizeof(T) * cols);
- numa_free(*A_local, sizeof(T) * rows * cols);
- numa_free(A_local, sizeof(void*) * rows);
- }
- stop(&timer, 3);
+ start(&timer, 3, 0);
+ if (!numa_node_in_is_local) {
+ numa_free(x_local, sizeof(T) * cols);
+ numa_free(*A_local, sizeof(T) * rows * cols);
+ numa_free(A_local, sizeof(void *) * rows);
+ }
+ stop(&timer, 3);
#endif
#if NUMA_MEMCPY
- printf("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
- " | throughput_MBps=%f throughput_MOpps=%f",
- nr_threads, XSTR(T), rows * cols,
- numa_node_in, numa_node_out, numa_node_cpu, numa_node_local, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
- rows * cols * sizeof(T) / timer.time[0],
- rows * cols / timer.time[0]);
- printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
- timer.time[0], timer.time[1], timer.time[2], timer.time[3],
- timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+ printf
+ ("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " | throughput_MBps=%f throughput_MOpps=%f", nr_threads,
+ XSTR(T), rows * cols, numa_node_in, numa_node_out,
+ numa_node_cpu, numa_node_local, numa_node_cpu_memcpy,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
+ rows * cols * sizeof(T) / timer.time[0],
+ rows * cols / timer.time[0]);
+ printf
+ (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[0] + timer.time[1] + timer.time[2] +
+ timer.time[3]);
#else
- printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld"
+ printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld"
#if NUMA
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
#endif
- " | throughput_MBps=%f",
- nr_threads, XSTR(T), rows * cols,
+ " | throughput_MBps=%f",
+ nr_threads, XSTR(T), rows * cols,
#if NUMA
- numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ numa_node_in, numa_node_out, numa_node_cpu,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
#endif
- rows * cols * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f latency_us=%f\n",
- rows * cols / timer.time[0], timer.time[0]);
+ rows * cols * sizeof(T) / timer.time[0]);
+ printf(" throughput_MOpps=%f latency_us=%f\n",
+ rows * cols / timer.time[0], timer.time[0]);
#endif
- }
-
+ }
#if 0
- print_vec(x, rows);
- print_mat(A, rows, cols);
- print_vec(b, rows);
+ print_vec(x, rows);
+ print_mat(A, rows, cols);
+ print_vec(b, rows);
#endif
#if TYPE_double || TYPE_float
- printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x,cols), sum_vec(b,rows));
+ printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x, cols),
+ sum_vec(b, rows));
#else
- printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x,cols), sum_vec(b,rows));
+ printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x, cols),
+ sum_vec(b, rows));
#endif
#if NUMA
- numa_free(b, sizeof(T)*rows);
- numa_free(x, sizeof(T)*cols);
- numa_free(*A, sizeof(T)*rows*cols);
- numa_free(A, sizeof(void*)*rows);
+ numa_free(b, sizeof(T) * rows);
+ numa_free(x, sizeof(T) * cols);
+ numa_free(*A, sizeof(T) * rows * cols);
+ numa_free(A, sizeof(void *) * rows);
#else
- free(b);
- free(x);
- free(*A);
- free(A);
+ free(b);
+ free(x);
+ free(*A);
+ free(A);
#endif
- return 0;
+ return 0;
}
-void gemv(T** A, T* x, size_t rows, size_t cols, T** b) {
+/*
+ * Dense matrix-vector product: for every row of the rows x cols matrix A,
+ * adds the dot product of that row with x onto the matching entry of *b.
+ * The outer loop over rows is an OpenMP parallel for.
+ */
+void gemv(T **A, T *x, size_t rows, size_t cols, T **b)
+{
#pragma omp parallel for
- for (size_t i = 0; i < rows; i ++ )
- for (size_t j = 0; j < cols; j ++ ) {
- (*b)[i] = (*b)[i] + A[i][j]*x[j];
- }
+	for (size_t row = 0; row < rows; row++) {
+		for (size_t col = 0; col < cols; col++)
+			(*b)[row] += A[row][col] * x[col];
+	}
}
-void make_hilbert_mat(size_t rows, size_t cols, T*** A) {
+/*
+ * Fill the rows x cols matrix *A. For floating-point builds element (i, j)
+ * becomes 1 / (i + j + 1); otherwise it becomes (i + j) % 10.
+ * The outer loop over rows is an OpenMP parallel for.
+ */
+void make_hilbert_mat(size_t rows, size_t cols, T ***A)
+{
#pragma omp parallel for
- for (size_t i = 0; i < rows; i++) {
- for (size_t j = 0; j < cols; j++) {
+	for (size_t row = 0; row < rows; row++) {
+		T *line = (*A)[row];
+		for (size_t col = 0; col < cols; col++) {
#if TYPE_double || TYPE_float
- (*A)[i][j] = 1.0/( (T) i + (T) j + 1.0);
+			line[col] = 1.0 / ((T) row + (T) col + 1.0);
#else
- (*A)[i][j] = (T)(((i+j)%10));
+			line[col] = (T) ((row + col) % 10);
#endif
- }
- }
+		}
+	}
}
-T sum_vec(T* vec, size_t rows) {
- T sum = 0;
+/*
+ * Return the sum of the first `rows` elements of vec, computed with an
+ * OpenMP sum reduction.
+ */
+T sum_vec(T *vec, size_t rows)
+{
+	T sum = 0;
#pragma omp parallel for reduction(+:sum)
- for (int i = 0; i < rows; i++) sum = sum + vec[i];
- return sum;
+	/* size_t index: same type as the bound, so no signed/unsigned compare. */
+	for (size_t i = 0; i < rows; i++)
+		sum += vec[i];
+	return sum;
}
diff --git a/GEMV/baselines/cpu/run-perf.sh b/GEMV/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..5eae822
--- /dev/null
+++ b/GEMV/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+
+make -B numa=1
+
+OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./gemv 4 4 4
+OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./gemv 4 4 4
diff --git a/GEMV/dpu/task.c b/GEMV/dpu/task.c
index 0226437..3bf52e8 100644
--- a/GEMV/dpu/task.c
+++ b/GEMV/dpu/task.c
@@ -17,7 +17,8 @@
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// GEMV
-static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
+static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos)
+{
for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) {
bufferC[pos] += bufferA[i] * bufferB[i];
}
@@ -28,13 +29,14 @@ static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
BARRIER_INIT(my_barrier, NR_TASKLETS);
// main
-int main() {
+int main()
+{
unsigned int tasklet_id = me();
#if PRINT
// printf("tasklet_id = %u\n", tasklet_id);
#endif
- if (tasklet_id == 0){ // Initialize once the cycle counter
- mem_reset(); // Reset the heap
+ if (tasklet_id == 0) { // Initialize once the cycle counter
+ mem_reset(); // Reset the heap
}
// Barrier
barrier_wait(&my_barrier);
@@ -44,15 +46,15 @@ int main() {
uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows;
uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows;
- unsigned int element_per_cacheC = 8/sizeof(T);
+ unsigned int element_per_cacheC = 8 / sizeof(T);
unsigned int nrows = nr_rows;
- unsigned int rows_per_tasklet;
+ unsigned int rows_per_tasklet;
unsigned int start_row;
unsigned int chunks = nrows / (NR_TASKLETS * element_per_cacheC);
- unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks;
+ unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks;
rows_per_tasklet = dbl_chunks;
- unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS);
+ unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS);
if ((tasklet_id * element_per_cacheC) < rest_rows)
rows_per_tasklet += element_per_cacheC;
@@ -60,22 +62,32 @@ int main() {
if ((tasklet_id * element_per_cacheC) >= rest_rows) {
// unsigned int hlf_rest_rows = rest_rows >> 1;
if ((rest_rows % element_per_cacheC) != 0)
- start_row = roundup(rest_rows, element_per_cacheC) + tasklet_id * dbl_chunks;
- // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
+ start_row =
+ roundup(rest_rows,
+ element_per_cacheC) +
+ tasklet_id * dbl_chunks;
+ // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
else
- start_row = rest_rows + tasklet_id * dbl_chunks;
- // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
- } else
- start_row = tasklet_id * (dbl_chunks + element_per_cacheC);
- // start_row = tasklet_id * (dbl_chunks + 2);
+ start_row = rest_rows + tasklet_id * dbl_chunks;
+ // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
+ } else
+ start_row =
+ tasklet_id * (dbl_chunks + element_per_cacheC);
+ // start_row = tasklet_id * (dbl_chunks + 2);
} else {
start_row = tasklet_id * (dbl_chunks);
}
// Address of the current row in MRAM
- uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
- uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T));
- uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T));
+ uint32_t mram_base_addr_A =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
+ uint32_t mram_base_addr_B =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER +
+ max_rows * n_size_pad * sizeof(T));
+ uint32_t mram_base_addr_C =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER +
+ max_rows * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T) + start_row * sizeof(T));
uint32_t mram_temp_addr_A = mram_base_addr_A;
uint32_t mram_temp_addr_B = mram_base_addr_B;
@@ -87,55 +99,65 @@ int main() {
int offset = 0;
- #if PRINT
- printf("id: %d, rows_per_tasklet = %d\n",tasklet_id, rows_per_tasklet);
- printf("id: %d, start_row = %d\n",tasklet_id, start_row);
- #endif
+#if PRINT
+ printf("id: %d, rows_per_tasklet = %d\n", tasklet_id, rows_per_tasklet);
+ printf("id: %d, start_row = %d\n", tasklet_id, start_row);
+#endif
// Iterate over nr_rows
// for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) {
- for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += element_per_cacheC) {
+ for (unsigned int i = start_row; i < start_row + rows_per_tasklet;
+ i += element_per_cacheC) {
- mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
+ mram_temp_addr_A =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
// cache_C[0] = 0;
// cache_C[1] = 0;
// clear the cache
- for(unsigned int c = 0; c < element_per_cacheC; c++){
- cache_C[c] = 0;
+ for (unsigned int c = 0; c < element_per_cacheC; c++) {
+ cache_C[c] = 0;
}
// for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){
// for(unsigned int pos = 0; (pos < element_per_cacheC) && ((i + pos) < (start_row + rows_per_tasklet)); pos++){
// for(unsigned int pos = 0; pos < element_per_cacheC && i + pos < nr_rows; pos++){
- for(unsigned int pos = 0; pos < element_per_cacheC; pos++){
- if(i + pos >= nr_rows){
+ for (unsigned int pos = 0; pos < element_per_cacheC; pos++) {
+ if (i + pos >= nr_rows) {
// printf("id: %d, nrows: %d, error\n", tasklet_id, nrows);
break;
- }
+ }
int n = 0, j;
- for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T)))
- {
-
- mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
- mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
- if(offset)
- {
-
- for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++)
- {
+ for (n = 0;
+ n < (int32_t) (n_size - (BLOCK_SIZE / sizeof(T)));
+ n += (BLOCK_SIZE / sizeof(T))) {
+
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A), cache_A,
+ BLOCK_SIZE);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_B), cache_B,
+ BLOCK_SIZE);
+
+ if (offset) {
+
+ for (unsigned int off = 0;
+ off < (BLOCK_SIZE / sizeof(T)) - 1;
+ off++) {
cache_A[off] = cache_A[off + 1];
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A +
+ BLOCK_SIZE), cache_A_aux,
+ 8);
- cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+ cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+ cache_A_aux[0];
}
-
// Compute GEMV
gemv(cache_C, cache_A, cache_B, pos);
@@ -144,53 +166,55 @@ int main() {
mram_temp_addr_B += BLOCK_SIZE;
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
-
+ mram_read((__mram_ptr void const *)(mram_temp_addr_A),
+ cache_A, BLOCK_SIZE);
- if(offset)
- {
- for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++)
- {
+ if (offset) {
+ for (unsigned int off = 0;
+ off < (BLOCK_SIZE / sizeof(T)) - 1;
+ off++) {
cache_A[off] = cache_A[off + 1];
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A + BLOCK_SIZE),
+ cache_A_aux, 8);
- cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+ cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+ cache_A_aux[0];
}
+ mram_read((__mram_ptr void const *)(mram_temp_addr_B),
+ cache_B, BLOCK_SIZE);
- mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
- for (j = 0; j < (int) (n_size - n); j++) {
+ for (j = 0; j < (int)(n_size - n); j++) {
// Compute GEMV
- if(j >= (int)(BLOCK_SIZE / sizeof(T))){
+ if (j >= (int)(BLOCK_SIZE / sizeof(T))) {
printf("error\n");
break;
}
cache_C[pos] += cache_A[j] * cache_B[j];
}
-
- mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T));
+ mram_temp_addr_A +=
+ (BLOCK_SIZE -
+ ((BLOCK_SIZE / sizeof(T)) -
+ (n_size - n)) * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
- if(mram_temp_addr_A % 8 != 0)
- {
+ if (mram_temp_addr_A % 8 != 0) {
offset = 1;
- }
- else
- {
+ } else {
offset = 0;
}
}
// Write cache to current MRAM block
- mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8);
+ mram_write(cache_C, (__mram_ptr void *)(mram_base_addr_C), 8);
// Update memory address
// mram_base_addr_C += 2 * sizeof(T);
- mram_base_addr_C += 8;
+ mram_base_addr_C += 8;
}
diff --git a/GEMV/host/app.c b/GEMV/host/app.c
index ebd0336..6553774 100644
--- a/GEMV/host/app.c
+++ b/GEMV/host/app.c
@@ -33,69 +33,69 @@
#define DPU_BINARY "./bin/gemv_dpu"
#endif
-static T* A;
-static T* B;
-static T* C;
-static T* C_dpu;
+static T *A;
+static T *B;
+static T *C;
+static T *C_dpu;
// Create input arrays
-static void init_data(T* A, T* B, unsigned int m_size, unsigned int n_size) {
+static void init_data(T *A, T *B, unsigned int m_size, unsigned int n_size)
+{
srand(0);
- for (unsigned int i = 0; i < m_size * n_size; i++)
- {
- A[i] = (unsigned int) (rand()%50);
+ for (unsigned int i = 0; i < m_size * n_size; i++) {
+ A[i] = (unsigned int)(rand() % 50);
}
- for (unsigned int i = 0; i < n_size; i++)
- {
- B[i] = (unsigned int) (rand()%50);
+ for (unsigned int i = 0; i < n_size; i++) {
+ B[i] = (unsigned int)(rand() % 50);
}
}
// Compute output in the host
-static void gemv_host(T* C, T* A, T* B, unsigned int m_size, unsigned int n_size) {
- for (unsigned int i = 0; i < m_size; i++)
- {
+static void gemv_host(T *C, T *A, T *B, unsigned int m_size,
+ unsigned int n_size)
+{
+ for (unsigned int i = 0; i < m_size; i++) {
C[i] = 0;
}
for (unsigned int m = 0; m < m_size; m++) {
- for (unsigned int n = 0; n < n_size; n++)
- {
+ for (unsigned int n = 0; n < n_size; n++) {
C[m] += A[m * n_size + n] * B[n];
}
}
}
// Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
struct Params p = input_params(argc, argv);
struct dpu_set_t dpu_set, dpu;
uint32_t nr_of_dpus;
- uint32_t nr_of_ranks;
+ uint32_t nr_of_ranks;
// Timer
Timer timer;
- int numa_node_rank = -2;
+ int numa_node_rank = -2;
- // Allocate DPUs and load binary
+ // Allocate DPUs and load binary
#if !WITH_ALLOC_OVERHEAD
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- timer.time[0] = 0; // alloc
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ timer.time[0] = 0; // alloc
#endif
#if !WITH_LOAD_OVERHEAD
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
- timer.time[1] = 0; // load
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
+ timer.time[1] = 0; // load
#endif
#if !WITH_FREE_OVERHEAD
- timer.time[8] = 0; // free
+ timer.time[8] = 0; // free
#endif
#if ENERGY
@@ -108,12 +108,13 @@ int main(int argc, char **argv) {
unsigned int n_size = p.n_size;
// Initialize help data
- dpu_info = (struct dpu_info_t *) malloc(NR_DPUS * sizeof(struct dpu_info_t));
- dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(NR_DPUS * sizeof(dpu_arguments_t));
+ dpu_info =
+ (struct dpu_info_t *)malloc(NR_DPUS * sizeof(struct dpu_info_t));
+ dpu_arguments_t *input_args =
+ (dpu_arguments_t *) malloc(NR_DPUS * sizeof(dpu_arguments_t));
uint32_t max_rows_per_dpu = 0;
uint32_t n_size_pad = n_size;
- if(n_size % 2 == 1)
- {
+ if (n_size % 2 == 1) {
n_size_pad++;
}
@@ -127,7 +128,10 @@ int main(int argc, char **argv) {
rows_per_dpu++;
if (rest_rows > 0) {
if (i >= rest_rows)
- prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks;
+ prev_rows_dpu =
+ rest_rows * (chunks + 1) + (i -
+ rest_rows) *
+ chunks;
else
prev_rows_dpu = i * (chunks + 1);
} else {
@@ -136,7 +140,7 @@ int main(int argc, char **argv) {
// Keep max rows for parallel transfers
uint32_t rows_per_dpu_pad = rows_per_dpu;
- if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
+ if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
rows_per_dpu_pad++;
if (rows_per_dpu_pad > max_rows_per_dpu)
max_rows_per_dpu = rows_per_dpu_pad;
@@ -163,20 +167,20 @@ int main(int argc, char **argv) {
for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
#if WITH_ALLOC_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 0, 0);
}
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 0);
}
#endif
#if WITH_LOAD_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 1, 0);
}
DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 1);
}
DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
@@ -186,26 +190,33 @@ int main(int argc, char **argv) {
// int prev_rank_id = -1;
int rank_id = -1;
- DPU_FOREACH (dpu_set, dpu) {
- rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
- if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
+ DPU_FOREACH(dpu_set, dpu) {
+ rank_id =
+ dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+ DPU_TARGET_MASK;
+ if ((numa_node_rank != -2)
+ && numa_node_rank !=
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)))) {
numa_node_rank = -1;
} else {
- numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
+ numa_node_rank =
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)));
}
/*
- if (rank_id != prev_rank_id) {
- printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
- prev_rank_id = rank_id;
- }
- */
+ if (rank_id != prev_rank_id) {
+ printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+ prev_rank_id = rank_id;
+ }
+ */
}
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 2, 0);
}
gemv_host(C, A, B, m_size, n_size);
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 2);
}
if (rep >= p.n_warmup) {
@@ -220,23 +231,30 @@ int main(int argc, char **argv) {
DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+ sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 3);
}
if (rep >= p.n_warmup) {
start(&timer, 6, 0);
}
-
// Copy input array and vector
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, A + dpu_info[i].prev_rows_dpu * n_size));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu,
+ A + dpu_info[i].prev_rows_dpu * n_size));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 6);
}
if (rep >= p.n_warmup) {
@@ -246,12 +264,15 @@ int main(int argc, char **argv) {
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, B));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
if (rep >= p.n_warmup) {
stop(&timer, 7);
}
-
// Run kernel on DPUs
if (rep >= p.n_warmup) {
start(&timer, 4, 0);
@@ -280,89 +301,140 @@ int main(int argc, char **argv) {
start(&timer, 5, 0);
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, C_dpu + i * max_rows_per_dpu));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup) {
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T),
+ max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
+ if (rep >= p.n_warmup) {
stop(&timer, 5);
}
-
#if WITH_ALLOC_OVERHEAD
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 8, 0);
}
#endif
DPU_ASSERT(dpu_free(dpu_set));
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 8);
}
#endif
#endif
-
// Check output
bool status = true;
- unsigned int n,j;
+ unsigned int n, j;
i = 0;
for (n = 0; n < NR_DPUS; n++) {
for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
- if(C[i] != C_dpu[n * max_rows_per_dpu + j]) {
+ if (C[i] != C_dpu[n * max_rows_per_dpu + j]) {
status = false;
#if PRINT
- // printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
+ // printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
#endif
}
i++;
}
}
if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+ "] Outputs are equal\n");
if (rep >= p.n_warmup) {
- printf("[::] GEMV-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d",
- NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, n_size * m_size);
- printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
- WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
- printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
- timer.time[0],
- timer.time[1],
- timer.time[2],
- timer.time[3] + timer.time[6] + timer.time[7],
- timer.time[4],
- timer.time[5],
- timer.time[8]);
- printf(" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f",
- timer.time[3],
- timer.time[6],
- timer.time[7]
- );
- printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
- n_size * m_size * sizeof(T) / timer.time[2],
- n_size * m_size * sizeof(T) / (timer.time[4]),
- n_size * m_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5] + timer.time[8]));
- printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
- n_size * m_size * sizeof(T) / (timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
- n_size * m_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
- n_size * m_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]));
- printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
- n_size * m_size / timer.time[2],
- n_size * m_size / (timer.time[4]),
- n_size * m_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5] + timer.time[8]));
- printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
- n_size * m_size / (timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
- n_size * m_size / (timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]),
- n_size * m_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]));
+ printf
+ ("[::] GEMV-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d",
+ NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T),
+ BLOCK_SIZE, n_size * m_size);
+ printf
+ (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+ WITH_FREE_OVERHEAD, numa_node_rank);
+ printf
+ ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+ timer.time[0], timer.time[1],
+ timer.time[2],
+ timer.time[3] + timer.time[6] +
+ timer.time[7], timer.time[4],
+ timer.time[5], timer.time[8]);
+ printf
+ (" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f",
+ timer.time[3], timer.time[6], timer.time[7]
+ );
+ printf
+ (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+ n_size * m_size * sizeof(T) /
+ timer.time[2],
+ n_size * m_size * sizeof(T) /
+ (timer.time[4]),
+ n_size * m_size * sizeof(T) /
+ (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[6] +
+ timer.time[7] + timer.time[4] +
+ timer.time[5] + timer.time[8]));
+ printf
+ (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+ n_size * m_size * sizeof(T) /
+ (timer.time[3] + timer.time[6] +
+ timer.time[7] + timer.time[4] +
+ timer.time[5]),
+ n_size * m_size * sizeof(T) /
+ (timer.time[1] + timer.time[3] +
+ timer.time[6] + timer.time[7] +
+ timer.time[4] + timer.time[5]),
+ n_size * m_size * sizeof(T) /
+ (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[6] +
+ timer.time[7] + timer.time[4] +
+ timer.time[5]));
+ printf
+ (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+ n_size * m_size / timer.time[2],
+ n_size * m_size / (timer.time[4]),
+ n_size * m_size / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[6] +
+ timer.time[7] +
+ timer.time[4] +
+ timer.time[5] +
+ timer.time[8]));
+ printf
+ (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+ n_size * m_size / (timer.time[3] +
+ timer.time[6] +
+ timer.time[7] +
+ timer.time[4] +
+ timer.time[5]),
+ n_size * m_size / (timer.time[1] +
+ timer.time[3] +
+ timer.time[6] +
+ timer.time[7] +
+ timer.time[4] +
+ timer.time[5]),
+ n_size * m_size / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[6] +
+ timer.time[7] +
+ timer.time[4] +
+ timer.time[5]));
}
} else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+ "] Outputs differ!\n");
}
}
#if ENERGY
double acc_energy, avg_energy, acc_time, avg_time;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+ DPU_ASSERT(dpu_probe_get
+ (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
@@ -370,15 +442,15 @@ int main(int argc, char **argv) {
// Print timing results
/*
- printf("CPU Version Time (ms): ");
- print(&timer, 0, 1);
- printf("CPU-DPU Time (ms): ");
- print(&timer, 1, p.n_reps);
- printf("DPU Kernel Time (ms): ");
- print(&timer, 2, p.n_reps);
- printf("DPU-CPU Time (ms): ");
- print(&timer, 3, p.n_reps);
- */
+ printf("CPU Version Time (ms): ");
+ print(&timer, 0, 1);
+ printf("CPU-DPU Time (ms): ");
+ print(&timer, 1, p.n_reps);
+ printf("DPU Kernel Time (ms): ");
+ print(&timer, 2, p.n_reps);
+ printf("DPU-CPU Time (ms): ");
+ print(&timer, 3, p.n_reps);
+ */
#if ENERGY
printf("Energy (J): %f J\t", avg_energy);
diff --git a/GEMV/support/common.h b/GEMV/support/common.h
index 0deebcb..47a9628 100755
--- a/GEMV/support/common.h
+++ b/GEMV/support/common.h
@@ -3,17 +3,17 @@
// Structures used by both the host and the dpu to communicate information
typedef struct {
- uint32_t n_size;
- uint32_t n_size_pad;
- uint32_t nr_rows;
- uint32_t max_rows;
+ uint32_t n_size;
+ uint32_t n_size_pad;
+ uint32_t nr_rows;
+ uint32_t max_rows;
} dpu_arguments_t;
// Specific information for each DPU
struct dpu_info_t {
- uint32_t rows_per_dpu;
- uint32_t rows_per_dpu_pad;
- uint32_t prev_rows_dpu;
+ uint32_t rows_per_dpu;
+ uint32_t rows_per_dpu_pad;
+ uint32_t prev_rows_dpu;
};
struct dpu_info_t *dpu_info;
diff --git a/GEMV/support/params.h b/GEMV/support/params.h
index 526c71c..c72b0c1 100644
--- a/GEMV/support/params.h
+++ b/GEMV/support/params.h
@@ -4,53 +4,62 @@
#include "common.h"
typedef struct Params {
- unsigned int m_size;
- unsigned int n_size;
- unsigned int n_warmup;
- unsigned int n_reps;
-}Params;
+ unsigned int m_size;
+ unsigned int n_size;
+ unsigned int n_warmup;
+ unsigned int n_reps;
+} Params;
-static void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -m <I> m_size (default=8192 elements)"
- "\n -n <I> n_size (default=8192 elements)"
- "\n");
+static void usage()
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -w <W> # of untimed warmup iterations (default=1)"
+ "\n -e <E> # of timed repetition iterations (default=3)"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -m <I> m_size (default=8192 elements)"
+ "\n -n <I> n_size (default=8192 elements)" "\n");
}
-struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.m_size = 8192;
- p.n_size = 8192;
- p.n_warmup = 1;
- p.n_reps = 3;
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.m_size = 8192;
+ p.n_size = 8192;
+ p.n_warmup = 1;
+ p.n_reps = 3;
- int opt;
- while((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'm': p.m_size = atoi(optarg); break;
- case 'n': p.n_size = atoi(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(NR_DPUS > 0 && "Invalid # of dpus!");
+ int opt;
+ while ((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'm':
+ p.m_size = atoi(optarg);
+ break;
+ case 'n':
+ p.n_size = atoi(optarg);
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+ assert(NR_DPUS > 0 && "Invalid # of dpus!");
- return p;
+ return p;
}
#endif
diff --git a/GEMV/support/timer.h b/GEMV/support/timer.h
index 99d79f4..b2b9148 100755
--- a/GEMV/support/timer.h
+++ b/GEMV/support/timer.h
@@ -1,69 +1,74 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[9];
- struct timeval stopTime[9];
- double time[9];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
- //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
-
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+typedef struct Timer {
+ struct timeval startTime[9];
+ struct timeval stopTime[9];
+ double time[9];
+} Timer;
+
+void start(Timer *timer, int i, int rep)
+{
+ if (rep == 0) {
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i)
+{
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] +=
+ (timer->stopTime[i].tv_sec -
+ timer->startTime[i].tv_sec) * 1000000.0 +
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+
+ //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
+ // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
+}
+
+void print(Timer *timer, int i, int REP)
+{
+ printf("%f\t", timer->time[i] / (1000 * REP));
+}
+
+void printall(Timer *timer, int maxt)
+{
+ for (int i = 0; i <= maxt; i++) {
+ printf(" timer%d_us=%f", i, timer->time[i]);
+ }
+ printf("\n");
+}
diff --git a/HST-S/baselines/cpu/app_baseline.c b/HST-S/baselines/cpu/app_baseline.c
index 745e384..bb4e28a 100644
--- a/HST-S/baselines/cpu/app_baseline.c
+++ b/HST-S/baselines/cpu/app_baseline.c
@@ -24,10 +24,10 @@
#include <numaif.h>
#include <numa.h>
-struct bitmask* bitmask_in;
-struct bitmask* bitmask_out;
+struct bitmask *bitmask_in;
+struct bitmask *bitmask_out;
-void* mp_pages[1];
+void *mp_pages[1];
int mp_status[1];
int mp_nodes[1];
int numa_node_in = -1;
@@ -41,7 +41,6 @@ int numa_node_local = -1;
int numa_node_in_is_local = 0;
#endif
-
#include "../../support/common.h"
#include "../../support/timer.h"
@@ -49,364 +48,399 @@ int numa_node_in_is_local = 0;
#define STR(x) #x
// Pointer declaration
-static T* A;
+static T *A;
static T *A_local;
-static unsigned int* histo_host;
+static unsigned int *histo_host;
typedef struct Params {
- unsigned int input_size;
- unsigned int bins;
- int n_warmup;
- int n_reps;
- const char *file_name;
- int exp;
- int n_threads;
+ unsigned int input_size;
+ unsigned int bins;
+ int n_warmup;
+ int n_reps;
+ const char *file_name;
+ int exp;
+ int n_threads;
#if NUMA
- struct bitmask* bitmask_in;
- struct bitmask* bitmask_out;
- int numa_node_cpu;
+ struct bitmask *bitmask_in;
+ struct bitmask *bitmask_out;
+ int numa_node_cpu;
#endif
#if NUMA_MEMCPY
- int numa_node_cpu_memcpy;
- struct bitmask* bitmask_cpu;
+ int numa_node_cpu_memcpy;
+ struct bitmask *bitmask_cpu;
#endif
-}Params;
+} Params;
/**
* @brief creates input arrays
* @param nr_elements how many elements in input arrays
*/
-static void read_input(T* A, const Params p) {
-
- char dctFileName[100];
- FILE *File = NULL;
-
- // Open input file
- unsigned short temp;
- sprintf(dctFileName, "%s", p.file_name);
- if((File = fopen(dctFileName, "rb")) != NULL) {
- for(unsigned int y = 0; y < p.input_size; y++) {
- if (fread(&temp, sizeof(unsigned short), 1, File) == 1) {
- A[y] = (unsigned int)ByteSwap16(temp);
- if(A[y] >= 4096)
- A[y] = 4095;
- } else {
- //printf("out of bounds read at offset %d -- seeking back to 0\n", y);
- rewind(File);
- }
- }
- fclose(File);
- } else {
- printf("%s does not exist\n", dctFileName);
- exit(1);
- }
+static void read_input(T *A, const Params p)
+{
+
+ char dctFileName[100];
+ FILE *File = NULL;
+
+ // Open input file
+ unsigned short temp;
+ sprintf(dctFileName, "%s", p.file_name);
+ if ((File = fopen(dctFileName, "rb")) != NULL) {
+ for (unsigned int y = 0; y < p.input_size; y++) {
+ if (fread(&temp, sizeof(unsigned short), 1, File) == 1) {
+ A[y] = (unsigned int)ByteSwap16(temp);
+ if (A[y] >= 4096)
+ A[y] = 4095;
+ } else {
+ //printf("out of bounds read at offset %d -- seeking back to 0\n", y);
+ rewind(File);
+ }
+ }
+ fclose(File);
+ } else {
+ printf("%s does not exist\n", dctFileName);
+ exit(1);
+ }
}
/**
* @brief compute output in the host
*/
-static void histogram_host(unsigned int* histo, T* A, unsigned int bins, unsigned int nr_elements, int exp, unsigned int nr_of_dpus, int t) {
-
- omp_set_num_threads(t);
-
- if(!exp){
- #pragma omp parallel for
- for (unsigned int i = 0; i < nr_of_dpus; i++) {
- for (unsigned int j = 0; j < nr_elements; j++) {
- T d = A[j];
- histo[i * bins + ((d * bins) >> DEPTH)] += 1;
- }
- }
- }
- else{
- #pragma omp parallel for
- for (unsigned int j = 0; j < nr_elements; j++) {
- T d = A[j];
- #pragma omp atomic update
- histo[(d * bins) >> DEPTH] += 1;
- }
- }
+static void histogram_host(unsigned int *histo, T *A, unsigned int bins,
+ unsigned int nr_elements, int exp,
+ unsigned int nr_of_dpus, int t)
+{
+
+ omp_set_num_threads(t);
+
+ if (!exp) {
+#pragma omp parallel for
+ for (unsigned int i = 0; i < nr_of_dpus; i++) {
+ for (unsigned int j = 0; j < nr_elements; j++) {
+ T d = A[j];
+ histo[i * bins + ((d * bins) >> DEPTH)] += 1;
+ }
+ }
+ } else {
+#pragma omp parallel for
+ for (unsigned int j = 0; j < nr_elements; j++) {
+ T d = A[j];
+#pragma omp atomic update
+ histo[(d * bins) >> DEPTH] += 1;
+ }
+ }
}
// Params ---------------------------------------------------------------------
-void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n -t <T> # of threads (default=8)"
- "\n -x <X> Weak (0) or strong (1) scaling (default=0)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -i <I> input size (default=1536*1024 elements)"
- "\n -b <B> histogram size (default=256 bins)"
- "\n -f <F> input image file (default=../input/image_VanHateren.iml)"
- "\n");
+void usage()
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -w <W> # of untimed warmup iterations (default=1)"
+ "\n -e <E> # of timed repetition iterations (default=3)"
+ "\n -t <T> # of threads (default=8)"
+ "\n -x <X> Weak (0) or strong (1) scaling (default=0)"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -i <I> input size (default=1536*1024 elements)"
+ "\n -b <B> histogram size (default=256 bins)"
+ "\n -f <F> input image file (default=../input/image_VanHateren.iml)"
+ "\n");
}
-struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.input_size = 1536 * 1024;
- p.bins = 256;
- p.n_warmup = 1;
- p.n_reps = 3;
- p.n_threads = 8;
- p.exp = 1;
- p.file_name = "../../input/image_VanHateren.iml";
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.input_size = 1536 * 1024;
+ p.bins = 256;
+ p.n_warmup = 1;
+ p.n_reps = 3;
+ p.n_threads = 8;
+ p.exp = 1;
+ p.file_name = "../../input/image_VanHateren.iml";
#if NUMA
- p.bitmask_in = NULL;
- p.bitmask_out = NULL;
- p.numa_node_cpu = -1;
+ p.bitmask_in = NULL;
+ p.bitmask_out = NULL;
+ p.numa_node_cpu = -1;
#endif
#if NUMA_MEMCPY
- p.numa_node_cpu_memcpy = -1;
- p.bitmask_cpu = NULL;
+ p.numa_node_cpu_memcpy = -1;
+ p.bitmask_cpu = NULL;
#endif
- int opt;
- while((opt = getopt(argc, argv, "hi:b:w:e:f:x:t:A:B:C:D:M:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'i': p.input_size = atoi(optarg); break;
- case 'b': p.bins = atoi(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- case 'f': p.file_name = optarg; break;
- case 'x': p.exp = atoi(optarg); break;
- case 't': p.n_threads = atoi(optarg); break;
+ int opt;
+ while ((opt = getopt(argc, argv, "hi:b:w:e:f:x:t:A:B:C:D:M:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'i':
+ p.input_size = atoi(optarg);
+ break;
+ case 'b':
+ p.bins = atoi(optarg);
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ case 'f':
+ p.file_name = optarg;
+ break;
+ case 'x':
+ p.exp = atoi(optarg);
+ break;
+ case 't':
+ p.n_threads = atoi(optarg);
+ break;
#if NUMA
- case 'A': p.bitmask_in = numa_parse_nodestring(optarg); break;
- case 'B': p.bitmask_out = numa_parse_nodestring(optarg); break;
- case 'C': p.numa_node_cpu = atoi(optarg); break;
+ case 'A':
+ p.bitmask_in = numa_parse_nodestring(optarg);
+ break;
+ case 'B':
+ p.bitmask_out = numa_parse_nodestring(optarg);
+ break;
+ case 'C':
+ p.numa_node_cpu = atoi(optarg);
+ break;
#if NUMA_MEMCPY
- case 'D': p.bitmask_cpu = numa_parse_nodestring(optarg); break;
- case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break;
-#endif // NUMA_MEMCPY
-#endif // NUMA
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(p.n_threads > 0 && "Invalid # of ranks!");
-
- return p;
+ case 'D':
+ p.bitmask_cpu = numa_parse_nodestring(optarg);
+ break;
+ case 'M':
+ p.numa_node_cpu_memcpy = atoi(optarg);
+ break;
+#endif // NUMA_MEMCPY
+#endif // NUMA
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+ assert(p.n_threads > 0 && "Invalid # of ranks!");
+
+ return p;
}
/**
* @brief Main of the Host Application.
*/
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
+
+ struct Params p = input_params(argc, argv);
- struct Params p = input_params(argc, argv);
+ uint32_t nr_of_dpus;
- uint32_t nr_of_dpus;
-
- const unsigned int input_size = p.input_size; // Size of input image
- if(!p.exp)
- assert(input_size % p.n_threads == 0 && "Input size!");
- else
- assert(input_size % p.n_threads == 0 && "Input size!");
+ const unsigned int input_size = p.input_size; // Size of input image
+ if (!p.exp)
+ assert(input_size % p.n_threads == 0 && "Input size!");
+ else
+ assert(input_size % p.n_threads == 0 && "Input size!");
- // Input/output allocation
+ // Input/output allocation
#if NUMA
- if (p.bitmask_in) {
- numa_set_membind(p.bitmask_in);
- numa_free_nodemask(p.bitmask_in);
- }
- A = numa_alloc(input_size * sizeof(T));
+ if (p.bitmask_in) {
+ numa_set_membind(p.bitmask_in);
+ numa_free_nodemask(p.bitmask_in);
+ }
+ A = numa_alloc(input_size * sizeof(T));
#else
- A = malloc(input_size * sizeof(T));
+ A = malloc(input_size * sizeof(T));
#endif
- // Create an input file with arbitrary data.
- read_input(A, p);
+ // Create an input file with arbitrary data.
+ read_input(A, p);
#if NUMA
- if (p.bitmask_out) {
- numa_set_membind(p.bitmask_out);
- numa_free_nodemask(p.bitmask_out);
- }
+ if (p.bitmask_out) {
+ numa_set_membind(p.bitmask_out);
+ numa_free_nodemask(p.bitmask_out);
+ }
#endif
- if(!p.exp) {
- // upstream code left nr_of_dpus uninitialized
- nr_of_dpus = p.n_threads;
+ if (!p.exp) {
+ // upstream code left nr_of_dpus uninitialized
+ nr_of_dpus = p.n_threads;
#if NUMA
- histo_host = numa_alloc(nr_of_dpus * p.bins * sizeof(unsigned int));
+ histo_host =
+ numa_alloc(nr_of_dpus * p.bins * sizeof(unsigned int));
#else
- histo_host = malloc(nr_of_dpus * p.bins * sizeof(unsigned int));
+ histo_host = malloc(nr_of_dpus * p.bins * sizeof(unsigned int));
#endif
- } else {
+ } else {
#if NUMA
- histo_host = numa_alloc(p.bins * sizeof(unsigned int));
+ histo_host = numa_alloc(p.bins * sizeof(unsigned int));
#else
- histo_host = malloc(p.bins * sizeof(unsigned int));
+ histo_host = malloc(p.bins * sizeof(unsigned int));
#endif
- }
+ }
#if NUMA
#if NUMA_MEMCPY
- if (p.bitmask_cpu) {
- numa_set_membind(p.bitmask_cpu);
- numa_free_nodemask(p.bitmask_cpu);
- }
+ if (p.bitmask_cpu) {
+ numa_set_membind(p.bitmask_cpu);
+ numa_free_nodemask(p.bitmask_cpu);
+ }
#else
- struct bitmask *bitmask_all = numa_allocate_nodemask();
- numa_bitmask_setall(bitmask_all);
- numa_set_membind(bitmask_all);
- numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
-#endif // NUMA
+ struct bitmask *bitmask_all = numa_allocate_nodemask();
+ numa_bitmask_setall(bitmask_all);
+ numa_set_membind(bitmask_all);
+ numa_free_nodemask(bitmask_all);
+#endif // NUMA_MEMCPY
+#endif // NUMA
#if NUMA
- mp_pages[0] = A;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_in = mp_status[0];
- }
-
- mp_pages[0] = histo_host;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(C)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_out = mp_status[0];
- }
-
- numa_node_cpu = p.numa_node_cpu;
- if (numa_node_cpu != -1) {
- if (numa_run_on_node(numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_in = mp_status[0];
+ }
+
+ mp_pages[0] = histo_host;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(C)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_out = mp_status[0];
+ }
+
+ numa_node_cpu = p.numa_node_cpu;
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
#endif
#if NUMA_MEMCPY
- numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+ || (numa_node_cpu + 8 == numa_node_in)) * 1;
#endif
- Timer timer;
+ Timer timer;
#if NUMA_MEMCPY
- numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
- start(&timer, 1, 0);
- if (!numa_node_in_is_local) {
- A_local = (T*) numa_alloc(input_size * sizeof(T));
- }
- stop(&timer, 1);
- if (!numa_node_in_is_local) {
- if (p.numa_node_cpu_memcpy != -1) {
- if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu_memcpy = -1;
- }
- }
- }
- start(&timer, 2, 0);
- if (!numa_node_in_is_local) {
- memcpy(A_local, A, input_size * sizeof(T));
- } else {
- A_local = A;
- }
- stop(&timer, 2);
- if (p.numa_node_cpu != -1) {
- if (numa_run_on_node(p.numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
- mp_pages[0] = A_local;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A_local)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_local = mp_status[0];
- }
+ numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
+ start(&timer, 1, 0);
+ if (!numa_node_in_is_local) {
+ A_local = (T *) numa_alloc(input_size * sizeof(T));
+ }
+ stop(&timer, 1);
+ if (!numa_node_in_is_local) {
+ if (p.numa_node_cpu_memcpy != -1) {
+ if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
+ start(&timer, 2, 0);
+ if (!numa_node_in_is_local) {
+ memcpy(A_local, A, input_size * sizeof(T));
+ } else {
+ A_local = A;
+ }
+ stop(&timer, 2);
+ if (p.numa_node_cpu != -1) {
+ if (numa_run_on_node(p.numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+ mp_pages[0] = A_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A_local)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_local = mp_status[0];
+ }
#else
- A_local = A;
+ A_local = A;
#endif
- start(&timer, 0, 0);
+ start(&timer, 0, 0);
- if(!p.exp)
- memset(histo_host, 0, nr_of_dpus * p.bins * sizeof(unsigned int));
- else
- memset(histo_host, 0, p.bins * sizeof(unsigned int));
+ if (!p.exp)
+ memset(histo_host, 0,
+ nr_of_dpus * p.bins * sizeof(unsigned int));
+ else
+ memset(histo_host, 0, p.bins * sizeof(unsigned int));
- histogram_host(histo_host, A_local, p.bins, input_size, p.exp, nr_of_dpus, p.n_threads);
+ histogram_host(histo_host, A_local, p.bins, input_size, p.exp,
+ nr_of_dpus, p.n_threads);
- stop(&timer, 0);
+ stop(&timer, 0);
#if NUMA_MEMCPY
- start(&timer, 3, 0);
- if (!numa_node_in_is_local) {
- numa_free(A_local, input_size * sizeof(T));
- }
- stop(&timer, 3);
+ start(&timer, 3, 0);
+ if (!numa_node_in_is_local) {
+ numa_free(A_local, input_size * sizeof(T));
+ }
+ stop(&timer, 3);
#endif
- unsigned int nr_threads = 0;
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
+ nr_threads++;
#if NUMA_MEMCPY
- printf("[::] HST-S-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
- " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
- " | throughput_MBps=%f",
- nr_threads, XSTR(T), input_size, p.exp ? p.bins : p.bins * nr_of_dpus,
- numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
- input_size * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f",
- input_size / timer.time[0]);
- printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
- timer.time[0], timer.time[1], timer.time[2], timer.time[3],
- timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+ printf
+ ("[::] HST-S-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
+ " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " | throughput_MBps=%f", nr_threads, XSTR(T), input_size,
+ p.exp ? p.bins : p.bins * nr_of_dpus, numa_node_in,
+ numa_node_local, numa_node_out, numa_node_cpu,
+ numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
+ input_size * sizeof(T) / timer.time[0]);
+ printf(" throughput_MOpps=%f", input_size / timer.time[0]);
+ printf
+ (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
#else
- printf("[::] HST-S-CPU | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
+ printf("[::] HST-S-CPU | n_threads=%d e_type=%s n_elements=%d n_bins=%d"
#if NUMA
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
#endif
- " | throughput_MBps=%f",
- nr_threads, XSTR(T), input_size, p.exp ? p.bins : p.bins * nr_of_dpus,
+ " | throughput_MBps=%f",
+ nr_threads, XSTR(T), input_size,
+ p.exp ? p.bins : p.bins * nr_of_dpus,
#if NUMA
- numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ numa_node_in, numa_node_out, numa_node_cpu,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
#endif
- input_size * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f latency_us=%f\n",
- input_size / timer.time[0], timer.time[0]);
-#endif // NUMA_MEMCPY
+ input_size * sizeof(T) / timer.time[0]);
+ printf(" throughput_MOpps=%f latency_us=%f\n",
+ input_size / timer.time[0], timer.time[0]);
+#endif // NUMA_MEMCPY
#if NUMA
- numa_free(A, input_size * sizeof(T));
- if (!p.exp) {
- numa_free(histo_host, nr_of_dpus * p.bins * sizeof(unsigned int));
- } else {
- numa_free(histo_host, p.bins * sizeof(unsigned int));
- }
+ numa_free(A, input_size * sizeof(T));
+ if (!p.exp) {
+ numa_free(histo_host,
+ nr_of_dpus * p.bins * sizeof(unsigned int));
+ } else {
+ numa_free(histo_host, p.bins * sizeof(unsigned int));
+ }
#else
- free(A);
- free(histo_host);
+ free(A);
+ free(histo_host);
#endif
- return 0;
+ return 0;
}
diff --git a/HST-S/dpu/task.c b/HST-S/dpu/task.c
index 135f0d1..0333072 100644
--- a/HST-S/dpu/task.c
+++ b/HST-S/dpu/task.c
@@ -15,102 +15,121 @@
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// Array for communication between adjacent tasklets
-uint32_t* message[NR_TASKLETS];
+uint32_t *message[NR_TASKLETS];
// DPU histogram
-uint32_t* histo_dpu;
+uint32_t *histo_dpu;
// Barrier
BARRIER_INIT(my_barrier, NR_TASKLETS);
// Histogram in each tasklet
-static void histogram(uint32_t* histo, uint32_t bins, T *input, unsigned int l_size){
- for(unsigned int j = 0; j < l_size; j++) {
- T d = input[j];
- histo[(d * bins) >> DEPTH] += 1;
- }
+static void histogram(uint32_t *histo, uint32_t bins, T *input,
+ unsigned int l_size)
+{
+ for (unsigned int j = 0; j < l_size; j++) {
+ T d = input[j];
+ histo[(d * bins) >> DEPTH] += 1;
+ }
}
extern int main_kernel1(void);
-int (*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
-int main(void) {
- // Kernel
- return kernels[DPU_INPUT_ARGUMENTS.kernel]();
+int main(void)
+{
+ // Kernel
+ return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
}
// main_kernel1
-int main_kernel1() {
- unsigned int tasklet_id = me();
+int main_kernel1()
+{
+ unsigned int tasklet_id = me();
#if PRINT
- printf("tasklet_id = %u\n", tasklet_id);
+ printf("tasklet_id = %u\n", tasklet_id);
#endif
- if (tasklet_id == 0){ // Initialize once the cycle counter
- mem_reset(); // Reset the heap
- }
- // Barrier
- barrier_wait(&my_barrier);
-
- uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
- uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
- uint32_t bins = DPU_INPUT_ARGUMENTS.bins;
-
- // Address of the current processing block in MRAM
- uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
- uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
- uint32_t mram_base_addr_histo = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
-
- // Initialize a local cache to store the MRAM block
- T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
-
- // Local histogram
- uint32_t *histo = (uint32_t *) mem_alloc(bins * sizeof(uint32_t));
-
- // Initialize local histogram
- for(unsigned int i = 0; i < bins; i++){
- histo[i] = 0;
- }
-
- // Compute histogram
- for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){
-
- // Bound checking
- uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;
-
- // Load cache with current MRAM block
- mram_read((const __mram_ptr void*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);
-
- // Histogram in each tasklet
- histogram(histo, bins, cache_A, l_size_bytes >> DIV);
-
- }
- message[tasklet_id] = histo;
-
- // Barrier
- barrier_wait(&my_barrier);
-
- uint32_t *histo_dpu = message[0];
-
- for (unsigned int i = tasklet_id; i < bins; i += NR_TASKLETS){
- uint32_t b = 0;
- for (unsigned int j = 0; j < NR_TASKLETS; j++){
- b += *(message[j] + i);
- }
- histo_dpu[i] = b;
- }
-
- // Barrier
- barrier_wait(&my_barrier);
-
- // Write dpu histogram to current MRAM block
- if(tasklet_id == 0){
- if(bins * sizeof(uint32_t) <= 2048)
- mram_write(histo_dpu, (__mram_ptr void*)(mram_base_addr_histo), bins * sizeof(uint32_t));
- else
- for(unsigned int offset = 0; offset < ((bins * sizeof(uint32_t)) >> 11); offset++){
- mram_write(histo_dpu + (offset << 9), (__mram_ptr void*)(mram_base_addr_histo + (offset << 11)), 2048);
- }
- }
-
- return 0;
+ if (tasklet_id == 0) { // Initialize once the cycle counter
+ mem_reset(); // Reset the heap
+ }
+ // Barrier
+ barrier_wait(&my_barrier);
+
+ uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
+ uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
+ uint32_t bins = DPU_INPUT_ARGUMENTS.bins;
+
+ // Address of the current processing block in MRAM
+ uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
+ uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
+ uint32_t mram_base_addr_histo =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
+
+ // Initialize a local cache to store the MRAM block
+ T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
+
+ // Local histogram
+ uint32_t *histo = (uint32_t *) mem_alloc(bins * sizeof(uint32_t));
+
+ // Initialize local histogram
+ for (unsigned int i = 0; i < bins; i++) {
+ histo[i] = 0;
+ }
+
+ // Compute histogram
+ for (unsigned int byte_index = base_tasklet;
+ byte_index < input_size_dpu_bytes;
+ byte_index += BLOCK_SIZE * NR_TASKLETS) {
+
+ // Bound checking
+ uint32_t l_size_bytes =
+ (byte_index + BLOCK_SIZE >=
+ input_size_dpu_bytes) ? (input_size_dpu_bytes -
+ byte_index) : BLOCK_SIZE;
+
+ // Load cache with current MRAM block
+ mram_read((const __mram_ptr void *)(mram_base_addr_A +
+ byte_index), cache_A,
+ l_size_bytes);
+
+ // Histogram in each tasklet
+ histogram(histo, bins, cache_A, l_size_bytes >> DIV);
+
+ }
+ message[tasklet_id] = histo;
+
+ // Barrier
+ barrier_wait(&my_barrier);
+
+ uint32_t *histo_dpu = message[0];
+
+ for (unsigned int i = tasklet_id; i < bins; i += NR_TASKLETS) {
+ uint32_t b = 0;
+ for (unsigned int j = 0; j < NR_TASKLETS; j++) {
+ b += *(message[j] + i);
+ }
+ histo_dpu[i] = b;
+ }
+
+ // Barrier
+ barrier_wait(&my_barrier);
+
+ // Write dpu histogram to current MRAM block
+ if (tasklet_id == 0) {
+ if (bins * sizeof(uint32_t) <= 2048)
+ mram_write(histo_dpu,
+ (__mram_ptr void *)(mram_base_addr_histo),
+ bins * sizeof(uint32_t));
+ else
+ for (unsigned int offset = 0;
+ offset < ((bins * sizeof(uint32_t)) >> 11);
+ offset++) {
+ mram_write(histo_dpu + (offset << 9),
+ (__mram_ptr void
+ *)(mram_base_addr_histo +
+ (offset << 11)), 2048);
+ }
+ }
+
+ return 0;
}
diff --git a/HST-S/host/app.c b/HST-S/host/app.c
index 2c4e6a5..7f66f6e 100644
--- a/HST-S/host/app.c
+++ b/HST-S/host/app.c
@@ -40,362 +40,415 @@
#endif
// Pointer declaration
-static T* A;
-static unsigned int* histo_host;
-static unsigned int* histo;
+static T *A;
+static unsigned int *histo_host;
+static unsigned int *histo;
// Create input arrays
-static void read_input(T* A, const Params p) {
-
- char dctFileName[100];
- FILE *File = NULL;
-
- // Open input file
- unsigned short temp;
- sprintf(dctFileName, "%s", p.file_name);
- if((File = fopen(dctFileName, "rb")) != NULL) {
- for(unsigned int y = 0; y < p.input_size; y++) {
- if (fread(&temp, sizeof(unsigned short), 1, File) == 1) {
- A[y] = (unsigned int)ByteSwap16(temp);
- if(A[y] >= 4096)
- A[y] = 4095;
- } else {
- //printf("out of bounds read at offset %d -- seeking back to 0\n", y);
- rewind(File);
- }
- }
- fclose(File);
- } else {
- printf("%s does not exist\n", dctFileName);
- exit(1);
- }
+static void read_input(T *A, const Params p)
+{
+
+ char dctFileName[100];
+ FILE *File = NULL;
+
+ // Open input file
+ unsigned short temp;
+ sprintf(dctFileName, "%s", p.file_name);
+ if ((File = fopen(dctFileName, "rb")) != NULL) {
+ for (unsigned int y = 0; y < p.input_size; y++) {
+ if (fread(&temp, sizeof(unsigned short), 1, File) == 1) {
+ A[y] = (unsigned int)ByteSwap16(temp);
+ if (A[y] >= 4096)
+ A[y] = 4095;
+ } else {
+ //printf("out of bounds read at offset %d -- seeking back to 0\n", y);
+ rewind(File);
+ }
+ }
+ fclose(File);
+ } else {
+ printf("%s does not exist\n", dctFileName);
+ exit(1);
+ }
}
// Compute output in the host
-static void histogram_host(unsigned int* histo, T* A, unsigned int bins, unsigned int nr_elements, int exp, unsigned int nr_of_dpus) {
- if(!exp){
- for (unsigned int i = 0; i < nr_of_dpus; i++) {
- for (unsigned int j = 0; j < nr_elements; j++) {
- T d = A[j];
- histo[i * bins + ((d * bins) >> DEPTH)] += 1;
- }
- }
- }
- else{
- for (unsigned int j = 0; j < nr_elements; j++) {
- T d = A[j];
- histo[(d * bins) >> DEPTH] += 1;
- }
- }
+static void histogram_host(unsigned int *histo, T *A, unsigned int bins,
+ unsigned int nr_elements, int exp,
+ unsigned int nr_of_dpus)
+{
+ if (!exp) {
+ for (unsigned int i = 0; i < nr_of_dpus; i++) {
+ for (unsigned int j = 0; j < nr_elements; j++) {
+ T d = A[j];
+ histo[i * bins + ((d * bins) >> DEPTH)] += 1;
+ }
+ }
+ } else {
+ for (unsigned int j = 0; j < nr_elements; j++) {
+ T d = A[j];
+ histo[(d * bins) >> DEPTH] += 1;
+ }
+ }
}
// Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
- struct Params p = input_params(argc, argv);
+ struct Params p = input_params(argc, argv);
- struct dpu_set_t dpu_set, dpu;
- uint32_t nr_of_dpus;
- uint32_t nr_of_ranks;
+ struct dpu_set_t dpu_set, dpu;
+ uint32_t nr_of_dpus;
+ uint32_t nr_of_ranks;
#if ENERGY
- struct dpu_probe_t probe;
- DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+ struct dpu_probe_t probe;
+ DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif
- // Timer declaration
- Timer timer;
+ // Timer declaration
+ Timer timer;
- int numa_node_rank = -2;
+ int numa_node_rank = -2;
- // Allocate DPUs and load binary
+ // Allocate DPUs and load binary
#if !WITH_ALLOC_OVERHEAD
- DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set));
- timer.time[0] = 0; // alloc
+ DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set));
+ timer.time[0] = 0; // alloc
#endif
#if !WITH_LOAD_OVERHEAD
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
- timer.time[1] = 0; // load
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
+ timer.time[1] = 0; // load
#endif
#if !WITH_FREE_OVERHEAD
- timer.time[6] = 0; // free
+ timer.time[6] = 0; // free
#endif
- unsigned int i = 0;
- unsigned int input_size; // Size of input image
- unsigned int dpu_s = p.dpu_s;
- if(p.exp == 0)
- input_size = p.input_size * NR_DPUS; // Size of input image
- else if(p.exp == 1)
- input_size = p.input_size; // Size of input image
- else
- input_size = p.input_size * dpu_s; // Size of input image
-
- const unsigned int input_size_8bytes =
- ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
- const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
- const unsigned int input_size_dpu_8bytes =
- ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
-
- // Input/output allocation
- A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
- T *bufferA = A;
- histo_host = malloc(p.bins * sizeof(unsigned int));
- histo = malloc(NR_DPUS * p.bins * sizeof(unsigned int));
-
- // Create an input file with arbitrary data
- read_input(A, p);
- if(p.exp == 0){
- for(unsigned int j = 1; j < NR_DPUS; j++){
- memcpy(&A[j * input_size_dpu_8bytes], &A[0], input_size_dpu_8bytes * sizeof(T));
- }
- }
- else if(p.exp == 2){
- for(unsigned int j = 1; j < dpu_s; j++)
- memcpy(&A[j * p.input_size], &A[0], p.input_size * sizeof(T));
- }
-
- // Loop over main kernel
- for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
- memset(histo_host, 0, p.bins * sizeof(unsigned int));
- memset(histo, 0, NR_DPUS * p.bins * sizeof(unsigned int));
+ unsigned int i = 0;
+ unsigned int input_size; // Size of input image
+ unsigned int dpu_s = p.dpu_s;
+ if (p.exp == 0)
+ input_size = p.input_size * NR_DPUS; // Size of input image
+ else if (p.exp == 1)
+ input_size = p.input_size; // Size of input image
+ else
+ input_size = p.input_size * dpu_s; // Size of input image
+
+ const unsigned int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
+ const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
+ const unsigned int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
+
+ // Input/output allocation
+ A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+ T *bufferA = A;
+ histo_host = malloc(p.bins * sizeof(unsigned int));
+ histo = malloc(NR_DPUS * p.bins * sizeof(unsigned int));
+
+ // Create an input file with arbitrary data
+ read_input(A, p);
+ if (p.exp == 0) {
+ for (unsigned int j = 1; j < NR_DPUS; j++) {
+ memcpy(&A[j * input_size_dpu_8bytes], &A[0],
+ input_size_dpu_8bytes * sizeof(T));
+ }
+ } else if (p.exp == 2) {
+ for (unsigned int j = 1; j < dpu_s; j++)
+ memcpy(&A[j * p.input_size], &A[0],
+ p.input_size * sizeof(T));
+ }
+ // Loop over main kernel
+ for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+ memset(histo_host, 0, p.bins * sizeof(unsigned int));
+ memset(histo, 0, NR_DPUS * p.bins * sizeof(unsigned int));
#if WITH_ALLOC_OVERHEAD
- if(rep >= p.n_warmup) {
- start(&timer, 0, 0);
- }
- DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set));
- if(rep >= p.n_warmup) {
- stop(&timer, 0);
- }
+ if (rep >= p.n_warmup) {
+ start(&timer, 0, 0);
+ }
+ DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set));
+ if (rep >= p.n_warmup) {
+ stop(&timer, 0);
+ }
#endif
#if WITH_DPUINFO
- printf("DPUs:");
- DPU_FOREACH (dpu_set, dpu) {
- int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
- int slice = dpu_get_slice_id(dpu_from_set(dpu));
- int member = dpu_get_member_id(dpu_from_set(dpu));
- printf(" %d(%d.%d)", rank, slice, member);
- }
- printf("\n");
+ printf("DPUs:");
+ DPU_FOREACH(dpu_set, dpu) {
+ int rank =
+ dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+ DPU_TARGET_MASK;
+ int slice = dpu_get_slice_id(dpu_from_set(dpu));
+ int member = dpu_get_member_id(dpu_from_set(dpu));
+ printf(" %d(%d.%d)", rank, slice, member);
+ }
+ printf("\n");
#endif
#if WITH_LOAD_OVERHEAD
- if(rep >= p.n_warmup) {
- start(&timer, 1, 0);
- }
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- if(rep >= p.n_warmup) {
- stop(&timer, 1);
- }
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
+ if (rep >= p.n_warmup) {
+ start(&timer, 1, 0);
+ }
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ if (rep >= p.n_warmup) {
+ stop(&timer, 1);
+ }
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
#endif
- // int prev_rank_id = -1;
- int rank_id = -1;
- DPU_FOREACH (dpu_set, dpu) {
- rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
- if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
- numa_node_rank = -1;
- } else {
- numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
- }
- /*
- if (rank_id != prev_rank_id) {
- printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
- prev_rank_id = rank_id;
- }
- */
- }
-
- // Compute output on CPU (performance comparison and verification purposes)
- if(rep >= p.n_warmup) {
- start(&timer, 2, 0);
- }
- histogram_host(histo_host, A, p.bins, p.input_size, 1, NR_DPUS);
- if(rep >= p.n_warmup) {
- stop(&timer, 2);
- }
-
- if(rep >= p.n_warmup) {
- start(&timer, 3, 0);
- }
- // Input arguments
- unsigned int kernel = 0;
- i = 0;
- dpu_arguments_t input_arguments[NR_DPUS];
- for(i=0; i<NR_DPUS-1; i++) {
- input_arguments[i].size=input_size_dpu_8bytes * sizeof(T);
- input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T);
- input_arguments[i].bins=p.bins;
- input_arguments[i].kernel=kernel;
- }
- input_arguments[NR_DPUS-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T);
- input_arguments[NR_DPUS-1].transfer_size=input_size_dpu_8bytes * sizeof(T);
- input_arguments[NR_DPUS-1].bins=p.bins;
- input_arguments[NR_DPUS-1].kernel=kernel;
-
- // Copy input arrays
- i = 0;
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup) {
- stop(&timer, 3);
- }
-
- // Run DPU kernel
- if(rep >= p.n_warmup) {
- start(&timer, 4, 0);
- #if ENERGY
- DPU_ASSERT(dpu_probe_start(&probe));
- #endif
- }
-
- DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- if(rep >= p.n_warmup) {
- stop(&timer, 4);
- #if ENERGY
- DPU_ASSERT(dpu_probe_stop(&probe));
- #endif
- }
+ // int prev_rank_id = -1;
+ int rank_id = -1;
+ DPU_FOREACH(dpu_set, dpu) {
+ rank_id =
+ dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+ DPU_TARGET_MASK;
+ if ((numa_node_rank != -2)
+ && numa_node_rank !=
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)))) {
+ numa_node_rank = -1;
+ } else {
+ numa_node_rank =
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)));
+ }
+ /*
+ if (rank_id != prev_rank_id) {
+ printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+ prev_rank_id = rank_id;
+ }
+ */
+ }
+
+ // Compute output on CPU (performance comparison and verification purposes)
+ if (rep >= p.n_warmup) {
+ start(&timer, 2, 0);
+ }
+ histogram_host(histo_host, A, p.bins, p.input_size, 1, NR_DPUS);
+ if (rep >= p.n_warmup) {
+ stop(&timer, 2);
+ }
+
+ if (rep >= p.n_warmup) {
+ start(&timer, 3, 0);
+ }
+ // Input arguments
+ unsigned int kernel = 0;
+ i = 0;
+ dpu_arguments_t input_arguments[NR_DPUS];
+ for (i = 0; i < NR_DPUS - 1; i++) {
+ input_arguments[i].size =
+ input_size_dpu_8bytes * sizeof(T);
+ input_arguments[i].transfer_size =
+ input_size_dpu_8bytes * sizeof(T);
+ input_arguments[i].bins = p.bins;
+ input_arguments[i].kernel = kernel;
+ }
+ input_arguments[NR_DPUS - 1].size =
+ (input_size_8bytes -
+ input_size_dpu_8bytes * (NR_DPUS - 1)) * sizeof(T);
+ input_arguments[NR_DPUS - 1].transfer_size =
+ input_size_dpu_8bytes * sizeof(T);
+ input_arguments[NR_DPUS - 1].bins = p.bins;
+ input_arguments[NR_DPUS - 1].kernel = kernel;
+
+ // Copy input arrays
+ i = 0;
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+ sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, bufferA + input_size_dpu_8bytes * i));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ input_size_dpu_8bytes * sizeof(T),
+ DPU_XFER_DEFAULT));
+ if (rep >= p.n_warmup) {
+ stop(&timer, 3);
+ }
+ // Run DPU kernel
+ if (rep >= p.n_warmup) {
+ start(&timer, 4, 0);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+ }
+ DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+ if (rep >= p.n_warmup) {
+ stop(&timer, 4);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_stop(&probe));
+#endif
+ }
#if PRINT
- {
- unsigned int each_dpu = 0;
- printf("Display DPU Logs\n");
- DPU_FOREACH (dpu_set, dpu) {
- printf("DPU#%d:\n", each_dpu);
- DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
- each_dpu++;
- }
- }
+ {
+ unsigned int each_dpu = 0;
+ printf("Display DPU Logs\n");
+ DPU_FOREACH(dpu_set, dpu) {
+ printf("DPU#%d:\n", each_dpu);
+ DPU_ASSERT(dpulog_read_for_dpu
+ (dpu.dpu, stdout));
+ each_dpu++;
+ }
+ }
#endif
- i = 0;
- if(rep >= p.n_warmup) {
- start(&timer, 5, 0);
- }
- // PARALLEL RETRIEVE TRANSFER
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, histo + p.bins * i));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), p.bins * sizeof(unsigned int), DPU_XFER_DEFAULT));
-
- // Final histogram merging
- for(i = 1; i < NR_DPUS; i++){
- for(unsigned int j = 0; j < p.bins; j++){
- histo[j] += histo[j + i * p.bins];
- }
- }
- if(rep >= p.n_warmup) {
- stop(&timer, 5);
- }
-
+ i = 0;
+ if (rep >= p.n_warmup) {
+ start(&timer, 5, 0);
+ }
+ // PARALLEL RETRIEVE TRANSFER
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer(dpu, histo + p.bins * i));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ input_size_dpu_8bytes * sizeof(T),
+ p.bins * sizeof(unsigned int), DPU_XFER_DEFAULT));
+
+ // Final histogram merging
+ for (i = 1; i < NR_DPUS; i++) {
+ for (unsigned int j = 0; j < p.bins; j++) {
+ histo[j] += histo[j + i * p.bins];
+ }
+ }
+ if (rep >= p.n_warmup) {
+ stop(&timer, 5);
+ }
#if WITH_ALLOC_OVERHEAD
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
- start(&timer, 6, 0);
- }
+ if (rep >= p.n_warmup) {
+ start(&timer, 6, 0);
+ }
#endif
- DPU_ASSERT(dpu_free(dpu_set));
+ DPU_ASSERT(dpu_free(dpu_set));
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
- stop(&timer, 6);
- }
+ if (rep >= p.n_warmup) {
+ stop(&timer, 6);
+ }
#endif
#endif
- if (rep >= p.n_warmup) {
- printf("[::] HST-S-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d n_bins=%d",
- nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T), input_size, p.bins);
- printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
- WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
- printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
- timer.time[0],
- timer.time[1],
- timer.time[2],
- timer.time[3],
- timer.time[4],
- timer.time[5],
- timer.time[6]);
- printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
- input_size * sizeof(T) / timer.time[2],
- input_size * sizeof(T) / (timer.time[4]),
- input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
- printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
- input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]),
- input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
- input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
- printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
- input_size / timer.time[2],
- input_size / (timer.time[4]),
- input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
- printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
- input_size / (timer.time[3] + timer.time[4] + timer.time[5]),
- input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
- input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
- }
-
- }
-
- #if ENERGY
- double energy;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
- printf("DPU Energy (J): %f\t", energy);
- #endif
-
- // Check output
- bool status = true;
- if(p.exp == 1)
- for (unsigned int j = 0; j < p.bins; j++) {
- if(histo_host[j] != histo[j]){
- status = false;
+ if (rep >= p.n_warmup) {
+ printf
+ ("[::] HST-S-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d n_bins=%d",
+ nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T),
+ input_size, p.bins);
+ printf
+ (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+ WITH_FREE_OVERHEAD, numa_node_rank);
+ printf
+ ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+ timer.time[0], timer.time[1], timer.time[2],
+ timer.time[3], timer.time[4], timer.time[5],
+ timer.time[6]);
+ printf
+ (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+ input_size * sizeof(T) / timer.time[2],
+ input_size * sizeof(T) / (timer.time[4]),
+ input_size * sizeof(T) / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5] +
+ timer.time[6]));
+ printf
+ (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+ input_size * sizeof(T) / (timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ input_size * sizeof(T) / (timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ input_size * sizeof(T) / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]));
+ printf
+ (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+ input_size / timer.time[2],
+ input_size / (timer.time[4]),
+ input_size / (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[4] +
+ timer.time[5] + timer.time[6]));
+ printf
+ (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+ input_size / (timer.time[3] + timer.time[4] +
+ timer.time[5]),
+ input_size / (timer.time[1] + timer.time[3] +
+ timer.time[4] + timer.time[5]),
+ input_size / (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[4] +
+ timer.time[5]));
+ }
+
+ }
+
+#if ENERGY
+ double energy;
+ DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+ printf("DPU Energy (J): %f\t", energy);
+#endif
+
+ // Check output
+ bool status = true;
+ if (p.exp == 1)
+ for (unsigned int j = 0; j < p.bins; j++) {
+ if (histo_host[j] != histo[j]) {
+ status = false;
#if PRINT
- printf("%u - %u: %u -- %u\n", j, j, histo_host[j], histo[j]);
+ printf("%u - %u: %u -- %u\n", j, j,
+ histo_host[j], histo[j]);
#endif
- }
- }
- else if(p.exp == 2)
- for (unsigned int j = 0; j < p.bins; j++) {
- if(dpu_s * histo_host[j] != histo[j]){
- status = false;
+ }
+ } else if (p.exp == 2)
+ for (unsigned int j = 0; j < p.bins; j++) {
+ if (dpu_s * histo_host[j] != histo[j]) {
+ status = false;
#if PRINT
- printf("%u - %u: %u -- %u\n", j, j, dpu_s * histo_host[j], histo[j]);
+ printf("%u - %u: %u -- %u\n", j, j,
+ dpu_s * histo_host[j], histo[j]);
#endif
- }
- }
- else
- for (unsigned int j = 0; j < p.bins; j++) {
- if(NR_DPUS * histo_host[j] != histo[j]){
- status = false;
+ }
+ } else
+ for (unsigned int j = 0; j < p.bins; j++) {
+ if (NR_DPUS * histo_host[j] != histo[j]) {
+ status = false;
#if PRINT
- printf("%u - %u: %u -- %u\n", j, j, NR_DPUS * histo_host[j], histo[j]);
+ printf("%u - %u: %u -- %u\n", j, j,
+ NR_DPUS * histo_host[j], histo[j]);
#endif
- }
- }
- if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
- } else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
- }
-
- // Deallocation
- free(A);
- free(histo_host);
- free(histo);
+ }
+ }
+ if (status) {
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+ "] Outputs are equal\n");
+ } else {
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+ "] Outputs differ!\n");
+ }
+
+ // Deallocation
+ free(A);
+ free(histo_host);
+ free(histo);
#if !WITH_ALLOC_OVERHEAD
DPU_ASSERT(dpu_free(dpu_set));
#endif
-
- return status ? 0 : -1;
+
+ return status ? 0 : -1;
}
diff --git a/HST-S/support/common.h b/HST-S/support/common.h
index 30df40d..e0cacbb 100755
--- a/HST-S/support/common.h
+++ b/HST-S/support/common.h
@@ -13,8 +13,8 @@
// Data type
#define T uint32_t
-#define DIV 2 // Shift right to divide by sizeof(T)
-#define REGS (BLOCK_SIZE >> 2) // 32 bits
+#define DIV 2 // Shift right to divide by sizeof(T)
+#define REGS (BLOCK_SIZE >> 2) // 32 bits
// Pixel depth
#define DEPTH 12
@@ -22,19 +22,19 @@
// Structures used by both the host and the dpu to communicate information
typedef struct {
- uint32_t size;
- uint32_t transfer_size;
- uint32_t bins;
+ uint32_t size;
+ uint32_t transfer_size;
+ uint32_t bins;
enum kernels {
- kernel1 = 0,
- nr_kernels = 1,
+ kernel1 = 0,
+ nr_kernels = 1,
} kernel;
} dpu_arguments_t;
#ifndef ENERGY
#define ENERGY 0
#endif
-#define PRINT 0
+#define PRINT 0
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
diff --git a/HST-S/support/params.h b/HST-S/support/params.h
index e29449b..3028a50 100644
--- a/HST-S/support/params.h
+++ b/HST-S/support/params.h
@@ -4,64 +4,80 @@
#include "common.h"
typedef struct Params {
- unsigned int input_size;
- unsigned int bins;
- int n_warmup;
- int n_reps;
- const char *file_name;
- int exp;
- int dpu_s;
-}Params;
+ unsigned int input_size;
+ unsigned int bins;
+ int n_warmup;
+ int n_reps;
+ const char *file_name;
+ int exp;
+ int dpu_s;
+} Params;
-static void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n -x <X> Weak (0) or strong (1, 2) scaling (default=0)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -i <I> input size (default=1536*1024 elements)"
- "\n -b <B> histogram size (default=256 bins)"
- "\n -f <F> input image file (default=../input/image_VanHateren.iml)"
- "\n");
+static void usage()
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -w <W> # of untimed warmup iterations (default=1)"
+ "\n -e <E> # of timed repetition iterations (default=3)"
+ "\n -x <X> Weak (0) or strong (1, 2) scaling (default=0)"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -i <I> input size (default=1536*1024 elements)"
+ "\n -b <B> histogram size (default=256 bins)"
+ "\n -f <F> input image file (default=../input/image_VanHateren.iml)"
+ "\n");
}
-struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.input_size = 1536 * 1024;
- p.bins = 256;
- p.n_warmup = 1;
- p.n_reps = 3;
- p.exp = 0;
- p.file_name = "./input/image_VanHateren.iml";
- p.dpu_s = 64;
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.input_size = 1536 * 1024;
+ p.bins = 256;
+ p.n_warmup = 1;
+ p.n_reps = 3;
+ p.exp = 0;
+ p.file_name = "./input/image_VanHateren.iml";
+ p.dpu_s = 64;
- int opt;
- while((opt = getopt(argc, argv, "hi:b:w:e:f:x:z:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'i': p.input_size = atoi(optarg); break;
- case 'b': p.bins = atoi(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- case 'f': p.file_name = optarg; break;
- case 'x': p.exp = atoi(optarg); break;
- case 'z': p.dpu_s = atoi(optarg); break;
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(NR_DPUS > 0 && "Invalid # of dpus!");
+ int opt;
+ while ((opt = getopt(argc, argv, "hi:b:w:e:f:x:z:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'i':
+ p.input_size = atoi(optarg);
+ break;
+ case 'b':
+ p.bins = atoi(optarg);
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ case 'f':
+ p.file_name = optarg;
+ break;
+ case 'x':
+ p.exp = atoi(optarg);
+ break;
+ case 'z':
+ p.dpu_s = atoi(optarg);
+ break;
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+ assert(NR_DPUS > 0 && "Invalid # of dpus!");
- return p;
+ return p;
}
#endif
diff --git a/HST-S/support/timer.h b/HST-S/support/timer.h
index 4d597b9..df68334 100755
--- a/HST-S/support/timer.h
+++ b/HST-S/support/timer.h
@@ -1,66 +1,74 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer {
+
+ struct timeval startTime[7];
+ struct timeval stopTime[7];
+ double time[7];
+
+} Timer;
+
+void start(Timer *timer, int i, int rep)
+{
+ if (rep == 0) {
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i)
+{
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] +=
+ (timer->stopTime[i].tv_sec -
+ timer->startTime[i].tv_sec) * 1000000.0 +
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+}
+
+void print(Timer *timer, int i, int REP)
+{
+ printf("Time (ms): %f\t", timer->time[i] / (1000 * REP));
+}
+
+void printall(Timer *timer, int maxt)
+{
+ for (int i = 0; i <= maxt; i++) {
+ printf(" timer%d_us=%f", i, timer->time[i]);
+ }
+ printf("\n");
+}
diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile
index e2e6780..7eb5f00 100644
--- a/MLP/baselines/cpu/Makefile
+++ b/MLP/baselines/cpu/Makefile
@@ -1,7 +1,28 @@
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+ CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
+ LDFLAGS += -lnuma
+endif
+
all: mlp_openmp
mlp_openmp: mlp_openmp.c
- gcc -O2 mlp_openmp.c -o mlp_openmp -fopenmp -std=c99
+ gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} mlp_openmp.c -o mlp_openmp -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -fopenmp -std=c99 ${LDFLAGS}
mlp_openmp_O0: mlp_openmp.c
gcc mlp_openmp.c -o mlp_openmp_O0 -fopenmp -std=c99
@@ -18,4 +39,7 @@ run_O0: mlp_openmp_O0
run_O2: mlp_openmp_O2
./mlp_openmp_O2
-.PHONY: all run run_O0 run_O2
+clean:
+ rm -f mlp_openmp mlp_openmp_O0 mlp_openmp_O2
+
+.PHONY: all run run_O0 run_O2 clean
diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c
index 8f95e7c..b473d7a 100644
--- a/MLP/baselines/cpu/mlp_openmp.c
+++ b/MLP/baselines/cpu/mlp_openmp.c
@@ -11,173 +11,261 @@
#include <getopt.h>
#include <assert.h>
#include <stdint.h>
-#include "../../support/timer.h"
#include "../../support/common.h"
+#if WITH_BENCHMARK
+#include "../../support/timer.h"
+#else
+#define start(...)
+#define stop(...)
+#endif
+
+#if NUMA
+#include <numaif.h>
+#include <numa.h>
+
+void *mp_pages[1];
+int mp_status[1];
+int mp_nodes[1];
+int numa_node_data = -1;
+int numa_node_cpu = -1;
+#endif
+
#define XSTR(x) STR(x)
#define STR(x) #x
-T** A;
-T* B;
-T* C;
+// weights
+T **A;
+
+// input/output
+T *B;
+
+// intermediate
+T *C;
// Create input arrays
-static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){
- for (unsigned int l = 0; l < NUM_LAYERS; l++)
- for (unsigned int i = 0; i < m_size * n_size; i++){
- if(i % 100 < 98){
+static void init_data(T **A, unsigned int m_size, unsigned int n_size)
+{
+ for (unsigned int l = 0; l < NUM_LAYERS; l++) {
+ for (unsigned int i = 0; i < m_size * n_size; i++) {
+ if (i % 100 < 98) {
A[l][i] = 0;
- }else{
- A[l][i] = (l+i) % 2;
+ } else {
+ A[l][i] = (l + i) % 2;
}
}
- for (unsigned int i = 0; i < n_size; i++){
- if(i % 50 < 48){
+ }
+}
+
+static void init_B(T *B, unsigned int n_size)
+{
+ for (unsigned int i = 0; i < n_size; i++) {
+ if (i % 50 < 48) {
B[i] = 0;
- }
- else{
+ } else {
B[i] = i % 2;
}
}
}
// Compute output in the host
-static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
- for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
- for (unsigned int m = 0; m < m_size; m++){
+static void mlp_host(T *C, T **A, T *B, unsigned int m_size,
+ unsigned int n_size)
+{
+ for (unsigned int nl = 0; nl < NUM_LAYERS; nl++) {
+ for (unsigned int m = 0; m < m_size; m++) {
C[m] = 0;
}
- #pragma omp parallel for
- for (unsigned int m = 0; m < m_size; m++){
- for (unsigned int n = 0; n < n_size; n++){
+#pragma omp parallel for
+ for (unsigned int m = 0; m < m_size; m++) {
+ for (unsigned int n = 0; n < n_size; n++) {
C[m] += A[nl][m * n_size + n] * B[n];
}
C[m] = max(0, C[m]);
}
- for (unsigned int n = 0; n < n_size; n++){
+ for (unsigned int n = 0; n < n_size; n++) {
B[n] = C[n];
}
}
}
-static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) {
- uint64_t sum = 0;
- for (uint64_t m = 0; m < n_size; m++){
- sum += B[m];
- }
- return sum;
+static uint64_t mlp_host_sum(uint64_t n_size)
+{
+ uint64_t sum = 0;
+ for (uint64_t m = 0; m < n_size; m++) {
+ sum += B[m];
+ }
+ return sum;
}
// Params ---------------------------------------------------------------------
typedef struct Params {
- char* dpu_type;
- int nr_of_ranks;
- int input_size_n;
- int input_size_m;
- int n_warmup;
- int n_reps;
-}Params;
-
-void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -d <D> DPU type (default=fsim)"
- "\n -r <R> # of ranks (default=2)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -i <I> input size (default=8M elements)"
- "\n");
- }
-
- struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.dpu_type = "fsim";
- p.nr_of_ranks = 1;
- p.input_size_n = 1 << 9;
- p.input_size_m = 1 << 9;
- p.n_warmup = 2;
- p.n_reps = 3;
-
- int opt;
- while((opt = getopt(argc, argv, "hd:r:i:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'd': p.dpu_type = optarg; break;
- case 'r': p.nr_of_ranks = atoi(optarg); break;
- case 'n': p.input_size_n = atoi(optarg); break;
- case 'm': p.input_size_m = atoi(optarg); break;
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(p.nr_of_ranks > 0 && "Invalid # of ranks!");
-
- return p;
- }
+ int input_size_n;
+ int input_size_m;
+ int n_reps;
+#if NUMA
+ struct bitmask *bitmask;
+ int numa_node_cpu;
+#endif
+} Params;
+
+void usage()
+{
+ fprintf(stderr, "\nUsage: ./program [options]" "\n");
+}
+
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.input_size_n = 8192;
+ p.input_size_m = 20480;
+ p.n_reps = 100;
+#if NUMA
+ p.bitmask = NULL;
+ p.numa_node_cpu = -1;
+#endif
+
+ int opt;
+ while ((opt = getopt(argc, argv, "e:n:m:A:C:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ case 'n':
+ p.input_size_n = atoi(optarg);
+ break;
+ case 'm':
+ p.input_size_m = atoi(optarg);
+ break;
+#if NUMA
+ case 'A':
+ p.bitmask = numa_parse_nodestring(optarg);
+ break;
+ case 'C':
+ p.numa_node_cpu = atoi(optarg);
+ break;
+#endif
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+
+ return p;
+}
/**
* @brief Main of the Host Application.
*/
- int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
- struct Params p = input_params(argc, argv);
- uint64_t n_size = 8192;
- uint64_t m_size = 20480;
+ struct Params p = input_params(argc, argv);
+ uint64_t n_size = p.input_size_n;
+ uint64_t m_size = p.input_size_m;
- Timer timer;
- A = malloc(NUM_LAYERS * sizeof(T*));
- for(int l = 0; l < NUM_LAYERS; l++)
- A[l] = malloc(n_size*m_size*sizeof(unsigned int));
- B = malloc(m_size*sizeof(unsigned int));
- C = malloc(m_size*sizeof(unsigned int));
+#if WITH_BENCHMARK
+ Timer timer;
+#endif
- for (int i = 0; i < 100; i++) {
- // Create an input file with arbitrary data.
- init_data(A, B, m_size, n_size);
+#if NUMA
+ if (p.bitmask) {
+ numa_set_membind(p.bitmask);
+ numa_free_nodemask(p.bitmask);
+ }
+ A = numa_alloc(NUM_LAYERS * sizeof(T *));
+ for (int l = 0; l < NUM_LAYERS; l++) {
+ A[l] = numa_alloc(n_size * m_size * sizeof(unsigned int));
+ }
+ B = numa_alloc(m_size * sizeof(unsigned int));
+ C = numa_alloc(m_size * sizeof(unsigned int));
+
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_data = mp_status[0];
+ }
+
+ numa_node_cpu = p.numa_node_cpu;
+ if (numa_node_cpu != -1) {
+ if (numa_run_on_node(numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+#else
+ A = malloc(NUM_LAYERS * sizeof(T *));
+ for (int l = 0; l < NUM_LAYERS; l++) {
+ A[l] = malloc(n_size * m_size * sizeof(unsigned int));
+ }
+ B = malloc(m_size * sizeof(unsigned int));
+ C = malloc(m_size * sizeof(unsigned int));
+#endif
+
+ // Create an input file with arbitrary data.
+ init_data(A, m_size, n_size);
+
+ for (int i = 0; i < p.n_reps; i++) {
+ init_B(B, n_size);
- start(&timer, 0, 0);
- mlp_host(C, A, B, n_size, m_size);
- stop(&timer, 0);
+ start(&timer, 0, 0);
+ mlp_host(C, A, B, n_size, m_size);
+ stop(&timer, 0);
- unsigned int nr_threads = 0;
+#if WITH_BENCHMARK
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
-
- printf("[::] n_threads=%d e_type=%s n_elements=%lu "
- "| throughput_cpu_omp_MBps=%f\n",
- nr_threads, XSTR(T), n_size * m_size,
- n_size * m_size * sizeof(T) / timer.time[0]);
- printf("[::] n_threads=%d e_type=%s n_elements=%lu "
- "| throughput_cpu_omp_MOpps=%f\n",
- nr_threads, XSTR(T), n_size * m_size,
- n_size * m_size / timer.time[0]);
- printf("[::] n_threads=%d e_type=%s n_elements=%lu |",
- nr_threads, XSTR(T), n_size * m_size);
- printall(&timer, 0);
- }
-
- uint32_t sum = mlp_host_sum(n_size, m_size);
-
- printf("Kernel ");
- print(&timer, 0, 1);
- printf("\n");
-
- printf("SUM = %d \n", sum);
-
- for(int l = 0; l < NUM_LAYERS; l++)
- free(A[l]);
- free(A);
- free(B);
- free(C);
-
- return 0;
+ nr_threads++;
+
+ printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu",
+ nr_threads, XSTR(T), n_size * m_size);
+#if NUMA
+ printf
+ (" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d",
+ numa_node_data, numa_node_cpu,
+ numa_distance(numa_node_data, numa_node_cpu));
+#endif
+ printf(" | throughput_MBps=%f throughput_MOpps=%f",
+ n_size * m_size * sizeof(T) / timer.time[0],
+ n_size * m_size / timer.time[0]);
+ printf(" latency_us=%f\n", timer.time[0]);
+#endif // WITH_BENCHMARK
+ }
+
+#if NOP_SYNC
+ for (int rep = 0; rep < 200000; rep++) {
+ asm volatile ("nop"::);
+ }
+#endif
+
+ uint32_t sum = mlp_host_sum(n_size);
+
+ printf("SUM = %d \n", sum);
+
+#if NUMA
+ for (int l = 0; l < NUM_LAYERS; l++) {
+ numa_free(A[l], n_size * m_size * sizeof(unsigned int));
+ }
+ numa_free(A, NUM_LAYERS * sizeof(T *));
+ numa_free(B, m_size * sizeof(unsigned int));
+ numa_free(C, m_size * sizeof(unsigned int));
+#else
+ for (int l = 0; l < NUM_LAYERS; l++) {
+ free(A[l]);
+ }
+ free(A);
+ free(B);
+ free(C);
+#endif
+
+ return 0;
}
diff --git a/MLP/dpu/task.c b/MLP/dpu/task.c
index de3e554..4f85024 100644
--- a/MLP/dpu/task.c
+++ b/MLP/dpu/task.c
@@ -15,7 +15,8 @@
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// GEMV
-static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
+static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos)
+{
for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) {
bufferC[pos] += bufferA[i] * bufferB[i];
}
@@ -26,13 +27,14 @@ static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
BARRIER_INIT(my_barrier, NR_TASKLETS);
// main
-int main() {
+int main()
+{
unsigned int tasklet_id = me();
#if PRINT
printf("tasklet_id = %u\n", tasklet_id);
#endif
- if (tasklet_id == 0){ // Initialize once the cycle counter
- mem_reset(); // Reset the heap
+ if (tasklet_id == 0) { // Initialize once the cycle counter
+ mem_reset(); // Reset the heap
}
// Barrier
barrier_wait(&my_barrier);
@@ -42,12 +44,11 @@ int main() {
uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows;
uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows;
-
unsigned int nrows = nr_rows;
- unsigned int rows_per_tasklet;
+ unsigned int rows_per_tasklet;
unsigned int start_row;
unsigned int chunks = nrows / (NR_TASKLETS + NR_TASKLETS);
- unsigned int dbl_chunks = chunks + chunks;
+ unsigned int dbl_chunks = chunks + chunks;
rows_per_tasklet = dbl_chunks;
unsigned int rest_rows = nrows % (NR_TASKLETS + NR_TASKLETS);
@@ -57,19 +58,30 @@ int main() {
if ((tasklet_id + tasklet_id) >= rest_rows) {
unsigned int hlf_rest_rows = rest_rows >> 1;
if ((rest_rows & 1) == 1)
- start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
+ start_row =
+ (hlf_rest_rows + 1) * (dbl_chunks + 2) +
+ (tasklet_id - 1 -
+ hlf_rest_rows) * dbl_chunks;
else
- start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
- } else
+ start_row =
+ (hlf_rest_rows) * (dbl_chunks + 2) +
+ (tasklet_id - hlf_rest_rows) * dbl_chunks;
+ } else
start_row = tasklet_id * (dbl_chunks + 2);
} else {
start_row = tasklet_id * (dbl_chunks);
}
// Address of the current row in MRAM
- uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
- uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T));
- uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T));
+ uint32_t mram_base_addr_A =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
+ uint32_t mram_base_addr_B =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER +
+ max_rows * n_size_pad * sizeof(T));
+ uint32_t mram_base_addr_C =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER +
+ max_rows * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T) + start_row * sizeof(T));
uint32_t mram_temp_addr_A = mram_base_addr_A;
uint32_t mram_temp_addr_B = mram_base_addr_B;
@@ -82,34 +94,44 @@ int main() {
int offset = 0;
// Iterate over nr_rows
- for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) {
+ for (unsigned int i = start_row; i < start_row + rows_per_tasklet;
+ i += 2) {
- mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
+ mram_temp_addr_A =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
cache_C[0] = 0;
cache_C[1] = 0;
- for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){
+ for (unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++) {
int n = 0, j;
- for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T)))
- {
-
- mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
- mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
- if(offset)
- {
-
- for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++)
- {
+ for (n = 0;
+ n < (int32_t) (n_size - (BLOCK_SIZE / sizeof(T)));
+ n += (BLOCK_SIZE / sizeof(T))) {
+
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A), cache_A,
+ BLOCK_SIZE);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_B), cache_B,
+ BLOCK_SIZE);
+
+ if (offset) {
+
+ for (unsigned int off = 0;
+ off < (BLOCK_SIZE / sizeof(T)) - 1;
+ off++) {
cache_A[off] = cache_A[off + 1];
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A +
+ BLOCK_SIZE), cache_A_aux,
+ 8);
- cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+ cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+ cache_A_aux[0];
}
-
// Compute GEMV
gemv(cache_C, cache_A, cache_B, pos);
@@ -118,49 +140,51 @@ int main() {
mram_temp_addr_B += BLOCK_SIZE;
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
+ mram_read((__mram_ptr void const *)(mram_temp_addr_A),
+ cache_A, BLOCK_SIZE);
-
- if(offset)
- {
- for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++)
- {
+ if (offset) {
+ for (unsigned int off = 0;
+ off < (BLOCK_SIZE / sizeof(T)) - 1;
+ off++) {
cache_A[off] = cache_A[off + 1];
}
- mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8);
+ mram_read((__mram_ptr void const
+ *)(mram_temp_addr_A + BLOCK_SIZE),
+ cache_A_aux, 8);
- cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
+ cache_A[BLOCK_SIZE / sizeof(T) - 1] =
+ cache_A_aux[0];
}
+ mram_read((__mram_ptr void const *)(mram_temp_addr_B),
+ cache_B, BLOCK_SIZE);
- mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
-
- for (j = 0; j < (int) (n_size - n); j++) {
+ for (j = 0; j < (int)(n_size - n); j++) {
// Compute GEMV
- if(j >= (int)(BLOCK_SIZE / sizeof(T))){
+ if (j >= (int)(BLOCK_SIZE / sizeof(T))) {
printf("error\n");
break;
}
cache_C[pos] += cache_A[j] * cache_B[j];
}
-
- mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T));
+ mram_temp_addr_A +=
+ (BLOCK_SIZE -
+ ((BLOCK_SIZE / sizeof(T)) -
+ (n_size - n)) * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
- if(mram_temp_addr_A % 8 != 0)
- {
+ if (mram_temp_addr_A % 8 != 0) {
offset = 1;
- }
- else
- {
+ } else {
offset = 0;
}
}
// Write cache to current MRAM block
- mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8);
+ mram_write(cache_C, (__mram_ptr void *)(mram_base_addr_C), 8);
// Update memory address
mram_base_addr_C += 2 * sizeof(T);
diff --git a/MLP/host/app.c b/MLP/host/app.c
index 952cb3f..24243bf 100644
--- a/MLP/host/app.c
+++ b/MLP/host/app.c
@@ -27,28 +27,29 @@
#define DPU_BINARY "./bin/mlp_dpu"
#endif
-static T** A;
-static T* B;
-static T* B_host;
-static T* B_tmp;
-static T* C;
-static T* C_dpu;
+static T **A;
+static T *B;
+static T *B_host;
+static T *B_tmp;
+static T *C;
+static T *C_dpu;
// Create input arrays
-static void init_data(T** A, T* B, T* B_host, unsigned int m_size, unsigned int n_size) {
+static void init_data(T **A, T *B, T *B_host, unsigned int m_size,
+ unsigned int n_size)
+{
for (unsigned int l = 0; l < NUM_LAYERS; l++)
- for (unsigned int i = 0; i < m_size * n_size; i++){
- if(i % 100 < 98){
+ for (unsigned int i = 0; i < m_size * n_size; i++) {
+ if (i % 100 < 98) {
A[l][i] = 0;
- }else{
- A[l][i] = (l+i) % 2;
+ } else {
+ A[l][i] = (l + i) % 2;
}
}
- for (unsigned int i = 0; i < n_size; i++){
- if(i % 50 < 48){
+ for (unsigned int i = 0; i < n_size; i++) {
+ if (i % 50 < 48) {
B[i] = 0;
- }
- else{
+ } else {
B[i] = i % 2;
}
B_host[i] = B[i];
@@ -56,26 +57,29 @@ static void init_data(T** A, T* B, T* B_host, unsigned int m_size, unsigned int
}
// Compute output in the host
-static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) {
+static void mlp_host(T *C, T **A, T *B, unsigned int m_size,
+ unsigned int n_size)
+{
- for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){
- for (unsigned int m = 0; m < m_size; m++){
+ for (unsigned int nl = 0; nl < NUM_LAYERS; nl++) {
+ for (unsigned int m = 0; m < m_size; m++) {
C[m] = 0;
}
- for (unsigned int m = 0; m < m_size; m++){
- for (unsigned int n = 0; n < n_size; n++){
+ for (unsigned int m = 0; m < m_size; m++) {
+ for (unsigned int n = 0; n < n_size; n++) {
C[m] += A[nl][m * n_size + n] * B[n];
}
C[m] = max(0, C[m]);
}
- for (unsigned int n = 0; n < n_size; n++){
+ for (unsigned int n = 0; n < n_size; n++) {
B[n] = C[n];
}
}
}
// Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
struct Params p = input_params(argc, argv);
@@ -97,14 +101,15 @@ int main(int argc, char **argv) {
unsigned int n_size = p.n_size;
// Initialize help data
- dpu_info = (struct dpu_info_t *) malloc(nr_of_dpus * sizeof(struct dpu_info_t));
- dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t));
+ dpu_info =
+ (struct dpu_info_t *)malloc(nr_of_dpus * sizeof(struct dpu_info_t));
+ dpu_arguments_t *input_args =
+ (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t));
uint32_t max_rows_per_dpu = 0;
uint32_t n_size_pad = n_size;
- if(n_size % 2 == 1){
+ if (n_size % 2 == 1) {
n_size_pad++;
}
-
// Timer
Timer timer;
i = 0;
@@ -118,7 +123,10 @@ int main(int argc, char **argv) {
rows_per_dpu++;
if (rest_rows > 0) {
if (i >= rest_rows)
- prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks;
+ prev_rows_dpu =
+ rest_rows * (chunks + 1) + (i -
+ rest_rows) *
+ chunks;
else
prev_rows_dpu = i * (chunks + 1);
} else {
@@ -127,7 +135,7 @@ int main(int argc, char **argv) {
// Keep max rows for parallel transfers
uint32_t rows_per_dpu_pad = rows_per_dpu;
- if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
+ if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
rows_per_dpu_pad++;
if (rows_per_dpu_pad > max_rows_per_dpu)
max_rows_per_dpu = rows_per_dpu_pad;
@@ -142,14 +150,15 @@ int main(int argc, char **argv) {
input_args[i].nr_rows = rows_per_dpu;
}
- A = (T**)malloc(NUM_LAYERS * sizeof(T*));
- for(l = 0; l < NUM_LAYERS; l++)
- A[l] = (T*)malloc( max_rows_per_dpu * nr_of_dpus * n_size_pad * sizeof(T));
+ A = (T **) malloc(NUM_LAYERS * sizeof(T *));
+ for (l = 0; l < NUM_LAYERS; l++)
+ A[l] =
+ (T *) malloc(max_rows_per_dpu * nr_of_dpus * n_size_pad *
+ sizeof(T));
-
- B = (T*)malloc(n_size * sizeof(T));
- B_host = (T*)malloc(n_size * sizeof(T));
- C = (T*)malloc(m_size * sizeof(T));
+ B = (T *) malloc(n_size * sizeof(T));
+ B_host = (T *) malloc(n_size * sizeof(T));
+ C = (T *) malloc(m_size * sizeof(T));
C_dpu = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
B_tmp = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
@@ -170,26 +179,36 @@ int main(int argc, char **argv) {
input_args[i].max_rows = max_rows_per_dpu;
DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
-
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+ sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));
// Copy input array and vector
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, A[0] + dpu_info[i].prev_rows_dpu * n_size));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu,
+ A[0] + dpu_info[i].prev_rows_dpu * n_size));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ DPU_XFER_DEFAULT));
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, B));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
if (rep >= p.n_warmup)
stop(&timer, 1);
// Run kernel on DPUs
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
start(&timer, 2, rep - p.n_warmup);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
@@ -198,31 +217,38 @@ int main(int argc, char **argv) {
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
stop(&timer, 2);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
#endif
}
- for(int lay = 1; lay < NUM_LAYERS; lay++){
+ for (int lay = 1; lay < NUM_LAYERS; lay++) {
if (rep >= p.n_warmup)
start(&timer, 4, rep - p.n_warmup);
i = 0;
// Copy C_dpu
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, C_dpu + i * max_rows_per_dpu));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T),
+ max_rows_per_dpu * sizeof(T),
+ DPU_XFER_DEFAULT));
// B = C
unsigned int n, j;
i = 0;
for (n = 0; n < nr_of_dpus; n++) {
for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
- B_tmp[i] = C_dpu[n * max_rows_per_dpu + j];
+ B_tmp[i] =
+ C_dpu[n * max_rows_per_dpu + j];
i++;
}
}
@@ -230,20 +256,31 @@ int main(int argc, char **argv) {
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, B_tmp));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
// Copy next matrix of weights
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, A[lay] + dpu_info[i].prev_rows_dpu * n_size));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu,
+ A[lay] +
+ dpu_info[i].prev_rows_dpu *
+ n_size));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ max_rows_per_dpu * n_size_pad * sizeof(T),
+ DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup)
+ if (rep >= p.n_warmup)
stop(&timer, 4);
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
start(&timer, 2, rep - p.n_warmup);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
@@ -252,8 +289,7 @@ int main(int argc, char **argv) {
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
stop(&timer, 2);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
@@ -273,16 +309,23 @@ int main(int argc, char **argv) {
start(&timer, 3, rep - p.n_warmup);
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, C_dpu + i * max_rows_per_dpu));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup)
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ max_rows_per_dpu * n_size_pad * sizeof(T) +
+ n_size_pad * sizeof(T),
+ max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
+ if (rep >= p.n_warmup)
stop(&timer, 3);
}
#if ENERGY
double acc_energy, avg_energy, acc_time, avg_time;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+ DPU_ASSERT(dpu_probe_get
+ (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
@@ -311,23 +354,26 @@ int main(int argc, char **argv) {
i = 0;
for (n = 0; n < nr_of_dpus; n++) {
for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
- if(C[i] != C_dpu[n * max_rows_per_dpu + j]) {
+ if (C[i] != C_dpu[n * max_rows_per_dpu + j]) {
status = false;
#if PRINT
- printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
+ printf("%d: %d -- %d\n", i, C[i],
+ C_dpu[n * max_rows_per_dpu + j]);
#endif
}
i++;
}
}
if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+ "] Outputs are equal\n");
} else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+ "] Outputs differ!\n");
}
// Deallocation
- for(i = 0; i < NUM_LAYERS; i++)
+ for (i = 0; i < NUM_LAYERS; i++)
free(A[i]);
free(A);
free(B);
diff --git a/MLP/support/common.h b/MLP/support/common.h
index 53b2f1c..4b5031b 100755
--- a/MLP/support/common.h
+++ b/MLP/support/common.h
@@ -3,21 +3,21 @@
// Structures used by both the host and the dpu to communicate information
typedef struct {
- uint32_t n_size;
- uint32_t n_size_pad;
- uint32_t nr_rows;
- uint32_t max_rows;
+ uint32_t n_size;
+ uint32_t n_size_pad;
+ uint32_t nr_rows;
+ uint32_t max_rows;
} dpu_arguments_t;
// Specific information for each DPU
struct dpu_info_t {
- uint32_t rows_per_dpu;
- uint32_t rows_per_dpu_pad;
- uint32_t prev_rows_dpu;
+ uint32_t rows_per_dpu;
+ uint32_t rows_per_dpu_pad;
+ uint32_t prev_rows_dpu;
};
struct dpu_info_t *dpu_info;
-#define NUM_LAYERS 3
+#define NUM_LAYERS 3
#define max(x, y) (x > y ? x : y)
#define min(x, y) (x < y ? x : y)
diff --git a/MLP/support/params.h b/MLP/support/params.h
index f9e790e..4bfc2fc 100644
--- a/MLP/support/params.h
+++ b/MLP/support/params.h
@@ -4,53 +4,62 @@
#include "common.h"
typedef struct Params {
- unsigned int m_size;
- unsigned int n_size;
- unsigned int n_warmup;
- unsigned int n_reps;
-}Params;
+ unsigned int m_size;
+ unsigned int n_size;
+ unsigned int n_warmup;
+ unsigned int n_reps;
+} Params;
-static void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -m <I> m_size (default=2048 elements)"
- "\n -n <I> n_size (default=2048 elements)"
- "\n");
+static void usage()
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -w <W> # of untimed warmup iterations (default=1)"
+ "\n -e <E> # of timed repetition iterations (default=3)"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -m <I> m_size (default=2048 elements)"
+ "\n -n <I> n_size (default=2048 elements)" "\n");
}
-struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.m_size = 163840;
- p.n_size = 4096;
- p.n_warmup = 1;
- p.n_reps = 3;
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.m_size = 163840;
+ p.n_size = 4096;
+ p.n_warmup = 1;
+ p.n_reps = 3;
- int opt;
- while((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'm': p.m_size = atoi(optarg); break;
- case 'n': p.n_size = atoi(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(NR_DPUS > 0 && "Invalid # of dpus!");
+ int opt;
+ while ((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'm':
+ p.m_size = atoi(optarg);
+ break;
+ case 'n':
+ p.n_size = atoi(optarg);
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+ assert(NR_DPUS > 0 && "Invalid # of dpus!");
- return p;
+ return p;
}
#endif
diff --git a/MLP/support/timer.h b/MLP/support/timer.h
index 886380a..961ed11 100755
--- a/MLP/support/timer.h
+++ b/MLP/support/timer.h
@@ -1,62 +1,69 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[5];
- struct timeval stopTime[5];
- double time[5];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
- //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
-
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer {
+
+ struct timeval startTime[5];
+ struct timeval stopTime[5];
+ double time[5];
+
+} Timer;
+
+void start(Timer *timer, int i, int rep)
+{
+ if (rep == 0) {
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i)
+{
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] +=
+ (timer->stopTime[i].tv_sec -
+ timer->startTime[i].tv_sec) * 1000000.0 +
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+ //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
+ // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
+
+}
+
+void print(Timer *timer, int i, int REP)
+{
+ printf("%f\t", timer->time[i] / (1000 * REP));
+}
diff --git a/Microbenchmarks/CPU-DPU/nodmc25-alloc.sh b/Microbenchmarks/CPU-DPU/nodmc25-alloc.sh
new file mode 100755
index 0000000..869ada3
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/nodmc25-alloc.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+fn=log/$(hostname)/nodmc25-alloc
+
+source /opt/upmem/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+
+run_benchmark_nmc() {
+ local "$@"
+ set -e
+ sudo limit_ranks_to_numa_node ${numa_rank}
+ ./make-size.sh ${size}
+ n_nops=$((size * 256))
+ if make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\' NUMA=1; then
+ for l in $(seq 1 20); do
+ bin/host_code -c ${numa_cpu} -w 1 -e 0 -x 1 -i 65536 -N $n_nops -I $(size -A bin/dpu_size | awk '($1 == ".text") {print $2/8}')
+ done
+ fi
+ return $?
+}
+
+export -f run_benchmark_nmc
+
+(
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_cpu={numa_cpu} size={size} \
+ ::: i $(seq 1 5) \
+ ::: numa_rank -1 \
+ ::: numa_cpu 0 1 \
+ ::: nr_ranks $(seq 1 40) \
+ ::: size $(seq 0 15)
+
+) >> ${fn}.txt
diff --git a/Microbenchmarks/CPU-DPU/nodmc25-transfer.sh b/Microbenchmarks/CPU-DPU/nodmc25-transfer.sh
new file mode 100755
index 0000000..33bb12f
--- /dev/null
+++ b/Microbenchmarks/CPU-DPU/nodmc25-transfer.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+mkdir -p log/$(hostname)
+fn=log/$(hostname)/nodmc25-transfer
+
+source /opt/upmem/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+
+./make-size.sh 0
+
+run_benchmark_nmc() {
+ local "$@"
+ set -e
+ sudo limit_ranks_to_numa_node ${numa_rank}
+ make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1
+ bin/host_code -a $numa_in -b $numa_out -c $numa_cpu -w 0 -e 5 -x 0 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i ${input_size}
+ return $?
+}
+
+export -f run_benchmark_nmc
+
+# The benchmark allocates 3 * 64 * nr_ranks * 8B * input_size (one array for input, one array for output).
+# With 1048576 elements (8 MiB per DPU) and the maximum of 40 ranks, this gives a maximum allocation of 3 * 64 * 40 * 8 MiB = 60 GiB, which fits comfortably into system memory (128 GiB).
+# With 2097152 elements (16 MiB per DPU), we may encounter out-of-memory (OOM) conditions, since the UPMEM SDK also allocates some memory.
+
+(
+
+parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
+ run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \
+ ::: i $(seq 1 10) \
+ ::: numa_rank -1 \
+ ::: numa_in 0 1 \
+ ::: numa_out 0 1 \
+ ::: numa_cpu 0 1 \
+ ::: nr_ranks $(seq 1 40) \
+ ::: input_size 1 1048576
+
+) >> ${fn}.txt
diff --git a/SpMV/baselines/cpu/Makefile b/SpMV/baselines/cpu/Makefile
index 64b20db..a24b764 100644
--- a/SpMV/baselines/cpu/Makefile
+++ b/SpMV/baselines/cpu/Makefile
@@ -1,7 +1,15 @@
+native ?= 1
+
+CFLAGS =
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
all: spmv
spmv: app.c
- gcc -O2 -o spmv -fopenmp app.c
+ gcc -Wall -Wextra -pedantic ${CFLAGS} -O3 -o spmv -fopenmp app.c
spmv_O0: app.c
gcc -o spmv_O0 -fopenmp app.c
diff --git a/SpMV/baselines/cpu/run-perf.sh b/SpMV/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..714498d
--- /dev/null
+++ b/SpMV/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+
+make -B
+
+OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run
+OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run
diff --git a/TRNS/baselines/cpu/Makefile b/TRNS/baselines/cpu/Makefile
index 236f7bb..438b9fb 100644
--- a/TRNS/baselines/cpu/Makefile
+++ b/TRNS/baselines/cpu/Makefile
@@ -32,16 +32,23 @@
# THE SOFTWARE.
#
-NUMA ?= 0
-NUMA_MEMCPY ?= 0
-FLAGS =
+native ?= 1
+numa ?= 0
+numa_memcpy ?= 0
-ifeq (${NUMA}, 1)
+CFLAGS =
+LDFLAGS =
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
+
+ifeq (${numa}, 1)
FLAGS += -lnuma
endif
CXX=g++
-CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY}
+CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy}
LIB=-L/usr/lib/ -lm -pthread
@@ -52,7 +59,7 @@ EXE=trns
all: trns
trns: ${SRC}
- $(CXX) -O2 $(CXX_FLAGS) $(SRC) $(LIB) -o $(EXE) $(FLAGS)
+ $(CXX) -O3 $(CXX_FLAGS) ${CFLAGS} $(SRC) $(LIB) -o $(EXE) ${LDFLAGS}
trns_O0: ${SRC}
$(CXX) $(CXX_FLAGS) $(SRC) $(LIB) -o $(EXE)_O0
diff --git a/TRNS/baselines/cpu/run-perf.sh b/TRNS/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..f16a3b1
--- /dev/null
+++ b/TRNS/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+
+make -B numa=1
+
+perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./trns -w 0 -r 20 -p 2048 -o 2048 -m 16 -n 8 -t 1 -a 4 -c 4
+perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./trns -w 0 -r 20 -p 2048 -o 2048 -m 16 -n 8 -t 4 -a 4 -c 4
diff --git a/TRNS/dimes-hetsim-hbm.sh b/TRNS/dimes-hetsim-hbm.sh
index e2efaee..cc5dc68 100755
--- a/TRNS/dimes-hetsim-hbm.sh
+++ b/TRNS/dimes-hetsim-hbm.sh
@@ -32,7 +32,7 @@ fn=log/$(hostname)/dimes-hetsim-hbm
(
-make -B NUMA=1 NUMA_MEMCPY=1
+make -B numa=1 numa_memcpy=1
echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/3)" >&2
@@ -43,10 +43,9 @@ parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \
::: ram_in $(seq 0 15) \
:::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \
::: ram_local $(seq 0 15) \
- :::+ cpu $(seq 0 7) $(seq 0 7) \
- ::: input_size 167772160
+ :::+ cpu $(seq 0 7) $(seq 0 7)
-make -B NUMA=1
+make -B numa=1
echo "CPU single-node operation (2/3)" >&2
diff --git a/TRNS/dimes-hetsim-nmc.sh b/TRNS/dimes-hetsim-nmc.sh
index b5f6f13..80987e7 100755
--- a/TRNS/dimes-hetsim-nmc.sh
+++ b/TRNS/dimes-hetsim-nmc.sh
@@ -73,7 +73,7 @@ parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \
) >> ${fn}.txt
cd baselines/cpu
-make -B NUMA=1
+make -B numa=1
(
@@ -97,7 +97,7 @@ parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \
) >> ${fn}.txt
-make -B NUMA=1 NUMA_MEMCPY=1
+make -B numa=1 numa_memcpy=1
(
diff --git a/TS/baselines/cpu/mprofile.h b/TS/baselines/cpu/mprofile.h
index 120c225..bfaf052 100644
--- a/TS/baselines/cpu/mprofile.h
+++ b/TS/baselines/cpu/mprofile.h
@@ -10,5 +10,7 @@
//#define HBM_ALOC
//#define RANDOM_DIAGS
-int loadTimeSeriesFromFile (std::string infilename, std::vector<DTYPE> &A, int &timeSeriesLength);
-int saveProfileToFile(std::string outfilename, DTYPE * profile, int * profileIndex, int timeSeriesLength, int windowSize);
+int loadTimeSeriesFromFile(std::string infilename, std::vector < DTYPE > &A,
+ int &timeSeriesLength);
+int saveProfileToFile(std::string outfilename, DTYPE * profile,
+ int *profileIndex, int timeSeriesLength, int windowSize);
diff --git a/TS/baselines/cpu/run-perf.sh b/TS/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..453b64b
--- /dev/null
+++ b/TS/baselines/cpu/run-perf.sh
@@ -0,0 +1,8 @@
+#!/bin/zsh
+
+make -B NUMA=1
+
+for i in $(seq 1 20); do
+ OMP_NUM_THREADS=1 perf stat record -o t1.${i}.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./streamp_openmp inputs/randomlist10M.txt 256 4 4
+ OMP_NUM_THREADS=4 perf stat record -o t4.${i}.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./streamp_openmp inputs/randomlist10M.txt 256 4 4
+done
diff --git a/TS/dpu/task.c b/TS/dpu/task.c
index d704160..5a756aa 100644
--- a/TS/dpu/task.c
+++ b/TS/dpu/task.c
@@ -18,18 +18,18 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS;
__host dpu_result_t DPU_RESULTS[NR_TASKLETS];
// Dot product
-static void dot_product(DTYPE *vectorA, DTYPE *vectorA_aux, DTYPE *vectorB, DTYPE * result) {
-
- for(uint32_t i = 0; i < BLOCK_SIZE / sizeof(DTYPE); i++)
- {
- for(uint32_t j = 0; j < DOTPIP; j++)
- {
- if((j + i) > BLOCK_SIZE / sizeof(DTYPE) - 1)
- {
- result[j] += vectorA_aux[(j + i) - BLOCK_SIZE / sizeof(DTYPE)] * vectorB[i];
- }
- else
- {
+static void dot_product(DTYPE *vectorA, DTYPE *vectorA_aux, DTYPE *vectorB,
+ DTYPE *result)
+{
+
+ for (uint32_t i = 0; i < BLOCK_SIZE / sizeof(DTYPE); i++) {
+ for (uint32_t j = 0; j < DOTPIP; j++) {
+ if ((j + i) > BLOCK_SIZE / sizeof(DTYPE) - 1) {
+ result[j] +=
+ vectorA_aux[(j + i) -
+ BLOCK_SIZE / sizeof(DTYPE)] *
+ vectorB[i];
+ } else {
result[j] += vectorA[j + i] * vectorB[i];
}
}
@@ -40,43 +40,46 @@ BARRIER_INIT(my_barrier, NR_TASKLETS);
extern int main_kernel1(void);
-int(*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 };
-int main(void){
+int main(void)
+{
// Kernel
- return kernels[DPU_INPUT_ARGUMENTS.kernel]();
+ return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
}
// main_kernel1
-int main_kernel1() {
+int main_kernel1()
+{
unsigned int tasklet_id = me();
#if PRINT
printf("tasklet_id = %u\n", tasklet_id);
#endif
- if(tasklet_id == 0){
- mem_reset(); // Reset the heap
+ if (tasklet_id == 0) {
+ mem_reset(); // Reset the heap
}
// Barrier
barrier_wait(&my_barrier);
// Input arguments
- uint32_t query_length = DPU_INPUT_ARGUMENTS.query_length;
- DTYPE query_mean = DPU_INPUT_ARGUMENTS.query_mean;
- DTYPE query_std = DPU_INPUT_ARGUMENTS.query_std;
+ uint32_t query_length = DPU_INPUT_ARGUMENTS.query_length;
+ DTYPE query_mean = DPU_INPUT_ARGUMENTS.query_mean;
+ DTYPE query_std = DPU_INPUT_ARGUMENTS.query_std;
uint32_t slice_per_dpu = DPU_INPUT_ARGUMENTS.slice_per_dpu;
// Boundaries for current tasklet
- uint32_t myStartElem = tasklet_id * (slice_per_dpu / (NR_TASKLETS));
- uint32_t myEndElem = myStartElem + (slice_per_dpu / (NR_TASKLETS)) - 1;
+ uint32_t myStartElem = tasklet_id * (slice_per_dpu / (NR_TASKLETS));
+ uint32_t myEndElem = myStartElem + (slice_per_dpu / (NR_TASKLETS)) - 1;
// Check time series limit
- if(myEndElem > slice_per_dpu - query_length) myEndElem = slice_per_dpu - query_length;
+ if (myEndElem > slice_per_dpu - query_length)
+ myEndElem = slice_per_dpu - query_length;
// Starting address of the current processing block in MRAM
uint32_t mem_offset = (uint32_t) DPU_MRAM_HEAP_POINTER;
// Starting address of the query subsequence
- uint32_t current_mram_block_addr_query = (uint32_t)(mem_offset);
+ uint32_t current_mram_block_addr_query = (uint32_t) (mem_offset);
mem_offset += query_length * sizeof(DTYPE);
// Starting address of the time series slice
@@ -86,18 +89,18 @@ int main_kernel1() {
// Starting address of the time series means
mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE);
- uint32_t current_mram_block_addr_TSMean = (uint32_t)(mem_offset);
+ uint32_t current_mram_block_addr_TSMean = (uint32_t) (mem_offset);
// Starting address of the time series standard deviations
mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE);
- uint32_t current_mram_block_addr_TSSigma = (uint32_t)(mem_offset);
+ uint32_t current_mram_block_addr_TSSigma = (uint32_t) (mem_offset);
// Initialize local caches to store the MRAM blocks
- DTYPE *cache_TS = (DTYPE *) mem_alloc(BLOCK_SIZE);
- DTYPE *cache_TS_aux = (DTYPE *) mem_alloc(BLOCK_SIZE);
- DTYPE *cache_query = (DTYPE *) mem_alloc(BLOCK_SIZE);
- DTYPE *cache_TSMean = (DTYPE *) mem_alloc(BLOCK_SIZE);
- DTYPE *cache_TSSigma = (DTYPE *) mem_alloc(BLOCK_SIZE);
+ DTYPE *cache_TS = (DTYPE *) mem_alloc(BLOCK_SIZE);
+ DTYPE *cache_TS_aux = (DTYPE *) mem_alloc(BLOCK_SIZE);
+ DTYPE *cache_query = (DTYPE *) mem_alloc(BLOCK_SIZE);
+ DTYPE *cache_TSMean = (DTYPE *) mem_alloc(BLOCK_SIZE);
+ DTYPE *cache_TSSigma = (DTYPE *) mem_alloc(BLOCK_SIZE);
DTYPE *cache_dotprods = (DTYPE *) mem_alloc(BLOCK_SIZE);
// Create result structure pointer
@@ -108,41 +111,56 @@ int main_kernel1() {
DTYPE min_distance = DTYPE_MAX;
uint32_t min_index = 0;
-
- for(uint32_t i = myStartElem; i < myEndElem; i+= (BLOCK_SIZE / sizeof(DTYPE)))
- {
- for(uint32_t d = 0; d < DOTPIP; d++)
+ for (uint32_t i = myStartElem; i < myEndElem;
+ i += (BLOCK_SIZE / sizeof(DTYPE))) {
+ for (uint32_t d = 0; d < DOTPIP; d++)
cache_dotprods[d] = 0;
- current_mram_block_addr_TS = (uint32_t) starting_offset_ts + (i - myStartElem) * sizeof(DTYPE);
- current_mram_block_addr_query = (uint32_t) DPU_MRAM_HEAP_POINTER;
-
- for(uint32_t j = 0; j < (query_length) / (BLOCK_SIZE / sizeof(DTYPE)); j++)
- {
- mram_read((__mram_ptr void const *) current_mram_block_addr_TS, cache_TS, BLOCK_SIZE);
- mram_read((__mram_ptr void const *) current_mram_block_addr_TS + BLOCK_SIZE, cache_TS_aux, BLOCK_SIZE);
- mram_read((__mram_ptr void const *) current_mram_block_addr_query, cache_query, BLOCK_SIZE);
-
- current_mram_block_addr_TS += BLOCK_SIZE;
+ current_mram_block_addr_TS =
+ (uint32_t) starting_offset_ts + (i -
+ myStartElem) *
+ sizeof(DTYPE);
+ current_mram_block_addr_query =
+ (uint32_t) DPU_MRAM_HEAP_POINTER;
+
+ for (uint32_t j = 0;
+ j < (query_length) / (BLOCK_SIZE / sizeof(DTYPE)); j++) {
+ mram_read((__mram_ptr void const *)
+ current_mram_block_addr_TS, cache_TS,
+ BLOCK_SIZE);
+ mram_read((__mram_ptr void const *)
+ current_mram_block_addr_TS + BLOCK_SIZE,
+ cache_TS_aux, BLOCK_SIZE);
+ mram_read((__mram_ptr void const *)
+ current_mram_block_addr_query, cache_query,
+ BLOCK_SIZE);
+
+ current_mram_block_addr_TS += BLOCK_SIZE;
current_mram_block_addr_query += BLOCK_SIZE;
- dot_product(cache_TS, cache_TS_aux, cache_query, cache_dotprods);
+ dot_product(cache_TS, cache_TS_aux, cache_query,
+ cache_dotprods);
}
-
- mram_read((__mram_ptr void const *) current_mram_block_addr_TSMean, cache_TSMean, BLOCK_SIZE);
- mram_read((__mram_ptr void const *) current_mram_block_addr_TSSigma, cache_TSSigma, BLOCK_SIZE);
- current_mram_block_addr_TSMean += BLOCK_SIZE;
+ mram_read((__mram_ptr void const *)
+ current_mram_block_addr_TSMean, cache_TSMean,
+ BLOCK_SIZE);
+ mram_read((__mram_ptr void const *)
+ current_mram_block_addr_TSSigma, cache_TSSigma,
+ BLOCK_SIZE);
+ current_mram_block_addr_TSMean += BLOCK_SIZE;
current_mram_block_addr_TSSigma += BLOCK_SIZE;
- for (uint32_t k = 0; k < (BLOCK_SIZE / sizeof(DTYPE)); k++)
- {
- distance = 2 * ((DTYPE) query_length - (cache_dotprods[k] - (DTYPE) query_length * cache_TSMean[k]
- * query_mean) / (cache_TSSigma[k] * query_std));
-
- if(distance < min_distance)
- {
- min_distance = distance;
- min_index = i + k;
+ for (uint32_t k = 0; k < (BLOCK_SIZE / sizeof(DTYPE)); k++) {
+ distance =
+ 2 * ((DTYPE) query_length -
+ (cache_dotprods[k] -
+ (DTYPE) query_length * cache_TSMean[k]
+ * query_mean) / (cache_TSSigma[k] *
+ query_std));
+
+ if (distance < min_distance) {
+ min_distance = distance;
+ min_index = i + k;
}
}
}
diff --git a/TS/host/app.c b/TS/host/app.c
index b9faa9c..a19232b 100644
--- a/TS/host/app.c
+++ b/TS/host/app.c
@@ -31,23 +31,23 @@
#define MAX_DATA_VAL 127
static DTYPE tSeries[1 << 26];
-static DTYPE query [1 << 15];
-static DTYPE AMean [1 << 26];
-static DTYPE ASigma [1 << 26];
+static DTYPE query[1 << 15];
+static DTYPE AMean[1 << 26];
+static DTYPE ASigma[1 << 26];
static DTYPE minHost;
static DTYPE minHostIdx;
// Create input arrays
-static DTYPE *create_test_file(unsigned int ts_elements, unsigned int query_elements) {
+static DTYPE *create_test_file(unsigned int ts_elements,
+ unsigned int query_elements)
+{
srand(0);
- for (uint64_t i = 0; i < ts_elements; i++)
- {
+ for (uint64_t i = 0; i < ts_elements; i++) {
tSeries[i] = i % MAX_DATA_VAL;
}
- for (uint64_t i = 0; i < query_elements; i++)
- {
+ for (uint64_t i = 0; i < query_elements; i++) {
query[i] = i % MAX_DATA_VAL;
}
@@ -55,61 +55,62 @@ static DTYPE *create_test_file(unsigned int ts_elements, unsigned int query_elem
}
// Compute output in the host
-static void streamp(DTYPE* tSeries, DTYPE* AMean, DTYPE* ASigma, int ProfileLength,
- DTYPE* query, int queryLength, DTYPE queryMean, DTYPE queryStdDeviation)
+static void streamp(DTYPE *tSeries, DTYPE *AMean, DTYPE *ASigma,
+ int ProfileLength, DTYPE *query, int queryLength,
+ DTYPE queryMean, DTYPE queryStdDeviation)
{
DTYPE distance;
DTYPE dotprod;
- minHost = INT32_MAX;
+ minHost = INT32_MAX;
minHostIdx = 0;
- for (int subseq = 0; subseq < ProfileLength; subseq++)
- {
+ for (int subseq = 0; subseq < ProfileLength; subseq++) {
dotprod = 0;
- for(int j = 0; j < queryLength; j++)
- {
+ for (int j = 0; j < queryLength; j++) {
dotprod += tSeries[j + subseq] * query[j];
}
- distance = 2 * (queryLength - (dotprod - queryLength * AMean[subseq]
- * queryMean) / (ASigma[subseq] * queryStdDeviation));
+ distance =
+ 2 * (queryLength - (dotprod - queryLength * AMean[subseq]
+ * queryMean) / (ASigma[subseq] *
+ queryStdDeviation));
- if(distance < minHost)
- {
+ if (distance < minHost) {
minHost = distance;
minHostIdx = subseq;
}
}
}
-static void compute_ts_statistics(unsigned int timeSeriesLength, unsigned int ProfileLength, unsigned int queryLength)
+static void compute_ts_statistics(unsigned int timeSeriesLength,
+ unsigned int ProfileLength,
+ unsigned int queryLength)
{
- double* ACumSum = malloc(sizeof(double) * timeSeriesLength);
+ double *ACumSum = malloc(sizeof(double) * timeSeriesLength);
ACumSum[0] = tSeries[0];
for (uint64_t i = 1; i < timeSeriesLength; i++)
ACumSum[i] = tSeries[i] + ACumSum[i - 1];
- double* ASqCumSum = malloc(sizeof(double) * timeSeriesLength);
+ double *ASqCumSum = malloc(sizeof(double) * timeSeriesLength);
ASqCumSum[0] = tSeries[0] * tSeries[0];
for (uint64_t i = 1; i < timeSeriesLength; i++)
ASqCumSum[i] = tSeries[i] * tSeries[i] + ASqCumSum[i - 1];
- double* ASum = malloc(sizeof(double) * ProfileLength);
+ double *ASum = malloc(sizeof(double) * ProfileLength);
ASum[0] = ACumSum[queryLength - 1];
for (uint64_t i = 0; i < timeSeriesLength - queryLength; i++)
ASum[i + 1] = ACumSum[queryLength + i] - ACumSum[i];
- double* ASumSq = malloc(sizeof(double) * ProfileLength);
+ double *ASumSq = malloc(sizeof(double) * ProfileLength);
ASumSq[0] = ASqCumSum[queryLength - 1];
for (uint64_t i = 0; i < timeSeriesLength - queryLength; i++)
ASumSq[i + 1] = ASqCumSum[queryLength + i] - ASqCumSum[i];
- double * AMean_tmp = malloc(sizeof(double) * ProfileLength);
+ double *AMean_tmp = malloc(sizeof(double) * ProfileLength);
for (uint64_t i = 0; i < ProfileLength; i++)
AMean_tmp[i] = ASum[i] / queryLength;
- double* ASigmaSq = malloc(sizeof(double) * ProfileLength);
+ double *ASigmaSq = malloc(sizeof(double) * ProfileLength);
for (uint64_t i = 0; i < ProfileLength; i++)
ASigmaSq[i] = ASumSq[i] / queryLength - AMean[i] * AMean[i];
- for (uint64_t i = 0; i < ProfileLength; i++)
- {
+ for (uint64_t i = 0; i < ProfileLength; i++) {
ASigma[i] = sqrt(ASigmaSq[i]);
- AMean[i] = (DTYPE) AMean_tmp[i];
+ AMean[i] = (DTYPE) AMean_tmp[i];
}
free(ACumSum);
@@ -121,7 +122,8 @@ static void compute_ts_statistics(unsigned int timeSeriesLength, unsigned int Pr
}
// Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
// Timer declaration
Timer timer;
@@ -129,22 +131,22 @@ int main(int argc, char **argv) {
struct Params p = input_params(argc, argv);
struct dpu_set_t dpu_set, dpu;
uint32_t nr_of_dpus;
- uint32_t nr_of_ranks;
+ uint32_t nr_of_ranks;
- // Allocate DPUs and load binary
+ // Allocate DPUs and load binary
#if !WITH_ALLOC_OVERHEAD
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- timer.time[0] = 0; // alloc
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ timer.time[0] = 0; // alloc
#endif
#if !WITH_LOAD_OVERHEAD
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
- timer.time[1] = 0; // load
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
+ timer.time[1] = 0; // load
#endif
#if !WITH_FREE_OVERHEAD
- timer.time[6] = 0; // free
+ timer.time[6] = 0; // free
#endif
#if ENERGY
@@ -152,12 +154,15 @@ int main(int argc, char **argv) {
DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif
- unsigned long int ts_size = p.input_size_n;
+ unsigned long int ts_size = p.input_size_n;
const unsigned int query_length = p.input_size_m;
// Size adjustment
- if(ts_size % (NR_DPUS * NR_TASKLETS*query_length))
- ts_size = ts_size + (NR_DPUS * NR_TASKLETS * query_length - ts_size % (NR_DPUS * NR_TASKLETS*query_length));
+ if (ts_size % (NR_DPUS * NR_TASKLETS * query_length))
+ ts_size =
+ ts_size + (NR_DPUS * NR_TASKLETS * query_length -
+ ts_size % (NR_DPUS * NR_TASKLETS *
+ query_length));
// Create an input file with arbitrary data
create_test_file(ts_size, query_length);
@@ -165,30 +170,34 @@ int main(int argc, char **argv) {
DTYPE query_mean;
double queryMean = 0;
- for(unsigned i = 0; i < query_length; i++) queryMean += query[i];
- queryMean /= (double) query_length;
+ for (unsigned i = 0; i < query_length; i++)
+ queryMean += query[i];
+ queryMean /= (double)query_length;
query_mean = (DTYPE) queryMean;
DTYPE query_std;
double queryStdDeviation;
double queryVariance = 0;
- for(unsigned i = 0; i < query_length; i++)
- {
- queryVariance += (query[i] - queryMean) * (query[i] - queryMean);
+ for (unsigned i = 0; i < query_length; i++) {
+ queryVariance +=
+ (query[i] - queryMean) * (query[i] - queryMean);
}
- queryVariance /= (double) query_length;
+ queryVariance /= (double)query_length;
queryStdDeviation = sqrt(queryVariance);
query_std = (DTYPE) queryStdDeviation;
- DTYPE *bufferTS = tSeries;
- DTYPE *bufferQ = query;
- DTYPE *bufferAMean = AMean;
+ DTYPE *bufferTS = tSeries;
+ DTYPE *bufferQ = query;
+ DTYPE *bufferAMean = AMean;
DTYPE *bufferASigma = ASigma;
uint32_t slice_per_dpu = ts_size / NR_DPUS;
unsigned int kernel = 0;
- dpu_arguments_t input_arguments = {ts_size, query_length, query_mean, query_std, slice_per_dpu, 0, kernel};
+ dpu_arguments_t input_arguments =
+ { ts_size, query_length, query_mean, query_std, slice_per_dpu, 0,
+ kernel
+ };
uint32_t mem_offset;
dpu_result_t result;
@@ -200,20 +209,20 @@ int main(int argc, char **argv) {
for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
#if WITH_ALLOC_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 0, 0);
}
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 0);
}
#endif
#if WITH_LOAD_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 1, 0);
}
DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 1);
}
DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
@@ -229,54 +238,72 @@ int main(int argc, char **argv) {
DPU_FOREACH(dpu_set, dpu) {
input_arguments.exclusion_zone = 0;
- DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGUMENTS", 0, (const void *) &input_arguments, sizeof(input_arguments)));
+ DPU_ASSERT(dpu_copy_to
+ (dpu, "DPU_INPUT_ARGUMENTS", 0,
+ (const void *)&input_arguments,
+ sizeof(input_arguments)));
i++;
}
i = 0;
mem_offset = 0;
- DPU_FOREACH(dpu_set, dpu, i)
- {
+ DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, bufferQ));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, query_length * sizeof(DTYPE), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ query_length * sizeof(DTYPE), DPU_XFER_DEFAULT));
i = 0;
mem_offset += query_length * sizeof(DTYPE);
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, bufferTS + slice_per_dpu * i));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, bufferTS + slice_per_dpu * i));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mem_offset,(slice_per_dpu + query_length)*sizeof(DTYPE), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, mem_offset,
+ (slice_per_dpu + query_length) * sizeof(DTYPE),
+ DPU_XFER_DEFAULT));
mem_offset += ((slice_per_dpu + query_length) * sizeof(DTYPE));
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, bufferAMean + slice_per_dpu * i));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, bufferAMean + slice_per_dpu * i));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mem_offset, (slice_per_dpu + query_length)*sizeof(DTYPE), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, mem_offset,
+ (slice_per_dpu + query_length) * sizeof(DTYPE),
+ DPU_XFER_DEFAULT));
i = 0;
mem_offset += ((slice_per_dpu + query_length) * sizeof(DTYPE));
DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, bufferASigma + slice_per_dpu * i));
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, bufferASigma + slice_per_dpu * i));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mem_offset, (slice_per_dpu + query_length)*sizeof(DTYPE), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, mem_offset,
+ (slice_per_dpu + query_length) * sizeof(DTYPE),
+ DPU_XFER_DEFAULT));
if (rep >= p.n_warmup) {
stop(&timer, 2);
}
-
// Run kernel on DPUs
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
start(&timer, 3, 0);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
@@ -285,37 +312,49 @@ int main(int argc, char **argv) {
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- if (rep >= p.n_warmup)
- {
+ if (rep >= p.n_warmup) {
stop(&timer, 3);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
#endif
}
- dpu_result_t* results_retrieve[NR_DPUS];
+ dpu_result_t *results_retrieve[NR_DPUS];
if (rep >= p.n_warmup) {
start(&timer, 4, 0);
}
DPU_FOREACH(dpu_set, dpu, i) {
- results_retrieve[i] = (dpu_result_t*)malloc(NR_TASKLETS * sizeof(dpu_result_t));
+ results_retrieve[i] =
+ (dpu_result_t *) malloc(NR_TASKLETS *
+ sizeof(dpu_result_t));
}
-
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i]));
}
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_result_t), DPU_XFER_DEFAULT));
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0,
+ NR_TASKLETS * sizeof(dpu_result_t),
+ DPU_XFER_DEFAULT));
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
- for (unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++) {
- if(results_retrieve[i][each_tasklet].minValue < result.minValue && results_retrieve[i][each_tasklet].minValue > 0)
- {
- result.minValue = results_retrieve[i][each_tasklet].minValue;
- result.minIndex = (DTYPE)results_retrieve[i][each_tasklet].minIndex + (i * slice_per_dpu);
+ for (unsigned int each_tasklet = 0;
+ each_tasklet < NR_TASKLETS; each_tasklet++) {
+ if (results_retrieve[i][each_tasklet].minValue <
+ result.minValue
+ &&
+ results_retrieve[i][each_tasklet].minValue >
+ 0) {
+ result.minValue =
+ results_retrieve[i]
+ [each_tasklet].minValue;
+ result.minIndex = (DTYPE)
+ results_retrieve[i]
+ [each_tasklet].minIndex +
+ (i * slice_per_dpu);
}
}
@@ -323,11 +362,9 @@ int main(int argc, char **argv) {
i++;
}
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 4);
}
-
-
#if PRINT
printf("LOGS\n");
DPU_FOREACH(dpu_set, dpu) {
@@ -337,13 +374,13 @@ int main(int argc, char **argv) {
#if WITH_ALLOC_OVERHEAD
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
start(&timer, 5, 0);
}
#endif
DPU_ASSERT(dpu_free(dpu_set));
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) {
stop(&timer, 5);
}
#endif
@@ -352,52 +389,83 @@ int main(int argc, char **argv) {
if (rep >= p.n_warmup) {
start(&timer, 6, 0);
}
- streamp(tSeries, AMean, ASigma, ts_size - query_length - 1, query, query_length, query_mean, query_std);
- if(rep >= p.n_warmup) {
+ streamp(tSeries, AMean, ASigma, ts_size - query_length - 1,
+ query, query_length, query_mean, query_std);
+ if (rep >= p.n_warmup) {
stop(&timer, 6);
}
int status = (minHost == result.minValue);
if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n");
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+ "] results are equal\n");
if (rep >= p.n_warmup) {
- printf("[::] TS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu",
- NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(DTYPE), BLOCK_SIZE, ts_size);
- printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d ",
- WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD);
- printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f latency_cpu_us=%f ",
- timer.time[0], // alloc
- timer.time[1], // load
- timer.time[2], // write
- timer.time[3], // kernel
- timer.time[4], // read
- timer.time[5], // free
- timer.time[6]); // CPU
- printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
- ts_size * sizeof(DTYPE) / timer.time[6],
- ts_size * sizeof(DTYPE) / (timer.time[3]),
- ts_size * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5]));
- printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
- ts_size * sizeof(DTYPE) / (timer.time[2] + timer.time[3] + timer.time[4]),
- ts_size * sizeof(DTYPE) / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]),
- ts_size * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
- printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
- ts_size / timer.time[6],
- ts_size / (timer.time[3]),
- ts_size / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5]));
- printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
- ts_size / (timer.time[2] + timer.time[3] + timer.time[4]),
- ts_size / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]),
- ts_size / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
+ printf
+ ("[::] TS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu",
+ NR_DPUS, nr_of_ranks, NR_TASKLETS,
+ XSTR(DTYPE), BLOCK_SIZE, ts_size);
+ printf
+ (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d ",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+ WITH_FREE_OVERHEAD);
+ printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f latency_cpu_us=%f ", timer.time[0], // alloc
+ timer.time[1], // load
+ timer.time[2], // write
+ timer.time[3], // kernel
+ timer.time[4], // read
+ timer.time[5], // free
+ timer.time[6]); // CPU
+ printf
+ (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+ ts_size * sizeof(DTYPE) / timer.time[6],
+ ts_size * sizeof(DTYPE) / (timer.time[3]),
+ ts_size * sizeof(DTYPE) / (timer.time[0] +
+ timer.time[1] +
+ timer.time[2] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]));
+ printf
+ (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+ ts_size * sizeof(DTYPE) / (timer.time[2] +
+ timer.time[3] +
+ timer.time[4]),
+ ts_size * sizeof(DTYPE) / (timer.time[1] +
+ timer.time[2] +
+ timer.time[3] +
+ timer.time[4]),
+ ts_size * sizeof(DTYPE) / (timer.time[0] +
+ timer.time[1] +
+ timer.time[2] +
+ timer.time[3] +
+ timer.time[4]));
+ printf
+ (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+ ts_size / timer.time[6],
+ ts_size / (timer.time[3]),
+ ts_size / (timer.time[0] + timer.time[1] +
+ timer.time[2] + timer.time[3] +
+ timer.time[4] + timer.time[5]));
+ printf
+ (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+ ts_size / (timer.time[2] + timer.time[3] +
+ timer.time[4]),
+ ts_size / (timer.time[1] + timer.time[2] +
+ timer.time[3] + timer.time[4]),
+ ts_size / (timer.time[0] + timer.time[1] +
+ timer.time[2] + timer.time[3] +
+ timer.time[4]));
}
} else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] results differ!\n");
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+ "] results differ!\n");
}
}
#if ENERGY
double acc_energy, avg_energy, acc_time, avg_time;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
+ DPU_ASSERT(dpu_probe_get
+ (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
@@ -407,7 +475,6 @@ int main(int argc, char **argv) {
printf("Energy (J): %f J\t", avg_energy);
#endif
-
#if !WITH_ALLOC_OVERHEAD
DPU_ASSERT(dpu_free(dpu_set));
#endif
diff --git a/TS/support/common.h b/TS/support/common.h
index b120bb1..7585b90 100755
--- a/TS/support/common.h
+++ b/TS/support/common.h
@@ -14,30 +14,30 @@
#define DTYPE int32_t
#define DTYPE_MAX INT32_MAX
-typedef struct {
+typedef struct {
uint32_t ts_length;
- uint32_t query_length;
- DTYPE query_mean;
- DTYPE query_std;
- uint32_t slice_per_dpu;
- int32_t exclusion_zone;
- enum kernels {
+ uint32_t query_length;
+ DTYPE query_mean;
+ DTYPE query_std;
+ uint32_t slice_per_dpu;
+ int32_t exclusion_zone;
+ enum kernels {
kernel1 = 0,
nr_kernels = 1,
} kernel;
-}dpu_arguments_t;
+} dpu_arguments_t;
-typedef struct {
- DTYPE minValue;
- uint32_t minIndex;
- DTYPE maxValue;
- uint32_t maxIndex;
-}dpu_result_t;
+typedef struct {
+ DTYPE minValue;
+ uint32_t minIndex;
+ DTYPE maxValue;
+ uint32_t maxIndex;
+} dpu_result_t;
#ifndef ENERGY
#define ENERGY 0
#endif
-#define PRINT 0
+#define PRINT 0
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
diff --git a/TS/support/params.h b/TS/support/params.h
index 4668604..b7d9763 100644
--- a/TS/support/params.h
+++ b/TS/support/params.h
@@ -5,54 +5,63 @@
// Params ---------------------------------------------------------------------
typedef struct Params {
- unsigned long input_size_n;
- unsigned long input_size_m;
- int n_warmup;
- int n_reps;
-}Params;
+ unsigned long input_size_n;
+ unsigned long input_size_m;
+ int n_warmup;
+ int n_reps;
+} Params;
-void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -n <n> n (TS length. Default=64K elements)"
- "\n -m <m> m (Query length. Default=256 elements)"
- "\n");
- }
+void usage()
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -w <W> # of untimed warmup iterations (default=1)"
+ "\n -e <E> # of timed repetition iterations (default=3)"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -n <n> n (TS length. Default=64K elements)"
+ "\n -m <m> m (Query length. Default=256 elements)" "\n");
+}
- struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.input_size_n = 1 << 16;
- p.input_size_m = 1 << 8;
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.input_size_n = 1 << 16;
+ p.input_size_m = 1 << 8;
- p.n_warmup = 1;
- p.n_reps = 3;
+ p.n_warmup = 1;
+ p.n_reps = 3;
- int opt;
- while((opt = getopt(argc, argv, "hw:e:n:m:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- case 'n': p.input_size_n = atol(optarg); break;
- case 'm': p.input_size_m = atol(optarg); break;
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(NR_DPUS > 0 && "Invalid # of dpus!");
+ int opt;
+ while ((opt = getopt(argc, argv, "hw:e:n:m:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ case 'n':
+ p.input_size_n = atol(optarg);
+ break;
+ case 'm':
+ p.input_size_m = atol(optarg);
+ break;
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+ assert(NR_DPUS > 0 && "Invalid # of dpus!");
- return p;
- }
+ return p;
+}
#endif
diff --git a/TS/support/timer.h b/TS/support/timer.h
index ff1ae1b..c569de7 100755
--- a/TS/support/timer.h
+++ b/TS/support/timer.h
@@ -1,66 +1,74 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer { // Seven independent stopwatch slots, indexed 0..6
+
+ struct timeval startTime[7]; // gettimeofday() reading taken by start()
+ struct timeval stopTime[7]; // gettimeofday() reading taken by stop()
+ double time[7]; // elapsed time accumulated by stop(), in microseconds
+
+} Timer;
+
+void start(Timer *timer, int i, int rep) // Begin a measurement in slot i; i is used as an array index without a range check.
+{
+ if (rep == 0) { // a rep of 0 discards whatever slot i had accumulated so far
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i) // End the measurement in slot i and add the elapsed microseconds to timer->time[i].
+{
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] += // accumulates, so repeated start/stop pairs sum up unless start() reset the slot
+ (timer->stopTime[i].tv_sec -
+ timer->startTime[i].tv_sec) * 1000000.0 + // whole seconds converted to microseconds
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); // microsecond remainder; may be negative, the seconds term compensates
+}
+
+void print(Timer *timer, int i, int REP) // Print slot i divided by REP, followed by a tab, to stdout.
+{
+ printf("%f\t", timer->time[i] / (1000 * REP)); // microseconds / 1000 gives milliseconds; REP == 0 would divide by zero
+}
+
+void printall(Timer *timer, int maxt) // Print the raw accumulated value of slots 0..maxt on one line.
+{
+ for (int i = 0; i <= maxt; i++) { // inclusive bound: maxt must not exceed 6, the last slot of Timer
+ printf(" timer%d_us=%f", i, timer->time[i]);
+ }
+ printf("\n");
+}
diff --git a/UNI/baselines/cpu/Makefile b/UNI/baselines/cpu/Makefile
index ec3f403..bbf9db0 100644
--- a/UNI/baselines/cpu/Makefile
+++ b/UNI/baselines/cpu/Makefile
@@ -4,7 +4,7 @@ all: uni
TYPE ?= int64_t
uni: app_baseline.c
- gcc -O2 -o uni -fopenmp -DT=${TYPE} app_baseline.c
+ gcc -Wall -Wextra -pedantic -march=native -O2 -o uni -fopenmp -DT=${TYPE} app_baseline.c
uni_O0: app_baseline.c
gcc -o uni_O0 -fopenmp app_baseline.c
diff --git a/VA/baselines/cpu/Makefile b/VA/baselines/cpu/Makefile
index 76a82e1..04aacb6 100644
--- a/VA/baselines/cpu/Makefile
+++ b/VA/baselines/cpu/Makefile
@@ -1,9 +1,23 @@
-NUMA ?= 0
-NUMA_MEMCPY ?= 0
-FLAGS =
+benchmark ?= 1
+debug ?= 0
+native ?= 1
+nop_sync ?= 0
+numa ?= 0
+numa_memcpy ?= 0
+
+CFLAGS =
+LDFLAGS =
+
+ifeq (${debug}, 1)
+ CFLAGS += -g
+endif
+
+ifeq (${native}, 1)
+ CFLAGS += -march=native
+endif
-ifeq (${NUMA}, 1)
- FLAGS += -lnuma
+ifeq (${numa}, 1)
+ LDFLAGS += -lnuma
endif
.PHONY: all
@@ -12,7 +26,7 @@ all: va
TYPE ?= int32_t
va: app_baseline.c
- gcc -Wall -Wextra -pedantic -march=native -O2 -o va -fopenmp -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} -DT=${TYPE} app_baseline.c ${FLAGS}
+ gcc -Wall -Wextra -pedantic ${CFLAGS} -O3 -o va -fopenmp -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -DT=${TYPE} app_baseline.c ${LDFLAGS}
va_O0: app_baseline.c
gcc -o va_O0 -fopenmp app_baseline.c
diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c
index 4c8610a..7975200 100644
--- a/VA/baselines/cpu/app_baseline.c
+++ b/VA/baselines/cpu/app_baseline.c
@@ -13,13 +13,19 @@
#include <stdint.h>
#include <omp.h>
+
+#if WITH_BENCHMARK // set from the Makefile via -DWITH_BENCHMARK=${benchmark}
#include "../../support/timer.h"
+#else // timing disabled: every start(...) / stop(...) call expands to nothing, so no Timer object is needed
+#define start(...)
+#define stop(...)
+#endif
#if NUMA
#include <numaif.h>
#include <numa.h>
-void* mp_pages[1];
+void *mp_pages[1];
int mp_status[1];
int mp_nodes[1];
int numa_node_in = -1;
@@ -49,301 +55,345 @@ static T *B_local;
/**
* @brief compute output in the host
*/
-static void vector_addition_host(unsigned int nr_elements, int t) {
- omp_set_num_threads(t);
- #pragma omp parallel for
- for (int i = 0; i < nr_elements; i++) {
+static void vector_addition_host(unsigned long nr_elements, int t) // Element-wise C = A + B over the file-scope arrays, using t OpenMP threads.
+{
+ omp_set_num_threads(t);
+#pragma omp parallel for
+ for (long i = 0; i < nr_elements; i++) { // NOTE(review): signed i is compared against unsigned nr_elements
#if NUMA_MEMCPY
- C[i] = A_local[i] + B_local[i];
+ C[i] = A_local[i] + B_local[i]; // NUMA_MEMCPY builds read the (possibly copied) local input buffers
#else
- C[i] = A[i] + B[i];
+ C[i] = A[i] + B[i];
#endif
- }
+ }
}
// Params ---------------------------------------------------------------------
typedef struct Params {
- int input_size;
- int n_warmup;
- int n_reps;
- int exp;
- int n_threads;
+ long input_size; // element count from -i; widened from int by this change
+ int n_warmup; // -w: iterations whose results are not printed
+ int n_reps; // -e: iterations whose results are printed
+ int exp; // -x: 0 makes main() multiply input_size by n_threads, any other value uses input_size as given
+ int n_threads; // -t: thread count handed to omp_set_num_threads()
#if NUMA
- struct bitmask* bitmask_in;
- struct bitmask* bitmask_out;
- int numa_node_cpu;
+ struct bitmask *bitmask_in; // -a: node mask bound before A and B are allocated
+ struct bitmask *bitmask_out; // -b: node mask bound before C is allocated
+ int numa_node_cpu; // -c: node passed to numa_run_on_node(); -1 means "not set"
#endif
#if NUMA_MEMCPY
- int numa_node_cpu_memcpy;
- struct bitmask* bitmask_cpu;
+ int numa_node_cpu_memcpy; // -M: node to run on while copying inputs; -1 means "not set"
+ struct bitmask *bitmask_cpu; // -C: node mask bound before the local input copies are allocated
#endif
-}Params;
-
-void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -t <T> # of threads (default=8)"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n -x <X> Weak (0) or strong (1) scaling (default=0)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -i <I> input size (default=8M elements)"
- "\n");
+} Params;
+
+void usage() // Write the option summary to stderr; it does not exit.
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -t <T> # of threads (default=8)" // NOTE(review): input_params() actually defaults n_threads to 5
+ "\n -w <W> # of untimed warmup iterations (default=1)"
+ "\n -e <E> # of timed repetition iterations (default=3)"
+ "\n -x <X> Weak (0) or strong (1) scaling (default=0)" // NOTE(review): input_params() actually defaults exp to 1
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -i <I> input size (default=8M elements)" "\n"); // NOTE(review): input_params() actually defaults input_size to 16777216
}
-struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.input_size = 16777216;
- p.n_warmup = 1;
- p.n_reps = 3;
- p.exp = 1;
- p.n_threads = 5;
+struct Params input_params(int argc, char **argv) // Returns a Params holding the defaults below, overridden by the options found in argv.
+{
+ struct Params p;
+ p.input_size = 16777216; // 16M elements
+ p.n_warmup = 1;
+ p.n_reps = 3;
+ p.exp = 1;
+ p.n_threads = 5;
#if NUMA
- p.bitmask_in = NULL;
- p.bitmask_out = NULL;
- p.numa_node_cpu = -1;
+ p.bitmask_in = NULL; // NULL / -1 mean "option not given"; main() tests for these before acting
+ p.bitmask_out = NULL;
+ p.numa_node_cpu = -1;
#endif
#if NUMA_MEMCPY
- p.numa_node_cpu_memcpy = -1;
- p.bitmask_cpu = NULL;
+ p.numa_node_cpu_memcpy = -1;
+ p.bitmask_cpu = NULL;
#endif
- int opt;
- while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'i': p.input_size = atoi(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- case 'x': p.exp = atoi(optarg); break;
- case 't': p.n_threads = atoi(optarg); break;
+ int opt;
+ while ((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) { // a, b, c, C and M are always accepted by getopt but only handled in NUMA builds (see the #if below)
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break; // not reached: exit() does not return
+ case 'i':
+ p.input_size = atol(optarg); // atol matches the long field; the removed code used atoi
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ case 'x':
+ p.exp = atoi(optarg);
+ break;
+ case 't':
+ p.n_threads = atoi(optarg);
+ break;
#if NUMA
- case 'a': p.bitmask_in = numa_parse_nodestring(optarg); break;
- case 'b': p.bitmask_out = numa_parse_nodestring(optarg); break;
- case 'c': p.numa_node_cpu = atoi(optarg); break;
+ case 'a':
+ p.bitmask_in = numa_parse_nodestring(optarg);
+ break;
+ case 'b':
+ p.bitmask_out = numa_parse_nodestring(optarg);
+ break;
+ case 'c':
+ p.numa_node_cpu = atoi(optarg);
+ break;
#if NUMA_MEMCPY
- case 'C': p.bitmask_cpu = numa_parse_nodestring(optarg); break;
- case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break;
-#endif // NUMA_MEMCPY
-#endif // NUMA
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(p.n_threads > 0 && "Invalid # of ranks!");
-
- return p;
+ case 'C':
+ p.bitmask_cpu = numa_parse_nodestring(optarg);
+ break;
+ case 'M':
+ p.numa_node_cpu_memcpy = atoi(optarg);
+ break;
+#endif // NUMA_MEMCPY
+#endif // NUMA
+ default: // also reached for -a/-b/-c/-C/-M when the matching #if section is compiled out
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0); // NOTE(review): a rejected option still exits with status 0
+ }
+ }
+ assert(p.n_threads > 0 && "Invalid # of ranks!"); // NOTE(review): the message says "ranks" but the check is on the thread count
+
+ return p;
}
/**
* @brief Main of the Host Application.
*/
-int main(int argc, char **argv) {
+int main(int argc, char **argv) // CPU baseline driver: allocate and fill A and B, run the vector addition n_warmup + n_reps times, report each non-warmup run when WITH_BENCHMARK is set.
+{
- struct Params p = input_params(argc, argv);
+ struct Params p = input_params(argc, argv);
- const unsigned int input_size = p.exp == 0 ? p.input_size * p.n_threads : p.input_size;
+ const unsigned long input_size =
+ p.exp == 0 ? p.input_size * p.n_threads : p.input_size; // exp == 0 scales the element count with the thread count
- // Create an input file with arbitrary data.
+ // Create an input file with arbitrary data.
/**
* @brief creates a "test file" by filling a buffer of 64MB with pseudo-random values
* @param nr_elements how many 32-bit elements we want the file to be
* @return the buffer address
*/
- srand(0);
+ srand(0); // fixed seed: every run generates the same A and B
#if NUMA
- if (p.bitmask_in) {
- numa_set_membind(p.bitmask_in);
- numa_free_nodemask(p.bitmask_in);
- }
- A = (T*) numa_alloc(input_size * sizeof(T));
- B = (T*) numa_alloc(input_size * sizeof(T));
+ if (p.bitmask_in) { // bind the input allocations to the nodes given with -a
+ numa_set_membind(p.bitmask_in);
+ numa_free_nodemask(p.bitmask_in);
+ }
+ A = (T *) numa_alloc(input_size * sizeof(T));
+ B = (T *) numa_alloc(input_size * sizeof(T));
#else
- A = (T*) malloc(input_size * sizeof(T));
- B = (T*) malloc(input_size * sizeof(T));
+ A = (T *) malloc(input_size * sizeof(T)); // NOTE(review): none of the allocations in this function is checked for NULL before use
+ B = (T *) malloc(input_size * sizeof(T));
#endif
#if NUMA
- if (p.bitmask_out) {
- numa_set_membind(p.bitmask_out);
- numa_free_nodemask(p.bitmask_out);
- }
- C = (T*) numa_alloc(input_size * sizeof(T));
+ if (p.bitmask_out) { // bind the output allocation to the nodes given with -b
+ numa_set_membind(p.bitmask_out);
+ numa_free_nodemask(p.bitmask_out);
+ }
+ C = (T *) numa_alloc(input_size * sizeof(T));
#else
- C = (T*) malloc(input_size * sizeof(T));
+ C = (T *) malloc(input_size * sizeof(T));
#endif
- for (unsigned int i = 0; i < input_size; i++) {
- A[i] = (T) (rand());
- B[i] = (T) (rand());
- }
+ for (unsigned long i = 0; i < input_size; i++) {
+ A[i] = (T) (rand());
+ B[i] = (T) (rand());
+ }
#if NUMA
#if NUMA_MEMCPY
- if (p.bitmask_cpu) {
- numa_set_membind(p.bitmask_cpu);
- numa_free_nodemask(p.bitmask_cpu);
- }
+ if (p.bitmask_cpu) { // later allocations (the local input copies) follow the -C node mask
+ numa_set_membind(p.bitmask_cpu);
+ numa_free_nodemask(p.bitmask_cpu);
+ }
#else
- struct bitmask *bitmask_all = numa_allocate_nodemask();
- numa_bitmask_setall(bitmask_all);
- numa_set_membind(bitmask_all);
- numa_free_nodemask(bitmask_all);
-#endif // NUMA_MEMCPY
-#endif // NUMA
+ struct bitmask *bitmask_all = numa_allocate_nodemask(); // lift the earlier binding: allow allocation on every node again
+ numa_bitmask_setall(bitmask_all);
+ numa_set_membind(bitmask_all);
+ numa_free_nodemask(bitmask_all);
+#endif // NUMA_MEMCPY
+#endif // NUMA
#if NUMA
- mp_pages[0] = A;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_in = mp_status[0];
- }
-
- mp_pages[0] = C;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(C)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_out = mp_status[0];
- }
-
- numa_node_cpu = p.numa_node_cpu;
- if (p.numa_node_cpu != -1) {
- if (numa_run_on_node(p.numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
+ mp_pages[0] = A;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { // with a NULL node list move_pages() moves nothing and reports the node currently holding the page
+ perror("move_pages(A)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_in = mp_status[0]; // node of the first page of A only
+ }
+
+ mp_pages[0] = C;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(C)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_out = mp_status[0]; // node of the first page of C only
+ }
+
+ numa_node_cpu = p.numa_node_cpu;
+ if (p.numa_node_cpu != -1) {
+ if (numa_run_on_node(p.numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1; // pinning failed: report the CPU node as unknown
+ }
+ }
#endif
#if NUMA_MEMCPY
- numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1;
+ numa_node_in_is_local = ((numa_node_cpu == numa_node_in)
+ || (numa_node_cpu + 8 == numa_node_in)) * 1; // NOTE(review): the "+ 8" offset is not explained here; presumably platform-specific node numbering, confirm
+#endif
+
+#if WITH_BENCHMARK
+ Timer timer; // only declared when timing is compiled in; otherwise start()/stop() are empty macros
#endif
- Timer timer;
+#if NOP_SYNC
+ for (int rep = 0; rep < 200000; rep++) { // 200000 nop instructions before the measured loop; presumably a marker for external tracing tools, confirm
+ asm volatile ("nop"::);
+ }
+#endif
- for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+ for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
#if NUMA_MEMCPY
- numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
- start(&timer, 1, 0);
- if (!numa_node_in_is_local) {
- A_local = (T*) numa_alloc(input_size * sizeof(T));
- B_local = (T*) numa_alloc(input_size * sizeof(T));
- }
- stop(&timer, 1);
- if (!numa_node_in_is_local) {
- if (p.numa_node_cpu_memcpy != -1) {
- if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu_memcpy = -1;
- }
- }
- }
- start(&timer, 2, 0);
- if (!numa_node_in_is_local) {
- memcpy(A_local, A, input_size * sizeof(T));
- memcpy(B_local, B, input_size * sizeof(T));
- } else {
- A_local = A;
- B_local = B;
- }
- stop(&timer, 2);
- if (p.numa_node_cpu != -1) {
- if (numa_run_on_node(p.numa_node_cpu) == -1) {
- perror("numa_run_on_node");
- numa_node_cpu = -1;
- }
- }
- mp_pages[0] = A_local;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(A_local)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_local = mp_status[0];
- }
+ numa_node_cpu_memcpy = p.numa_node_cpu_memcpy;
+ start(&timer, 1, 0); // slot 1: allocation of the local input copies
+ if (!numa_node_in_is_local) {
+ A_local = (T *) numa_alloc(input_size * sizeof(T));
+ B_local = (T *) numa_alloc(input_size * sizeof(T));
+ }
+ stop(&timer, 1);
+ if (!numa_node_in_is_local) {
+ if (p.numa_node_cpu_memcpy != -1) { // optionally move to the -M node for the copy
+ if (numa_run_on_node(p.numa_node_cpu_memcpy) ==
+ -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu_memcpy = -1;
+ }
+ }
+ }
+ start(&timer, 2, 0); // slot 2: copying the inputs
+ if (!numa_node_in_is_local) {
+ memcpy(A_local, A, input_size * sizeof(T));
+ memcpy(B_local, B, input_size * sizeof(T));
+ } else {
+ A_local = A; // inputs already local: alias them instead of copying
+ B_local = B;
+ }
+ stop(&timer, 2);
+ if (p.numa_node_cpu != -1) { // return to the compute node chosen with -c
+ if (numa_run_on_node(p.numa_node_cpu) == -1) {
+ perror("numa_run_on_node");
+ numa_node_cpu = -1;
+ }
+ }
+ mp_pages[0] = A_local;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(A_local)");
+ } else if (mp_status[0] < 0) {
+ printf("move_pages error: %d", mp_status[0]);
+ } else {
+ numa_node_local = mp_status[0];
+ }
#endif
- start(&timer, 0, 0);
- vector_addition_host(input_size, p.n_threads);
- stop(&timer, 0);
+ start(&timer, 0, 0); // slot 0: the vector addition itself
+ vector_addition_host(input_size, p.n_threads);
+ stop(&timer, 0);
#if NUMA_MEMCPY
- start(&timer, 3, 0);
- if (!numa_node_in_is_local) {
- numa_free(A_local, input_size * sizeof(T));
- numa_free(B_local, input_size * sizeof(T));
- }
- stop(&timer, 3);
+ start(&timer, 3, 0); // slot 3: releasing the local input copies
+ if (!numa_node_in_is_local) {
+ numa_free(A_local, input_size * sizeof(T));
+ numa_free(B_local, input_size * sizeof(T));
+ }
+ stop(&timer, 3);
#endif
- unsigned int nr_threads = 0;
+#if WITH_BENCHMARK
+ unsigned int nr_threads = 0;
#pragma omp parallel
#pragma omp atomic
- nr_threads++;
+ nr_threads++; // every thread of the parallel region increments once, so this ends up as the actual team size
- if (rep >= p.n_warmup) {
+ if (rep >= p.n_warmup) { // warmup iterations run but are not reported
#if NUMA_MEMCPY
- printf("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d"
- " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
- " | throughput_MBps=%f",
- nr_threads, XSTR(T), input_size,
- numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
- input_size * 3 * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f",
- input_size / timer.time[0]);
- printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
- timer.time[0], timer.time[1], timer.time[2], timer.time[3],
- timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]);
+ printf
+ ("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld"
+ " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " | throughput_MBps=%f", nr_threads, XSTR(T),
+ input_size, numa_node_in, numa_node_local, // NOTE(review): nr_threads is unsigned int and input_size unsigned long, but the format uses %d and %ld
+ numa_node_out, numa_node_cpu, numa_node_cpu_memcpy,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
+ input_size * 3 * sizeof(T) / timer.time[0]); // three arrays of input_size elements are touched per run
+ printf(" throughput_MOpps=%f",
+ input_size / timer.time[0]);
+ printf
+ (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2],
+ timer.time[3],
+ timer.time[0] + timer.time[1] + timer.time[2] +
+ timer.time[3]);
#else
- printf("[::] VA-CPU | n_threads=%d e_type=%s n_elements=%d"
+ printf
+ ("[::] VA-CPU | n_threads=%d e_type=%s n_elements=%ld"
#if NUMA
- " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
+ " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d"
#endif
- " | throughput_MBps=%f",
- nr_threads, XSTR(T), input_size,
+ " | throughput_MBps=%f",
+ nr_threads, XSTR(T), input_size, // NOTE(review): nr_threads is unsigned int and input_size unsigned long, but the format uses %d and %ld
#if NUMA
- numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out),
+ numa_node_in, numa_node_out, numa_node_cpu,
+ numa_distance(numa_node_in, numa_node_cpu),
+ numa_distance(numa_node_cpu, numa_node_out),
+#endif
+ input_size * 3 * sizeof(T) / timer.time[0]); // three arrays of input_size elements are touched per run
+ printf(" throughput_MOpps=%f",
+ input_size / timer.time[0]);
+ printf(" latency_us=%f\n", timer.time[0]);
+#endif // NUMA_MEMCPY
+ }
+#endif // WITH_BENCHMARK
+ }
+
+#if NOP_SYNC
+ for (int rep = 0; rep < 200000; rep++) { // matching run of 200000 nop instructions after the measured loop
+ asm volatile ("nop"::);
+ }
#endif
- input_size * 3 * sizeof(T) / timer.time[0]);
- printf(" throughput_MOpps=%f",
- input_size / timer.time[0]);
- printf(" latency_us=%f\n",
- timer.time[0]);
-#endif // NUMA_MEMCPY
- }
- }
#if NUMA
- numa_free(A, input_size * sizeof(T));
- numa_free(B, input_size * sizeof(T));
- numa_free(C, input_size * sizeof(T));
+ numa_free(A, input_size * sizeof(T)); // numa_free() takes the same size that was passed to numa_alloc()
+ numa_free(B, input_size * sizeof(T));
+ numa_free(C, input_size * sizeof(T));
#else
- free(A);
- free(B);
- free(C);
+ free(A);
+ free(B);
+ free(C);
#endif
- return 0;
- }
+ return 0;
+}
diff --git a/VA/baselines/cpu/run-perf.sh b/VA/baselines/cpu/run-perf.sh
new file mode 100755
index 0000000..8075256
--- /dev/null
+++ b/VA/baselines/cpu/run-perf.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+# Force a rebuild of the CPU baseline with NUMA support before profiling.
+make -B numa=1
+# Record perf counters for a 1-thread and a 4-thread run; the event list is the first column of every non-comment line in perf-events.txt, joined with commas.
+perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./va -a 4 -b 4 -c 4 -t 1 -e 20 -w 0 -i 167772160
+perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./va -a 4 -b 4 -c 4 -t 4 -e 20 -w 0 -i 167772160
diff --git a/VA/baselines/cpu/run-ws.sh b/VA/baselines/cpu/run-ws.sh
new file mode 100755
index 0000000..ccc4993
--- /dev/null
+++ b/VA/baselines/cpu/run-ws.sh
@@ -0,0 +1,6 @@
+#!/bin/zsh
+
+make -B benchmark=0 debug=1 native=0 nop_sync=1 numa=1
+
+~/var/source/valgrind/vg-in-place --tool=ws --ws-file=t1.ws --ws-peak-detect=yes --ws-every=50000 --ws-track-locality=yes ./va -a 4 -b 4 -c 4 -t 1 -e 20 -w 0 -i 16777216
+~/var/source/valgrind/vg-in-place --tool=ws --ws-file=t4.ws --ws-peak-detect=yes --ws-every=50000 --ws-track-locality=yes ./va -a 4 -b 4 -c 4 -t 4 -e 20 -w 0 -i 16777216
diff --git a/VA/dpu/task.c b/VA/dpu/task.c
index bb41303..9622911 100644
--- a/VA/dpu/task.c
+++ b/VA/dpu/task.c
@@ -15,10 +15,11 @@
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// vector_addition: Computes the vector addition of a cached block
-static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size) {
- for (unsigned int i = 0; i < l_size; i++){
- bufferB[i] += bufferA[i];
- }
+static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size) // Adds l_size elements of bufferA into bufferB in place; bufferA is only read.
+{
+ for (unsigned int i = 0; i < l_size; i++) {
+ bufferB[i] += bufferA[i];
+ }
}
// Barrier
@@ -26,53 +27,67 @@ BARRIER_INIT(my_barrier, NR_TASKLETS);
extern int main_kernel1(void);
-int (*kernels[nr_kernels])(void) = {main_kernel1};
+int (*kernels[nr_kernels])(void) = { main_kernel1 }; // dispatch table of kernel entry points; main() indexes it with DPU_INPUT_ARGUMENTS.kernel
-int main(void) {
- // Kernel
- return kernels[DPU_INPUT_ARGUMENTS.kernel]();
+int main(void) // DPU entry point: runs the kernel selected in DPU_INPUT_ARGUMENTS and returns its result.
+{
+ // Dispatch through the kernel table; the index is used without a range check
+ return kernels[DPU_INPUT_ARGUMENTS.kernel] ();
}
// main_kernel1
-int main_kernel1() {
- unsigned int tasklet_id = me();
+int main_kernel1() // Vector-addition kernel: each tasklet processes its share of blocks, adding A into B in MRAM. Always returns 0.
+{
+ unsigned int tasklet_id = me();
#if PRINT
- printf("tasklet_id = %u\n", tasklet_id);
+ printf("tasklet_id = %u\n", tasklet_id);
#endif
- if (tasklet_id == 0){ // Initialize once the cycle counter
- mem_reset(); // Reset the heap
- }
- // Barrier
- barrier_wait(&my_barrier);
-
- uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; // Input size per DPU in bytes
- uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
-
- // Address of the current processing block in MRAM
- uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
- uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
- uint32_t mram_base_addr_B = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
-
- // Initialize a local cache to store the MRAM block
- T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
- T *cache_B = (T *) mem_alloc(BLOCK_SIZE);
-
- for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){
-
- // Bound checking
- uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;
-
- // Load cache with current MRAM block
- mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);
- mram_read((__mram_ptr void const*)(mram_base_addr_B + byte_index), cache_B, l_size_bytes);
-
- // Computer vector addition
- vector_addition(cache_B, cache_A, l_size_bytes >> DIV);
-
- // Write cache to current MRAM block
- mram_write(cache_B, (__mram_ptr void*)(mram_base_addr_B + byte_index), l_size_bytes);
-
- }
-
- return 0;
+ if (tasklet_id == 0) { // Done once, by tasklet 0 only
+ mem_reset(); // Reset the heap
+ }
+ // Barrier: no tasklet proceeds to mem_alloc() until the reset above has happened
+ barrier_wait(&my_barrier);
+
+ uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; // Input size per DPU in bytes
+ uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
+
+ // Address of the current processing block in MRAM
+ uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; // byte offset of this tasklet's first block, assuming BLOCK_SIZE == 1 << BLOCK_SIZE_LOG2
+ uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
+ uint32_t mram_base_addr_B =
+ (uint32_t) (DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer); // B starts transfer_size bytes after A
+
+ // Initialize a local cache to store the MRAM block
+ T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
+ T *cache_B = (T *) mem_alloc(BLOCK_SIZE);
+
+ for (unsigned int byte_index = base_tasklet;
+ byte_index < input_size_dpu_bytes;
+ byte_index += BLOCK_SIZE * NR_TASKLETS) { // consecutive blocks go to consecutive tasklets, so each tasklet advances by one block per tasklet
+
+ // Bound checking: the final block is clipped to the bytes that remain
+ uint32_t l_size_bytes =
+ (byte_index + BLOCK_SIZE >=
+ input_size_dpu_bytes) ? (input_size_dpu_bytes -
+ byte_index) : BLOCK_SIZE;
+
+ // Load cache with current MRAM block
+ mram_read((__mram_ptr void const *)(mram_base_addr_A +
+ byte_index), cache_A,
+ l_size_bytes);
+ mram_read((__mram_ptr void const *)(mram_base_addr_B +
+ byte_index), cache_B,
+ l_size_bytes);
+
+ // Compute vector addition
+ vector_addition(cache_B, cache_A, l_size_bytes >> DIV); // byte count shifted down to an element count; presumably DIV is log2(sizeof(T)), confirm
+
+ // Write cache to current MRAM block
+ mram_write(cache_B,
+ (__mram_ptr void *)(mram_base_addr_B + byte_index),
+ l_size_bytes); // the sum overwrites the B block in MRAM
+
+ }
+
+ return 0;
}
diff --git a/VA/host/app.c b/VA/host/app.c
index 5fe3f61..1a2cdfd 100644
--- a/VA/host/app.c
+++ b/VA/host/app.c
@@ -33,296 +33,361 @@
#include <dpu_target_macros.h>
// Pointer declaration
-static T* A;
-static T* B;
-static T* C;
-static T* C2;
+static T *A; // first input vector, allocated in main() and filled by read_input()
+static T *B; // second input vector, allocated in main() and filled by read_input()
+static T *C; // allocated in main(); presumably the host reference result, confirm against the rest of main()
+static T *C2; // allocated in main(), which aliases it as bufferC
// Create input arrays
-static void read_input(T* A, T* B, unsigned int nr_elements) {
- srand(0);
- for (unsigned int i = 0; i < nr_elements; i++) {
- A[i] = (T) (rand());
- B[i] = (T) (rand());
- }
+static void read_input(T *A, T *B, unsigned int nr_elements) // Fill the first nr_elements entries of A and B with pseudo-random values; the parameters shadow the file-scope A and B.
+{
+ srand(0); // fixed seed: the generated inputs are the same on every run
+ for (unsigned int i = 0; i < nr_elements; i++) {
+ A[i] = (T) (rand());
+ B[i] = (T) (rand());
+ }
}
// Compute output in the host
-static void vector_addition_host(T* C, T* A, T* B, unsigned int nr_elements) {
- for (unsigned int i = 0; i < nr_elements; i++) {
- C[i] = A[i] + B[i];
- }
+static void vector_addition_host(T *C, T *A, T *B, unsigned int nr_elements) // Sequential element-wise C = A + B over nr_elements entries.
+{
+ for (unsigned int i = 0; i < nr_elements; i++) {
+ C[i] = A[i] + B[i];
+ }
}
// Main of the Host Application
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
- struct Params p = input_params(argc, argv);
+ struct Params p = input_params(argc, argv);
- struct dpu_set_t dpu_set, dpu;
- uint32_t nr_of_dpus;
- uint32_t nr_of_ranks;
+ struct dpu_set_t dpu_set, dpu;
+ uint32_t nr_of_dpus;
+ uint32_t nr_of_ranks;
#if ENERGY
- struct dpu_probe_t probe;
- DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+ struct dpu_probe_t probe;
+ DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif
- // Timer declaration
- Timer timer;
+ // Timer declaration
+ Timer timer;
- int numa_node_rank = -2;
+ int numa_node_rank = -2;
- // Allocate DPUs and load binary
+ // Allocate DPUs and load binary
#if !WITH_ALLOC_OVERHEAD
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- timer.time[0] = 0; // alloc
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ timer.time[0] = 0; // alloc
#endif
#if !WITH_LOAD_OVERHEAD
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
- timer.time[1] = 0; // load
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
+ timer.time[1] = 0; // load
#endif
#if !WITH_FREE_OVERHEAD
- timer.time[6] = 0; // free
+ timer.time[6] = 0; // free
#endif
- unsigned int i = 0;
- const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size;
- const unsigned int input_size_8bytes =
- ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
- const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
- const unsigned int input_size_dpu_8bytes =
- ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
-
- // Input/output allocation
- A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
- B = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
- C = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
- C2 = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
- T *bufferA = A;
- T *bufferB = B;
- T *bufferC = C2;
-
- // Create an input file with arbitrary data
- read_input(A, B, input_size);
-
- // Loop over main kernel
- for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
+ unsigned int i = 0;
+ const unsigned int input_size =
+ p.exp == 0 ? p.input_size * NR_DPUS : p.input_size;
+ const unsigned int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
+ const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.)
+ const unsigned int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
+
+ // Input/output allocation
+ A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+ B = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+ C = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+ C2 = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T));
+ T *bufferA = A;
+ T *bufferB = B;
+ T *bufferC = C2;
+
+ // Create an input file with arbitrary data
+ read_input(A, B, input_size);
+
+ // Loop over main kernel
+ for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
#if WITH_ALLOC_OVERHEAD
- if(rep >= p.n_warmup) {
- start(&timer, 0, 0);
- }
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- if(rep >= p.n_warmup) {
- stop(&timer, 0);
- }
+ if (rep >= p.n_warmup) {
+ start(&timer, 0, 0);
+ }
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+ if (rep >= p.n_warmup) {
+ stop(&timer, 0);
+ }
#endif
#if WITH_DPUINFO
- printf("DPUs:");
- DPU_FOREACH (dpu_set, dpu) {
- int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
- int slice = dpu_get_slice_id(dpu_from_set(dpu));
- int member = dpu_get_member_id(dpu_from_set(dpu));
- printf(" %d(%d.%d)", rank, slice, member);
- }
- printf("\n");
+ printf("DPUs:");
+ DPU_FOREACH(dpu_set, dpu) {
+ int rank =
+ dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+ DPU_TARGET_MASK;
+ int slice = dpu_get_slice_id(dpu_from_set(dpu));
+ int member = dpu_get_member_id(dpu_from_set(dpu));
+ printf(" %d(%d.%d)", rank, slice, member);
+ }
+ printf("\n");
#endif
#if WITH_LOAD_OVERHEAD
- if(rep >= p.n_warmup) {
- start(&timer, 1, 0);
- }
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- if(rep >= p.n_warmup) {
- stop(&timer, 1);
- }
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
- DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
- assert(nr_of_dpus == NR_DPUS);
+ if (rep >= p.n_warmup) {
+ start(&timer, 1, 0);
+ }
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+ if (rep >= p.n_warmup) {
+ stop(&timer, 1);
+ }
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks));
+ assert(nr_of_dpus == NR_DPUS);
#endif
- // int prev_rank_id = -1;
- int rank_id = -1;
- DPU_FOREACH (dpu_set, dpu) {
- rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK;
- if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) {
- numa_node_rank = -1;
- } else {
- numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)));
- }
- /*
- if (rank_id != prev_rank_id) {
- printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
- prev_rank_id = rank_id;
- }
- */
- }
-
-
- // Compute output on CPU (performance comparison and verification purposes)
- if(rep >= p.n_warmup) {
- start(&timer, 2, 0);
- }
- vector_addition_host(C, A, B, input_size);
- if(rep >= p.n_warmup) {
- stop(&timer, 2);
- }
-
- if(rep >= p.n_warmup) {
- start(&timer, 3, 0);
- }
- // Input arguments
- unsigned int kernel = 0;
- dpu_arguments_t input_arguments[NR_DPUS];
- for(i=0; i<nr_of_dpus-1; i++) {
- input_arguments[i].size=input_size_dpu_8bytes * sizeof(T);
- input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T);
- input_arguments[i].kernel=kernel;
- }
- input_arguments[nr_of_dpus-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T);
- input_arguments[nr_of_dpus-1].transfer_size=input_size_dpu_8bytes * sizeof(T);
- input_arguments[nr_of_dpus-1].kernel=kernel;
-
- // Copy input arrays
- i = 0;
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
-
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
-
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu_8bytes * i));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup) {
- stop(&timer, 3);
- }
-
- // Run DPU kernel
- if(rep >= p.n_warmup) {
- start(&timer, 4, 0);
- #if ENERGY
- DPU_ASSERT(dpu_probe_start(&probe));
- #endif
- }
- DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- if(rep >= p.n_warmup) {
- stop(&timer, 4);
- #if ENERGY
- DPU_ASSERT(dpu_probe_stop(&probe));
- #endif
- }
-
+ // int prev_rank_id = -1;
+ int rank_id = -1;
+ DPU_FOREACH(dpu_set, dpu) {
+ rank_id =
+ dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) &
+ DPU_TARGET_MASK;
+ if ((numa_node_rank != -2)
+ && numa_node_rank !=
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)))) {
+ numa_node_rank = -1;
+ } else {
+ numa_node_rank =
+ dpu_get_rank_numa_node(dpu_get_rank
+ (dpu_from_set(dpu)));
+ }
+ /*
+ if (rank_id != prev_rank_id) {
+ printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank);
+ prev_rank_id = rank_id;
+ }
+ */
+ }
+
+ // Compute output on CPU (performance comparison and verification purposes)
+ if (rep >= p.n_warmup) {
+ start(&timer, 2, 0);
+ }
+ vector_addition_host(C, A, B, input_size);
+ if (rep >= p.n_warmup) {
+ stop(&timer, 2);
+ }
+
+ if (rep >= p.n_warmup) {
+ start(&timer, 3, 0);
+ }
+ // Input arguments
+ unsigned int kernel = 0;
+ dpu_arguments_t input_arguments[NR_DPUS];
+ for (i = 0; i < nr_of_dpus - 1; i++) {
+ input_arguments[i].size =
+ input_size_dpu_8bytes * sizeof(T);
+ input_arguments[i].transfer_size =
+ input_size_dpu_8bytes * sizeof(T);
+ input_arguments[i].kernel = kernel;
+ }
+ input_arguments[nr_of_dpus - 1].size =
+ (input_size_8bytes -
+ input_size_dpu_8bytes * (NR_DPUS - 1)) * sizeof(T);
+ input_arguments[nr_of_dpus - 1].transfer_size =
+ input_size_dpu_8bytes * sizeof(T);
+ input_arguments[nr_of_dpus - 1].kernel = kernel;
+
+ // Copy input arrays
+ i = 0;
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0,
+ sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
+
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, bufferA + input_size_dpu_8bytes * i));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME, 0,
+ input_size_dpu_8bytes * sizeof(T),
+ DPU_XFER_DEFAULT));
+
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, bufferB + input_size_dpu_8bytes * i));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_TO_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ input_size_dpu_8bytes * sizeof(T),
+ input_size_dpu_8bytes * sizeof(T),
+ DPU_XFER_DEFAULT));
+ if (rep >= p.n_warmup) {
+ stop(&timer, 3);
+ }
+ // Run DPU kernel
+ if (rep >= p.n_warmup) {
+ start(&timer, 4, 0);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+ }
+ DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+ if (rep >= p.n_warmup) {
+ stop(&timer, 4);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_stop(&probe));
+#endif
+ }
#if PRINT
- {
- unsigned int each_dpu = 0;
- printf("Display DPU Logs\n");
- DPU_FOREACH (dpu_set, dpu) {
- printf("DPU#%d:\n", each_dpu);
- DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
- each_dpu++;
- }
- }
+ {
+ unsigned int each_dpu = 0;
+ printf("Display DPU Logs\n");
+ DPU_FOREACH(dpu_set, dpu) {
+ printf("DPU#%d:\n", each_dpu);
+ DPU_ASSERT(dpulog_read_for_dpu
+ (dpu.dpu, stdout));
+ each_dpu++;
+ }
+ }
#endif
- if(rep >= p.n_warmup) {
- start(&timer, 5, 0);
- }
- i = 0;
- // PARALLEL RETRIEVE TRANSFER
- DPU_FOREACH(dpu_set, dpu, i) {
- DPU_ASSERT(dpu_prepare_xfer(dpu, bufferC + input_size_dpu_8bytes * i));
- }
- DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
- if(rep >= p.n_warmup) {
- stop(&timer, 5);
- }
-
+ if (rep >= p.n_warmup) {
+ start(&timer, 5, 0);
+ }
+ i = 0;
+ // PARALLEL RETRIEVE TRANSFER
+ DPU_FOREACH(dpu_set, dpu, i) {
+ DPU_ASSERT(dpu_prepare_xfer
+ (dpu, bufferC + input_size_dpu_8bytes * i));
+ }
+ DPU_ASSERT(dpu_push_xfer
+ (dpu_set, DPU_XFER_FROM_DPU,
+ DPU_MRAM_HEAP_POINTER_NAME,
+ input_size_dpu_8bytes * sizeof(T),
+ input_size_dpu_8bytes * sizeof(T),
+ DPU_XFER_DEFAULT));
+ if (rep >= p.n_warmup) {
+ stop(&timer, 5);
+ }
#if WITH_ALLOC_OVERHEAD
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
- start(&timer, 6, 0);
- }
+ if (rep >= p.n_warmup) {
+ start(&timer, 6, 0);
+ }
#endif
- DPU_ASSERT(dpu_free(dpu_set));
+ DPU_ASSERT(dpu_free(dpu_set));
#if WITH_FREE_OVERHEAD
- if(rep >= p.n_warmup) {
- stop(&timer, 6);
- }
+ if (rep >= p.n_warmup) {
+ stop(&timer, 6);
+ }
#endif
#endif
- // Check output
- bool status = true;
- for (i = 0; i < input_size; i++) {
- if(C[i] != bufferC[i]){
- status = false;
+ // Check output
+ bool status = true;
+ for (i = 0; i < input_size; i++) {
+ if (C[i] != bufferC[i]) {
+ status = false;
#if PRINT
- printf("%d: %u -- %u\n", i, C[i], bufferC[i]);
+ printf("%d: %u -- %u\n", i, C[i], bufferC[i]);
#endif
- }
- }
- if (status) {
- printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
- if (rep >= p.n_warmup) {
- printf("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d",
- nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size, input_size / NR_DPUS);
- printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
- WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank);
- printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
- timer.time[0],
- timer.time[1],
- timer.time[2],
- timer.time[3],
- timer.time[4],
- timer.time[5],
- timer.time[6]);
- printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
- input_size * 3 * sizeof(T) / timer.time[2],
- input_size * 3 * sizeof(T) / (timer.time[4]),
- input_size * 3 * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
- printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
- input_size * 3 * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]),
- input_size * 3 * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
- input_size * 3 * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
- printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
- input_size / timer.time[2],
- input_size / (timer.time[4]),
- input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6]));
- printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
- input_size / (timer.time[3] + timer.time[4] + timer.time[5]),
- input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]),
- input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]));
- }
- } else {
- printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
- }
- }
+ }
+ }
+ if (status) {
+ printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET
+ "] Outputs are equal\n");
+ if (rep >= p.n_warmup) {
+ printf
+ ("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d",
+ nr_of_dpus, nr_of_ranks, NR_TASKLETS,
+ XSTR(T), BLOCK_SIZE, input_size,
+ input_size / NR_DPUS);
+ printf
+ (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD,
+ WITH_FREE_OVERHEAD, numa_node_rank);
+ printf
+ ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f",
+ timer.time[0], timer.time[1],
+ timer.time[2], timer.time[3],
+ timer.time[4], timer.time[5],
+ timer.time[6]);
+ printf
+ (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f",
+ input_size * 3 * sizeof(T) / timer.time[2],
+ input_size * 3 * sizeof(T) /
+ (timer.time[4]),
+ input_size * 3 * sizeof(T) /
+ (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[4] +
+ timer.time[5] + timer.time[6]));
+ printf
+ (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f",
+ input_size * 3 * sizeof(T) /
+ (timer.time[3] + timer.time[4] +
+ timer.time[5]),
+ input_size * 3 * sizeof(T) /
+ (timer.time[1] + timer.time[3] +
+ timer.time[4] + timer.time[5]),
+ input_size * 3 * sizeof(T) /
+ (timer.time[0] + timer.time[1] +
+ timer.time[3] + timer.time[4] +
+ timer.time[5]));
+ printf
+ (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f",
+ input_size / timer.time[2],
+ input_size / (timer.time[4]),
+ input_size / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5] +
+ timer.time[6]));
+ printf
+ (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n",
+ input_size / (timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ input_size / (timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]),
+ input_size / (timer.time[0] +
+ timer.time[1] +
+ timer.time[3] +
+ timer.time[4] +
+ timer.time[5]));
+ }
+ } else {
+ printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET
+ "] Outputs differ!\n");
+ }
+ }
#if ENERGY
- double energy;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
- printf("DPU Energy (J): %f\t", energy);
-#endif
-
+ double energy;
+ DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+ printf("DPU Energy (J): %f\t", energy);
+#endif
- // Deallocation
- free(A);
- free(B);
- free(C);
- free(C2);
+ // Deallocation
+ free(A);
+ free(B);
+ free(C);
+ free(C2);
#if !WITH_ALLOC_OVERHEAD
- DPU_ASSERT(dpu_free(dpu_set));
+ DPU_ASSERT(dpu_free(dpu_set));
#endif
-
- return 0;
+
+ return 0;
}
diff --git a/VA/support/common.h b/VA/support/common.h
index c1043fd..cee09e2 100755
--- a/VA/support/common.h
+++ b/VA/support/common.h
@@ -3,11 +3,11 @@
// Structures used by both the host and the dpu to communicate information
typedef struct {
- uint32_t size;
- uint32_t transfer_size;
+ uint32_t size;
+ uint32_t transfer_size;
enum kernels {
- kernel1 = 0,
- nr_kernels = 1,
+ kernel1 = 0,
+ nr_kernels = 1,
} kernel;
} dpu_arguments_t;
@@ -24,34 +24,34 @@ typedef struct {
// Data type
#ifdef UINT32
#define T uint32_t
-#define DIV 2 // Shift right to divide by sizeof(T)
+#define DIV 2 // Shift right to divide by sizeof(T)
#elif UINT64
#define T uint64_t
-#define DIV 3 // Shift right to divide by sizeof(T)
+#define DIV 3 // Shift right to divide by sizeof(T)
#elif INT32
#define T int32_t
-#define DIV 2 // Shift right to divide by sizeof(T)
+#define DIV 2 // Shift right to divide by sizeof(T)
#elif INT64
#define T int64_t
-#define DIV 3 // Shift right to divide by sizeof(T)
+#define DIV 3 // Shift right to divide by sizeof(T)
#elif FLOAT
#define T float
-#define DIV 2 // Shift right to divide by sizeof(T)
+#define DIV 2 // Shift right to divide by sizeof(T)
#elif DOUBLE
#define T double
-#define DIV 3 // Shift right to divide by sizeof(T)
+#define DIV 3 // Shift right to divide by sizeof(T)
#elif CHAR
#define T char
-#define DIV 0 // Shift right to divide by sizeof(T)
+#define DIV 0 // Shift right to divide by sizeof(T)
#elif SHORT
#define T short
-#define DIV 1 // Shift right to divide by sizeof(T)
+#define DIV 1 // Shift right to divide by sizeof(T)
#endif
#ifndef ENERGY
#define ENERGY 0
#endif
-#define PRINT 0
+#define PRINT 0
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
diff --git a/VA/support/params.h b/VA/support/params.h
index 8bd71a6..47c10ef 100644
--- a/VA/support/params.h
+++ b/VA/support/params.h
@@ -4,53 +4,62 @@
#include "common.h"
typedef struct Params {
- unsigned int input_size;
- int n_warmup;
- int n_reps;
- int exp;
-}Params;
+ unsigned int input_size;
+ int n_warmup;
+ int n_reps;
+ int exp;
+} Params;
-static void usage() {
- fprintf(stderr,
- "\nUsage: ./program [options]"
- "\n"
- "\nGeneral options:"
- "\n -h help"
- "\n -w <W> # of untimed warmup iterations (default=1)"
- "\n -e <E> # of timed repetition iterations (default=3)"
- "\n -x <X> Weak (0) or strong (1) scaling (default=0)"
- "\n"
- "\nBenchmark-specific options:"
- "\n -i <I> input size (default=2621440 elements)"
- "\n");
+static void usage()
+{
+ fprintf(stderr,
+ "\nUsage: ./program [options]"
+ "\n"
+ "\nGeneral options:"
+ "\n -h help"
+ "\n -w <W> # of untimed warmup iterations (default=1)"
+ "\n -e <E> # of timed repetition iterations (default=3)"
+ "\n -x <X> Weak (0) or strong (1) scaling (default=0)"
+ "\n"
+ "\nBenchmark-specific options:"
+ "\n -i <I> input size (default=2621440 elements)" "\n");
}
-struct Params input_params(int argc, char **argv) {
- struct Params p;
- p.input_size = 2621440;
- p.n_warmup = 1;
- p.n_reps = 3;
- p.exp = 0;
+struct Params input_params(int argc, char **argv)
+{
+ struct Params p;
+ p.input_size = 2621440;
+ p.n_warmup = 1;
+ p.n_reps = 3;
+ p.exp = 0;
- int opt;
- while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
- switch(opt) {
- case 'h':
- usage();
- exit(0);
- break;
- case 'i': p.input_size = atoi(optarg); break;
- case 'w': p.n_warmup = atoi(optarg); break;
- case 'e': p.n_reps = atoi(optarg); break;
- case 'x': p.exp = atoi(optarg); break;
- default:
- fprintf(stderr, "\nUnrecognized option!\n");
- usage();
- exit(0);
- }
- }
- assert(NR_DPUS > 0 && "Invalid # of dpus!");
+ int opt;
+ while ((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) {
+ switch (opt) {
+ case 'h':
+ usage();
+ exit(0);
+ break;
+ case 'i':
+ p.input_size = atoi(optarg);
+ break;
+ case 'w':
+ p.n_warmup = atoi(optarg);
+ break;
+ case 'e':
+ p.n_reps = atoi(optarg);
+ break;
+ case 'x':
+ p.exp = atoi(optarg);
+ break;
+ default:
+ fprintf(stderr, "\nUnrecognized option!\n");
+ usage();
+ exit(0);
+ }
+ }
+ assert(NR_DPUS > 0 && "Invalid # of dpus!");
- return p;
+ return p;
}
#endif
diff --git a/VA/support/timer.h b/VA/support/timer.h
index 4d597b9..df68334 100755
--- a/VA/support/timer.h
+++ b/VA/support/timer.h
@@ -1,66 +1,74 @@
-/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/*
+ * Copyright (c) 2016 University of Cordoba and University of Illinois
+ * All rights reserved.
+ *
+ * Developed by: IMPACT Research Group
+ * University of Cordoba and University of Illinois
+ * http://impact.crhc.illinois.edu/
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * with the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * > Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimers.
+ * > Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ * documentation and/or other materials provided with the distribution.
+ * > Neither the names of IMPACT Research Group, University of Cordoba,
+ * University of Illinois nor the names of its contributors may be used
+ * to endorse or promote products derived from this Software without
+ * specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+ * THE SOFTWARE.
+ *
+ */
+
+#include <sys/time.h>
+
+typedef struct Timer {
+
+ struct timeval startTime[7];
+ struct timeval stopTime[7];
+ double time[7];
+
+} Timer;
+
+void start(Timer *timer, int i, int rep)
+{
+ if (rep == 0) {
+ timer->time[i] = 0.0;
+ }
+ gettimeofday(&timer->startTime[i], NULL);
+}
+
+void stop(Timer *timer, int i)
+{
+ gettimeofday(&timer->stopTime[i], NULL);
+ timer->time[i] +=
+ (timer->stopTime[i].tv_sec -
+ timer->startTime[i].tv_sec) * 1000000.0 +
+ (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
+}
+
+void print(Timer *timer, int i, int REP)
+{
+ printf("Time (ms): %f\t", timer->time[i] / (1000 * REP));
+}
+
+void printall(Timer *timer, int maxt)
+{
+ for (int i = 0; i <= maxt; i++) {
+ printf(" timer%d_us=%f", i, timer->time[i]);
+ }
+ printf("\n");
+}
diff --git a/perf-events.txt b/perf-events.txt
new file mode 100644
index 0000000..ab57ab2
--- /dev/null
+++ b/perf-events.txt
@@ -0,0 +1,44 @@
+cache-misses # NMPO
+cache-references
+
+cpu-cycles # NMPO
+instructions # NMPO
+
+page-faults
+
+mem-loads
+mem-loads-aux
+mem-stores
+
+branch-misses # NMPO
+branch-instructions # NMPO
+branch-load-misses # NMPO
+branch-loads # NMPO
+
+l1d_pend_miss.pending # mccalpin2023hpc <https://link.springer.com/chapter/10.1007/978-3-031-40843-4_30>
+l1d_pend_miss.pending_cycles
+
+offcore_requests.all_requests
+offcore_requests.data_rd
+offcore_requests.demand_data_rd
+
+offcore_requests_outstanding.data_rd # mccalpin2023hpc
+offcore_requests_outstanding.cycles_with_data_rd
+offcore_requests_outstanding.cycles_with_demand_data_rd
+offcore_requests_outstanding.demand_data_rd # mccalpin2023hpc
+
+L1-dcache-loads # NMPO
+L1-dcache-load-misses # NMPO
+L1-dcache-stores # NMPO
+L1-icache-load-misses # NMPO
+
+LLC-loads # NMPO
+LLC-load-misses
+LLC-stores # NMPO
+LLC-store-misses # NMPO
+
+l2_lines_out.useless_hwpf
+l2_lines_out.non_silent
+l2_lines_out.silent
+l2_request.all
+l2_request.miss