diff options
77 files changed, 5814 insertions, 4330 deletions
diff --git a/BFS/Makefile b/BFS/Makefile index a4ea69d..d43202f 100644 --- a/BFS/Makefile +++ b/BFS/Makefile @@ -1,12 +1,15 @@ NR_DPUS ?= 1 NR_TASKLETS ?= 16 +WITH_ALLOC_OVERHEAD ?= 0 +WITH_LOAD_OVERHEAD ?= 0 +WITH_FREE_OVERHEAD ?= 0 COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} +HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} QUIET = @ diff --git a/BFS/baselines/cpu/Makefile b/BFS/baselines/cpu/Makefile index 6f082b1..1efe457 100644 --- a/BFS/baselines/cpu/Makefile +++ b/BFS/baselines/cpu/Makefile @@ -1,8 +1,26 @@ -.PHONY: all -all: bfs +benchmark ?= 1 +debug ?= 0 +native ?= 1 +nop_sync ?= 0 +numa ?= 0 + +LDFLAGS = +CFLAGS = + +ifeq (${debug}, 1) + CFLAGS += -g +endif + +ifeq (${native}, 1) + CFLAGS += -march=native +endif + +ifeq (${numa}, 1) + LDFLAGS += -lnuma +endif bfs: app.c - gcc -O2 -o bfs -fopenmp app.c + gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -o bfs -fopenmp app.c ${LDFLAGS} bfs_O0: app.c gcc -o bfs_O0 -fopenmp app.c @@ -27,3 +45,5 @@ run_O2: bfs_O2 .PHONY: clean clean: rm -f bfs bfs_O0 bfs_O2 + +.PHONY: all diff --git a/BFS/baselines/cpu/app.c b/BFS/baselines/cpu/app.c index caf4cbc..390b1f9 100644 --- a/BFS/baselines/cpu/app.c +++ b/BFS/baselines/cpu/app.c @@ -8,12 +8,30 @@ #include <omp.h> +#if NUMA +#include <numaif.h> +#include <numa.h> + +void* mp_pages[1]; +int mp_status[1]; +int mp_nodes[1]; +struct bitmask* bitmask_in; +int numa_node_in = -1; 
+int numa_node_cpu = -1; +#endif + #include "../../support/common.h" #include "../../support/graph.h" #include "../../support/params.h" -#include "../../support/timer.h" #include "../../support/utils.h" +#if WITH_BENCHMARK +#include "../../support/timer.h" +#else +#define startTimer(...) +#define stopTimer(...) +#endif + int main(int argc, char** argv) { // Process parameters @@ -24,8 +42,9 @@ int main(int argc, char** argv) { struct COOGraph cooGraph = readCOOGraph(p.fileName); PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges); - +#if WITH_BENCHMARK Timer timer; +#endif for(int rep = 0; rep < 100; rep++) { struct CSRGraph csrGraph = coo2csr(cooGraph); @@ -43,6 +62,12 @@ int main(int argc, char** argv) { uint32_t* prevFrontier = buffer1; uint32_t* currFrontier = buffer2; +#if NOP_SYNC + for(int rep = 0; rep < 200000; rep++) { + asm volatile("nop" ::); + } +#endif + // Calculating result on CPU startTimer(&timer, 0, 0); nodeLevel[srcNode] = 0; @@ -86,6 +111,12 @@ int main(int argc, char** argv) { } stopTimer(&timer, 0); +#if NOP_SYNC + for(int rep = 0; rep < 200000; rep++) { + asm volatile("nop" ::); + } +#endif + freeCSRGraph(csrGraph); free(buffer1); free(buffer2); @@ -135,6 +166,7 @@ int main(int argc, char** argv) { } stopTimer(&timer, 1); +#if WITH_BENCHMARK unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic @@ -158,8 +190,11 @@ int main(int argc, char** argv) { printf(" throughput_seq_MOpps=%f throughput_MOpps=%f", csrGraph.numNodes / timer.time[1], csrGraph.numNodes / timer.time[0]); - printAll(&timer, 1); + printf(" latency_us=%f latency_seq_us=%f\n", + timer.time[0], + timer.time[1]); } +#endif // WITH_BENCHMARK freeCSRGraph(csrGraph); free(nodeLevel); diff --git a/BFS/dpu/dpu-utils.h b/BFS/dpu/dpu-utils.h index b02c073..dc986d2 100644 --- a/BFS/dpu/dpu-utils.h +++ b/BFS/dpu/dpu-utils.h @@ -6,39 +6,46 @@ #define PRINT_ERROR(fmt, ...) 
printf("\033[0;31mERROR:\033[0m "fmt"\n", ##__VA_ARGS__) -static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) { - mram_read((__mram_ptr void const*)(ptr_m + idx*sizeof(uint64_t)), cache_w, 8); - return cache_w[0]; +static uint64_t load8B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w) +{ + mram_read((__mram_ptr void const *)(ptr_m + idx * sizeof(uint64_t)), + cache_w, 8); + return cache_w[0]; } -static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) { - cache_w[0] = val; - mram_write(cache_w, (__mram_ptr void*)(ptr_m + idx*sizeof(uint64_t)), 8); +static void store8B(uint64_t val, uint32_t ptr_m, uint32_t idx, + uint64_t *cache_w) +{ + cache_w[0] = val; + mram_write(cache_w, (__mram_ptr void *)(ptr_m + idx * sizeof(uint64_t)), + 8); } -static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) { - // Load 8B - uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t); - uint32_t offset = ((uint32_t)ptr_idx_m)%8; - uint32_t ptr_block_m = ptr_idx_m - offset; - mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8); - // Extract 4B - uint32_t* cache_32_w = (uint32_t*) cache_w; - return cache_32_w[offset/4]; +static uint32_t load4B(uint32_t ptr_m, uint32_t idx, uint64_t *cache_w) +{ + // Load 8B + uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t); + uint32_t offset = ((uint32_t) ptr_idx_m) % 8; + uint32_t ptr_block_m = ptr_idx_m - offset; + mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8); + // Extract 4B + uint32_t *cache_32_w = (uint32_t *) cache_w; + return cache_32_w[offset / 4]; } -static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx, uint64_t* cache_w) { - // Load 8B - uint32_t ptr_idx_m = ptr_m + idx*sizeof(uint32_t); - uint32_t offset = ((uint32_t)ptr_idx_m)%8; - uint32_t ptr_block_m = ptr_idx_m - offset; - mram_read((__mram_ptr void const*)ptr_block_m, cache_w, 8); - // Modify 4B - uint32_t* cache_32_w = (uint32_t*) cache_w; - cache_32_w[offset/4] = val; - // Write back 8B - 
mram_write(cache_w, (__mram_ptr void*)ptr_block_m, 8); +static void store4B(uint32_t val, uint32_t ptr_m, uint32_t idx, + uint64_t *cache_w) +{ + // Load 8B + uint32_t ptr_idx_m = ptr_m + idx * sizeof(uint32_t); + uint32_t offset = ((uint32_t) ptr_idx_m) % 8; + uint32_t ptr_block_m = ptr_idx_m - offset; + mram_read((__mram_ptr void const *)ptr_block_m, cache_w, 8); + // Modify 4B + uint32_t *cache_32_w = (uint32_t *) cache_w; + cache_32_w[offset / 4] = val; + // Write back 8B + mram_write(cache_w, (__mram_ptr void *)ptr_block_m, 8); } #endif - diff --git a/BFS/dpu/task.c b/BFS/dpu/task.c index 43a2d0f..44ec214 100644 --- a/BFS/dpu/task.c +++ b/BFS/dpu/task.c @@ -20,127 +20,155 @@ BARRIER_INIT(bfsBarrier, NR_TASKLETS); MUTEX_INIT(nextFrontierMutex); // main -int main() { - - if(me() == 0) { - mem_reset(); // Reset the heap - } - // Barrier - barrier_wait(&my_barrier); - - // Load parameters - uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER; - struct DPUParams* params_w = (struct DPUParams*) mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))); - mram_read((__mram_ptr void const*)params_m, params_w, ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))); - - // Extract parameters - uint32_t numGlobalNodes = params_w->numNodes; - uint32_t startNodeIdx = params_w->dpuStartNodeIdx; - uint32_t numNodes = params_w->dpuNumNodes; - uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset; - uint32_t level = params_w->level; - uint32_t nodePtrs_m = params_w->dpuNodePtrs_m; - uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m; - uint32_t nodeLevel_m = params_w->dpuNodeLevel_m; - uint32_t visited_m = params_w->dpuVisited_m; - uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m; - uint32_t nextFrontier_m = params_w->dpuNextFrontier_m; - - if(numNodes > 0) { - - // Sanity check - if(me() == 0) { - if(numGlobalNodes%64 != 0) { - //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!"); - } - if(startNodeIdx%64 != 0 || numNodes%64 != 0) { - 
//PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!"); - } - } - - // Allocate WRAM cache for each tasklet to use throughout - uint64_t* cache_w = mem_alloc(sizeof(uint64_t)); - - // Update current frontier and visited list based on the next frontier from the previous iteration - for(uint32_t nodeTileIdx = me(); nodeTileIdx < numGlobalNodes/64; nodeTileIdx += NR_TASKLETS) { - - // Get the next frontier tile from MRAM - uint64_t nextFrontierTile = load8B(nextFrontier_m, nodeTileIdx, cache_w); - - // Process next frontier tile if it is not empty - if(nextFrontierTile) { - - // Mark everything that was previously added to the next frontier as visited - uint64_t visitedTile = load8B(visited_m, nodeTileIdx, cache_w); - visitedTile |= nextFrontierTile; - store8B(visitedTile, visited_m, nodeTileIdx, cache_w); - - // Clear the next frontier - store8B(0, nextFrontier_m, nodeTileIdx, cache_w); - - } - - // Extract the current frontier from the previous next frontier and update node levels - uint32_t startTileIdx = startNodeIdx/64; - uint32_t numTiles = numNodes/64; - if(startTileIdx <= nodeTileIdx && nodeTileIdx < startTileIdx + numTiles) { - - // Update current frontier - store8B(nextFrontierTile, currentFrontier_m, nodeTileIdx - startTileIdx, cache_w); - - // Update node levels - if(nextFrontierTile) { - for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) { - if(isSet(nextFrontierTile, node%64)) { - store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks - } - } - } - } - - } - - // Wait until all tasklets have updated the current frontier - barrier_wait(&bfsBarrier); - - // Identify tasklet's nodes - uint32_t numNodesPerTasklet = (numNodes + NR_TASKLETS - 1)/NR_TASKLETS; - uint32_t taskletNodesStart = me()*numNodesPerTasklet; - uint32_t taskletNumNodes; - if(taskletNodesStart > numNodes) { - taskletNumNodes = 0; - } else if(taskletNodesStart + numNodesPerTasklet > 
numNodes) { - taskletNumNodes = numNodes - taskletNodesStart; - } else { - taskletNumNodes = numNodesPerTasklet; - } - - // Visit neighbors of the current frontier - mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex); - for(uint32_t node = taskletNodesStart; node < taskletNodesStart + taskletNumNodes; ++node) { - uint32_t nodeTileIdx = node/64; - uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile - if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier - // Visit its neighbors - uint32_t nodePtr = load4B(nodePtrs_m, node, cache_w) - nodePtrsOffset; - uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr - for(uint32_t i = nodePtr; i < nextNodePtr; ++i) { - uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader - uint32_t neighborTileIdx = neighbor/64; - uint64_t visitedTile = load8B(visited_m, neighborTileIdx, cache_w); - if(!isSet(visitedTile, neighbor%64)) { // Neighbor not previously visited - // Add neighbor to next frontier - mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention - uint64_t nextFrontierTile = load8B(nextFrontier_m, neighborTileIdx, cache_w); - setBit(nextFrontierTile, neighbor%64); - store8B(nextFrontierTile, nextFrontier_m, neighborTileIdx, cache_w); - mutex_unlock(mutexID); - } - } - } - } - - } - - return 0; +int main() +{ + + if (me() == 0) { + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + + // Load parameters + uint32_t params_m = (uint32_t) DPU_MRAM_HEAP_POINTER; + struct DPUParams *params_w = + (struct DPUParams *) + mem_alloc(ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))); + mram_read((__mram_ptr void const *)params_m, params_w, + ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams))); + + // Extract 
parameters + uint32_t numGlobalNodes = params_w->numNodes; + uint32_t startNodeIdx = params_w->dpuStartNodeIdx; + uint32_t numNodes = params_w->dpuNumNodes; + uint32_t nodePtrsOffset = params_w->dpuNodePtrsOffset; + uint32_t level = params_w->level; + uint32_t nodePtrs_m = params_w->dpuNodePtrs_m; + uint32_t neighborIdxs_m = params_w->dpuNeighborIdxs_m; + uint32_t nodeLevel_m = params_w->dpuNodeLevel_m; + uint32_t visited_m = params_w->dpuVisited_m; + uint32_t currentFrontier_m = params_w->dpuCurrentFrontier_m; + uint32_t nextFrontier_m = params_w->dpuNextFrontier_m; + + if (numNodes > 0) { + + // Sanity check + if (me() == 0) { + if (numGlobalNodes % 64 != 0) { + //PRINT_ERROR("The number of nodes in the graph is not a multiple of 64!"); + } + if (startNodeIdx % 64 != 0 || numNodes % 64 != 0) { + //PRINT_ERROR("The number of nodes assigned to the DPU is not aligned to or a multiple of 64!"); + } + } + // Allocate WRAM cache for each tasklet to use throughout + uint64_t *cache_w = mem_alloc(sizeof(uint64_t)); + + // Update current frontier and visited list based on the next frontier from the previous iteration + for (uint32_t nodeTileIdx = me(); + nodeTileIdx < numGlobalNodes / 64; + nodeTileIdx += NR_TASKLETS) { + + // Get the next frontier tile from MRAM + uint64_t nextFrontierTile = + load8B(nextFrontier_m, nodeTileIdx, cache_w); + + // Process next frontier tile if it is not empty + if (nextFrontierTile) { + + // Mark everything that was previously added to the next frontier as visited + uint64_t visitedTile = + load8B(visited_m, nodeTileIdx, cache_w); + visitedTile |= nextFrontierTile; + store8B(visitedTile, visited_m, nodeTileIdx, + cache_w); + + // Clear the next frontier + store8B(0, nextFrontier_m, nodeTileIdx, + cache_w); + + } + // Extract the current frontier from the previous next frontier and update node levels + uint32_t startTileIdx = startNodeIdx / 64; + uint32_t numTiles = numNodes / 64; + if (startTileIdx <= nodeTileIdx + && nodeTileIdx < 
startTileIdx + numTiles) { + + // Update current frontier + store8B(nextFrontierTile, currentFrontier_m, + nodeTileIdx - startTileIdx, cache_w); + + // Update node levels + if (nextFrontierTile) { + for (uint32_t node = nodeTileIdx * 64; + node < (nodeTileIdx + 1) * 64; + ++node) { + if (isSet + (nextFrontierTile, + node % 64)) { + store4B(level, nodeLevel_m, node - startNodeIdx, cache_w); // No false sharing so no need for locks + } + } + } + } + + } + + // Wait until all tasklets have updated the current frontier + barrier_wait(&bfsBarrier); + + // Identify tasklet's nodes + uint32_t numNodesPerTasklet = + (numNodes + NR_TASKLETS - 1) / NR_TASKLETS; + uint32_t taskletNodesStart = me() * numNodesPerTasklet; + uint32_t taskletNumNodes; + if (taskletNodesStart > numNodes) { + taskletNumNodes = 0; + } else if (taskletNodesStart + numNodesPerTasklet > numNodes) { + taskletNumNodes = numNodes - taskletNodesStart; + } else { + taskletNumNodes = numNodesPerTasklet; + } + + // Visit neighbors of the current frontier + mutex_id_t mutexID = MUTEX_GET(nextFrontierMutex); + for (uint32_t node = taskletNodesStart; + node < taskletNodesStart + taskletNumNodes; ++node) { + uint32_t nodeTileIdx = node / 64; + uint64_t currentFrontierTile = load8B(currentFrontier_m, nodeTileIdx, cache_w); // TODO: Optimize: load tile then loop over nodes in the tile + if (isSet(currentFrontierTile, node % 64)) { // If the node is in the current frontier + // Visit its neighbors + uint32_t nodePtr = + load4B(nodePtrs_m, node, + cache_w) - nodePtrsOffset; + uint32_t nextNodePtr = load4B(nodePtrs_m, node + 1, cache_w) - nodePtrsOffset; // TODO: Optimize: might be in the same 8B as nodePtr + for (uint32_t i = nodePtr; i < nextNodePtr; ++i) { + uint32_t neighbor = load4B(neighborIdxs_m, i, cache_w); // TODO: Optimize: sequential access to neighbors can use sequential reader + uint32_t neighborTileIdx = + neighbor / 64; + uint64_t visitedTile = + load8B(visited_m, neighborTileIdx, + cache_w); + if 
(!isSet(visitedTile, neighbor % 64)) { // Neighbor not previously visited + // Add neighbor to next frontier + mutex_lock(mutexID); // TODO: Optimize: use more locks to reduce contention + uint64_t nextFrontierTile = + load8B(nextFrontier_m, + neighborTileIdx, + cache_w); + setBit(nextFrontierTile, + neighbor % 64); + store8B(nextFrontierTile, + nextFrontier_m, + neighborTileIdx, + cache_w); + mutex_unlock(mutexID); + } + } + } + } + + } + + return 0; } diff --git a/BFS/host/app.c b/BFS/host/app.c index 54b9cdc..9ba7ffb 100644 --- a/BFS/host/app.c +++ b/BFS/host/app.c @@ -30,305 +30,429 @@ #define DPU_BINARY "./bin/dpu_code" // Main of the Host Application -int main(int argc, char** argv) { - - // Process parameters - struct Params p = input_params(argc, argv); - - // Timer and profiling - Timer timer; - #if ENERGY - struct dpu_probe_t probe; - DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); - double tenergy=0; - #endif - - // Allocate DPUs and load binary - struct dpu_set_t dpu_set, dpu; - uint32_t numDPUs; - DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs)); - PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs); - - // Initialize BFS data structures - PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName); - struct COOGraph cooGraph = readCOOGraph(p.fileName); - PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges); - struct CSRGraph csrGraph = coo2csr(cooGraph); - uint32_t numNodes = csrGraph.numNodes; - uint32_t* nodePtrs = csrGraph.nodePtrs; - uint32_t* neighborIdxs = csrGraph.neighborIdxs; - uint32_t* nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) - uint64_t* visited = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node - uint64_t* currentFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit 
per node - uint64_t* nextFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node - setBit(nextFrontier[0], 0); // Initialize frontier to first node - uint32_t level = 1; - - // Partition data structure across DPUs - uint32_t numNodesPerDPU = ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1)/numDPUs + 1); - PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU", numNodesPerDPU); - struct DPUParams dpuParams[numDPUs]; - uint32_t dpuParams_m[numDPUs]; - unsigned int dpuIdx = 0; - unsigned int t0ini = 0; - unsigned int t1ini = 0; - unsigned int t2ini = 0; - unsigned int t3ini = 0; - DPU_FOREACH (dpu_set, dpu) { - - // Allocate parameters - struct mram_heap_allocator_t allocator; - init_allocator(&allocator); - dpuParams_m[dpuIdx] = mram_heap_alloc(&allocator, sizeof(struct DPUParams)); - - // Find DPU's nodes - uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU; - uint32_t dpuNumNodes; - if(dpuStartNodeIdx > numNodes) { - dpuNumNodes = 0; - } else if(dpuStartNodeIdx + numNodesPerDPU > numNodes) { - dpuNumNodes = numNodes - dpuStartNodeIdx; - } else { - dpuNumNodes = numNodesPerDPU; - } - dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes; - PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx); - PRINT_INFO(p.verbosity >= 2, " Receives %u nodes", dpuNumNodes); - - // Partition edges and copy data - if(dpuNumNodes > 0) { - - // Find DPU's CSR graph partition - uint32_t* dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx]; - uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0]; - uint32_t* dpuNeighborIdxs_h = neighborIdxs + dpuNodePtrsOffset; - uint32_t dpuNumNeighbors = dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset; - uint32_t* dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx]; - - // Allocate MRAM - uint32_t dpuNodePtrs_m = mram_heap_alloc(&allocator, (dpuNumNodes + 1)*sizeof(uint32_t)); - uint32_t dpuNeighborIdxs_m = mram_heap_alloc(&allocator, dpuNumNeighbors*sizeof(uint32_t)); - uint32_t dpuNodeLevel_m = mram_heap_alloc(&allocator, dpuNumNodes*sizeof(uint32_t)); - uint32_t 
dpuVisited_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t)); - uint32_t dpuCurrentFrontier_m = mram_heap_alloc(&allocator, dpuNumNodes/64*sizeof(uint64_t)); - uint32_t dpuNextFrontier_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t)); - PRINT_INFO(p.verbosity >= 2, " Total memory allocated is %d bytes", allocator.totalAllocated); - - // Set up DPU parameters - dpuParams[dpuIdx].numNodes = numNodes; - dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx; - dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset; - dpuParams[dpuIdx].level = level; - dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m; - dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m; - dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m; - dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m; - dpuParams[dpuIdx].dpuCurrentFrontier_m = dpuCurrentFrontier_m; - dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m; - - // Send data to DPU - PRINT_INFO(p.verbosity >= 2, " Copying data to DPU"); - startTimer(&timer, 0, t0ini++); - copyToDPU(dpu, (uint8_t*)dpuNodePtrs_h, dpuNodePtrs_m, (dpuNumNodes + 1)*sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t*)dpuNeighborIdxs_h, dpuNeighborIdxs_m, dpuNumNeighbors*sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t*)dpuNodeLevel_h, dpuNodeLevel_m, dpuNumNodes*sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t*)visited, dpuVisited_m, numNodes/64*sizeof(uint64_t)); - copyToDPU(dpu, (uint8_t*)nextFrontier, dpuNextFrontier_m, numNodes/64*sizeof(uint64_t)); - // NOTE: No need to copy current frontier because it is written before being read - stopTimer(&timer, 0); - //loadTime += getElapsedTime(timer); - - } - - // Send parameters to DPU - PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU"); - startTimer(&timer, 1, t1ini++); - copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams)); - stopTimer(&timer, 1); - //loadTime += getElapsedTime(timer); - - ++dpuIdx; - - } - - // Iterate until next frontier is empty - uint32_t 
nextFrontierEmpty = 0; - while(!nextFrontierEmpty) { - - PRINT_INFO(p.verbosity >= 1, "Processing current frontier for level %u", level); - - #if ENERGY - DPU_ASSERT(dpu_probe_start(&probe)); - #endif - // Run all DPUs - PRINT_INFO(p.verbosity >= 1, " Booting DPUs"); - startTimer(&timer, 2, t2ini++); - DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - stopTimer(&timer, 2); - //dpuTime += getElapsedTime(timer); - #if ENERGY - DPU_ASSERT(dpu_probe_stop(&probe)); - double energy; - DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); - tenergy += energy; - #endif - - - - // Copy back next frontier from all DPUs and compute their union as the current frontier - startTimer(&timer, 3, t3ini++); - dpuIdx = 0; - DPU_FOREACH (dpu_set, dpu) { - uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; - if(dpuNumNodes > 0) { - if(dpuIdx == 0) { - copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)currentFrontier, numNodes/64*sizeof(uint64_t)); - } else { - copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)nextFrontier, numNodes/64*sizeof(uint64_t)); - for(uint32_t i = 0; i < numNodes/64; ++i) { - currentFrontier[i] |= nextFrontier[i]; - } - } - ++dpuIdx; - } - } - - // Check if the next frontier is empty, and copy data to DPU if not empty - nextFrontierEmpty = 1; - for(uint32_t i = 0; i < numNodes/64; ++i) { - if(currentFrontier[i]) { - nextFrontierEmpty = 0; - break; - } - } - if(!nextFrontierEmpty) { - ++level; - dpuIdx = 0; - DPU_FOREACH (dpu_set, dpu) { - uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; - if(dpuNumNodes > 0) { - // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier) - copyToDPU(dpu, (uint8_t*)currentFrontier, dpuParams[dpuIdx].dpuNextFrontier_m, numNodes/64*sizeof(uint64_t)); - // Copy new level to DPU - dpuParams[dpuIdx].level = level; - copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams)); - ++dpuIdx; - } 
- } - } - stopTimer(&timer, 3); - //hostTime += getElapsedTime(timer); - - } - - // Copy back node levels - PRINT_INFO(p.verbosity >= 1, "Copying back the result"); - startTimer(&timer, 4, 0); - dpuIdx = 0; - DPU_FOREACH (dpu_set, dpu) { - uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; - if(dpuNumNodes > 0) { - uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU; - copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m, (uint8_t*)(nodeLevel + dpuStartNodeIdx), dpuNumNodes*sizeof(float)); - } - ++dpuIdx; - } - stopTimer(&timer, 4); - //retrieveTime += getElapsedTime(timer); - //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3); - - // Calculating result on CPU - PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); - uint32_t* nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) - memset(nextFrontier, 0, numNodes/64*sizeof(uint64_t)); - setBit(nextFrontier[0], 0); // Initialize frontier to first node - nextFrontierEmpty = 0; - level = 1; - while(!nextFrontierEmpty) { - // Update current frontier and visited list based on the next frontier from the previous iteration - for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) { - uint64_t nextFrontierTile = nextFrontier[nodeTileIdx]; - currentFrontier[nodeTileIdx] = nextFrontierTile; - if(nextFrontierTile) { - visited[nodeTileIdx] |= nextFrontierTile; - nextFrontier[nodeTileIdx] = 0; - for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) { - if(isSet(nextFrontierTile, node%64)) { - nodeLevelReference[node] = level; - } - } - } - } - // Visit neighbors of the current frontier - nextFrontierEmpty = 1; - for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) { - uint64_t currentFrontierTile = currentFrontier[nodeTileIdx]; - if(currentFrontierTile) { - for(uint32_t node = 
nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) { - if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier - // Visit its neighbors - uint32_t nodePtr = nodePtrs[node]; - uint32_t nextNodePtr = nodePtrs[node + 1]; - for(uint32_t i = nodePtr; i < nextNodePtr; ++i) { - uint32_t neighbor = neighborIdxs[i]; - if(!isSet(visited[neighbor/64], neighbor%64)) { // Neighbor not previously visited - // Add neighbor to next frontier - setBit(nextFrontier[neighbor/64], neighbor%64); - nextFrontierEmpty = 0; - } - } - } - } - } - } - ++level; - } - - // Verify the result - PRINT_INFO(p.verbosity >= 1, "Verifying the result"); - int status = 1; - for(uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) { - if(nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) { - PRINT_ERROR("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", nodeIdx, nodeLevelReference[nodeIdx], nodeLevel[nodeIdx]); - status = 0; - } - } - - if (status) { - printf("[::] BFS NMC | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d " - "| throughput_pim_MBps=%f throughput_MBps=%f", - numDPUs, NR_TASKLETS, "uint32_t", numNodes, - numNodes * sizeof(uint32_t) / (timer.time[2] + timer.time[3]), - numNodes * sizeof(uint32_t) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4])); - printf(" throughput_pim_MOpps=%f throughput_MOpps=%f", - numNodes / (timer.time[2] + timer.time[3]), - numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4])); - printAll(&timer, 4); - } - - // Display DPU Logs - if(p.verbosity >= 2) { - PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:"); - dpuIdx = 0; - DPU_FOREACH (dpu_set, dpu) { - PRINT("DPU %u:", dpuIdx); - DPU_ASSERT(dpu_log_read(dpu, stdout)); - ++dpuIdx; - } - } - - // Deallocate data structures - freeCOOGraph(cooGraph); - freeCSRGraph(csrGraph); - free(nodeLevel); - free(visited); - free(currentFrontier); - free(nextFrontier); - free(nodeLevelReference); - - return 0; 
+int main(int argc, char **argv) +{ -} + // Process parameters + struct Params p = input_params(argc, argv); + + // Timer and profiling + Timer timer; +#if ENERGY + struct dpu_probe_t probe; + DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); + double tenergy = 0; +#endif + + printf + ("WITH_ALLOC_OVERHEAD=%d WITH_LOAD_OVERHEAD=%d WITH_FREE_OVERHEAD=%d\n", + WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD); + + // Allocate DPUs and load binary + struct dpu_set_t dpu_set, dpu; + uint32_t numDPUs, numRanks; + +#if WITH_ALLOC_OVERHEAD + startTimer(&timer, 0, 0); +#endif + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); +#if WITH_ALLOC_OVERHEAD + stopTimer(&timer, 0); +#else + timer.time[0] = 0; +#endif + +#if WITH_LOAD_OVERHEAD + startTimer(&timer, 1, 0); +#endif + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); +#if WITH_LOAD_OVERHEAD + stopTimer(&timer, 1); // stop slot 1 (load), matching the startTimer index above; slot 0 is the alloc timer +#else + timer.time[1] = 0; +#endif + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks)); + assert(NR_DPUS == numDPUs); + PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs); + + // Initialize BFS data structures + PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName); + struct COOGraph cooGraph = readCOOGraph(p.fileName); + PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", + cooGraph.numNodes, cooGraph.numEdges); + struct CSRGraph csrGraph = coo2csr(cooGraph); + uint32_t numNodes = csrGraph.numNodes; + uint32_t *nodePtrs = csrGraph.nodePtrs; + uint32_t *neighborIdxs = csrGraph.neighborIdxs; + uint32_t *nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) + uint64_t *visited = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node + uint64_t *currentFrontier = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node + uint64_t *nextFrontier = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node + 
setBit(nextFrontier[0], 0); // Initialize frontier to first node + uint32_t level = 1; + + // Partition data structure across DPUs + uint32_t numNodesPerDPU = + ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1) / numDPUs + 1); + PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU", + numNodesPerDPU); + struct DPUParams dpuParams[numDPUs]; + uint32_t dpuParams_m[numDPUs]; + unsigned int dpuIdx = 0; + unsigned int t0ini = 0; + unsigned int t1ini = 0; + unsigned int t2ini = 0; + unsigned int t3ini = 0; + DPU_FOREACH(dpu_set, dpu) { + + // Allocate parameters + struct mram_heap_allocator_t allocator; + init_allocator(&allocator); + dpuParams_m[dpuIdx] = + mram_heap_alloc(&allocator, sizeof(struct DPUParams)); + + // Find DPU's nodes + uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU; + uint32_t dpuNumNodes; + if (dpuStartNodeIdx > numNodes) { + dpuNumNodes = 0; + } else if (dpuStartNodeIdx + numNodesPerDPU > numNodes) { + dpuNumNodes = numNodes - dpuStartNodeIdx; + } else { + dpuNumNodes = numNodesPerDPU; + } + dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes; + PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx); + PRINT_INFO(p.verbosity >= 2, " Receives %u nodes", + dpuNumNodes); + + // Partition edges and copy data + if (dpuNumNodes > 0) { + + // Find DPU's CSR graph partition + uint32_t *dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx]; + uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0]; + uint32_t *dpuNeighborIdxs_h = + neighborIdxs + dpuNodePtrsOffset; + uint32_t dpuNumNeighbors = + dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset; + uint32_t *dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx]; + + // Allocate MRAM + uint32_t dpuNodePtrs_m = + mram_heap_alloc(&allocator, + (dpuNumNodes + + 1) * sizeof(uint32_t)); + uint32_t dpuNeighborIdxs_m = + mram_heap_alloc(&allocator, + dpuNumNeighbors * sizeof(uint32_t)); + uint32_t dpuNodeLevel_m = + mram_heap_alloc(&allocator, + dpuNumNodes * sizeof(uint32_t)); + uint32_t dpuVisited_m = + mram_heap_alloc(&allocator, + numNodes / 64 * 
sizeof(uint64_t)); + uint32_t dpuCurrentFrontier_m = + mram_heap_alloc(&allocator, + dpuNumNodes / 64 * + sizeof(uint64_t)); + uint32_t dpuNextFrontier_m = + mram_heap_alloc(&allocator, + numNodes / 64 * sizeof(uint64_t)); + PRINT_INFO(p.verbosity >= 2, + " Total memory allocated is %d bytes", + allocator.totalAllocated); + + // Set up DPU parameters + dpuParams[dpuIdx].numNodes = numNodes; + dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx; + dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset; + dpuParams[dpuIdx].level = level; + dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m; + dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m; + dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m; + dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m; + dpuParams[dpuIdx].dpuCurrentFrontier_m = + dpuCurrentFrontier_m; + dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m; + + // Send data to DPU + PRINT_INFO(p.verbosity >= 2, + " Copying data to DPU"); + startTimer(&timer, 2, t0ini++); + copyToDPU(dpu, (uint8_t *) dpuNodePtrs_h, dpuNodePtrs_m, + (dpuNumNodes + 1) * sizeof(uint32_t)); + copyToDPU(dpu, (uint8_t *) dpuNeighborIdxs_h, + dpuNeighborIdxs_m, + dpuNumNeighbors * sizeof(uint32_t)); + copyToDPU(dpu, (uint8_t *) dpuNodeLevel_h, + dpuNodeLevel_m, + dpuNumNodes * sizeof(uint32_t)); + copyToDPU(dpu, (uint8_t *) visited, dpuVisited_m, + numNodes / 64 * sizeof(uint64_t)); + copyToDPU(dpu, (uint8_t *) nextFrontier, + dpuNextFrontier_m, + numNodes / 64 * sizeof(uint64_t)); + // NOTE: No need to copy current frontier because it is written before being read + stopTimer(&timer, 2); + //loadTime += getElapsedTime(timer); + + } + // Send parameters to DPU + PRINT_INFO(p.verbosity >= 2, + " Copying parameters to DPU"); + startTimer(&timer, 2, t1ini++); + copyToDPU(dpu, (uint8_t *) & dpuParams[dpuIdx], + dpuParams_m[dpuIdx], sizeof(struct DPUParams)); + stopTimer(&timer, 2); + //loadTime += getElapsedTime(timer); + + ++dpuIdx; + + } + + // Iterate until next frontier is empty 
+ uint32_t nextFrontierEmpty = 0; + while (!nextFrontierEmpty) { + + PRINT_INFO(p.verbosity >= 1, + "Processing current frontier for level %u", level); + +#if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); +#endif + // Run all DPUs + PRINT_INFO(p.verbosity >= 1, " Booting DPUs"); + startTimer(&timer, 3, t2ini++); + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + stopTimer(&timer, 3); + //dpuTime += getElapsedTime(timer); +#if ENERGY + DPU_ASSERT(dpu_probe_stop(&probe)); + double energy; + DPU_ASSERT(dpu_probe_get + (&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); + tenergy += energy; +#endif + + // Copy back next frontier from all DPUs and compute their union as the current frontier + startTimer(&timer, 4, t3ini++); + dpuIdx = 0; + DPU_FOREACH(dpu_set, dpu) { + uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; + if (dpuNumNodes > 0) { + if (dpuIdx == 0) { + copyFromDPU(dpu, + dpuParams[dpuIdx]. + dpuNextFrontier_m, + (uint8_t *) currentFrontier, + numNodes / 64 * + sizeof(uint64_t)); + } else { + copyFromDPU(dpu, + dpuParams[dpuIdx]. + dpuNextFrontier_m, + (uint8_t *) nextFrontier, + numNodes / 64 * + sizeof(uint64_t)); + for (uint32_t i = 0; i < numNodes / 64; + ++i) { + currentFrontier[i] |= + nextFrontier[i]; + } + } + ++dpuIdx; + } + } + + // Check if the next frontier is empty, and copy data to DPU if not empty + nextFrontierEmpty = 1; + for (uint32_t i = 0; i < numNodes / 64; ++i) { + if (currentFrontier[i]) { + nextFrontierEmpty = 0; + break; + } + } + if (!nextFrontierEmpty) { + ++level; + dpuIdx = 0; + DPU_FOREACH(dpu_set, dpu) { + uint32_t dpuNumNodes = + dpuParams[dpuIdx].dpuNumNodes; + if (dpuNumNodes > 0) { + // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier) + copyToDPU(dpu, + (uint8_t *) currentFrontier, + dpuParams[dpuIdx]. 
+ dpuNextFrontier_m, + numNodes / 64 * + sizeof(uint64_t)); + // Copy new level to DPU + dpuParams[dpuIdx].level = level; + copyToDPU(dpu, + (uint8_t *) & + dpuParams[dpuIdx], + dpuParams_m[dpuIdx], + sizeof(struct DPUParams)); + ++dpuIdx; + } + } + } + stopTimer(&timer, 4); + //hostTime += getElapsedTime(timer); + + } + + // Copy back node levels + PRINT_INFO(p.verbosity >= 1, "Copying back the result"); + startTimer(&timer, 5, 0); + dpuIdx = 0; + DPU_FOREACH(dpu_set, dpu) { + uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; + if (dpuNumNodes > 0) { + uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU; + copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m, + (uint8_t *) (nodeLevel + dpuStartNodeIdx), + dpuNumNodes * sizeof(float)); + } + ++dpuIdx; + } + stopTimer(&timer, 5); + //retrieveTime += getElapsedTime(timer); + //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3); + + // Calculating result on CPU + PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); + uint32_t *nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) + memset(nextFrontier, 0, numNodes / 64 * sizeof(uint64_t)); + setBit(nextFrontier[0], 0); // Initialize frontier to first node + nextFrontierEmpty = 0; + level = 1; + startTimer(&timer, 6, 0); + while (!nextFrontierEmpty) { + // Update current frontier and visited list based on the next frontier from the previous iteration + for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64; + ++nodeTileIdx) { + uint64_t nextFrontierTile = nextFrontier[nodeTileIdx]; + currentFrontier[nodeTileIdx] = nextFrontierTile; + if (nextFrontierTile) { + visited[nodeTileIdx] |= nextFrontierTile; + nextFrontier[nodeTileIdx] = 0; + for (uint32_t node = nodeTileIdx * 64; + node < (nodeTileIdx + 1) * 64; ++node) { + if (isSet(nextFrontierTile, node % 64)) { + 
nodeLevelReference[node] = + level; + } + } + } + } + // Visit neighbors of the current frontier + nextFrontierEmpty = 1; + for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64; + ++nodeTileIdx) { + uint64_t currentFrontierTile = + currentFrontier[nodeTileIdx]; + if (currentFrontierTile) { + for (uint32_t node = nodeTileIdx * 64; + node < (nodeTileIdx + 1) * 64; ++node) { + if (isSet(currentFrontierTile, node % 64)) { // If the node is in the current frontier + // Visit its neighbors + uint32_t nodePtr = + nodePtrs[node]; + uint32_t nextNodePtr = + nodePtrs[node + 1]; + for (uint32_t i = nodePtr; + i < nextNodePtr; ++i) { + uint32_t neighbor = + neighborIdxs[i]; + if (!isSet(visited[neighbor / 64], neighbor % 64)) { // Neighbor not previously visited + // Add neighbor to next frontier + setBit + (nextFrontier + [neighbor / + 64], + neighbor % + 64); + nextFrontierEmpty + = 0; + } + } + } + } + } + } + ++level; + } + stopTimer(&timer, 6); + +#if WITH_FREE_OVERHEAD + startTimer(&timer, 7); +#endif + DPU_ASSERT(dpu_free(dpu_set)); +#if WITH_FREE_OVERHEAD + stopTimer(&timer, 7); +#else + timer.time[7] = 0; +#endif + + // Verify the result + PRINT_INFO(p.verbosity >= 1, "Verifying the result"); + int status = 1; + for (uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) { + if (nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) { + PRINT_ERROR + ("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", + nodeIdx, nodeLevelReference[nodeIdx], + nodeLevel[nodeIdx]); + status = 0; + } + } + + if (status) { + printf + ("[::] BFS-UMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d " + "| throughput_pim_MBps=%f throughput_MBps=%f", numDPUs, + NR_TASKLETS, "uint32_t", numNodes, + numNodes * sizeof(uint32_t) / (timer.time[2] + + timer.time[3]), + numNodes * sizeof(uint32_t) / (timer.time[0] + + timer.time[1] + + timer.time[2] + + timer.time[3] + + timer.time[4])); + printf(" throughput_pim_MOpps=%f throughput_MOpps=%f", + numNodes / 
(timer.time[2] + timer.time[3]), + numNodes / (timer.time[0] + timer.time[1] + + timer.time[2] + timer.time[3] + + timer.time[4])); + printf + (" latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_cpu_us=%f latency_free_us=%f\n", + timer.time[0], timer.time[1], timer.time[2], timer.time[3], + timer.time[4], timer.time[5], timer.time[6], + timer.time[7]); + } + // Display DPU Logs + if (p.verbosity >= 2) { + PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:"); + dpuIdx = 0; + DPU_FOREACH(dpu_set, dpu) { + PRINT("DPU %u:", dpuIdx); + DPU_ASSERT(dpu_log_read(dpu, stdout)); + ++dpuIdx; + } + } + // Deallocate data structures + freeCOOGraph(cooGraph); + freeCSRGraph(csrGraph); + free(nodeLevel); + free(visited); + free(currentFrontier); + free(nextFrontier); + free(nodeLevelReference); + + return 0; + +} diff --git a/BFS/host/mram-management.h b/BFS/host/mram-management.h index 627dfde..f2ee031 100644 --- a/BFS/host/mram-management.h +++ b/BFS/host/mram-management.h @@ -5,33 +5,45 @@ #include "../support/common.h" #include "../support/utils.h" -#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB +#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB struct mram_heap_allocator_t { - uint32_t totalAllocated; + uint32_t totalAllocated; }; -static void init_allocator(struct mram_heap_allocator_t* allocator) { - allocator->totalAllocated = 0; +static void init_allocator(struct mram_heap_allocator_t *allocator) +{ + allocator->totalAllocated = 0; } -static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) { - uint32_t ret = allocator->totalAllocated; - allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size); - if(allocator->totalAllocated > DPU_CAPACITY) { - PRINT_ERROR(" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY); - exit(0); - } - return ret; +static uint32_t 
mram_heap_alloc(struct mram_heap_allocator_t *allocator, + uint32_t size) +{ + uint32_t ret = allocator->totalAllocated; + allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size); + if (allocator->totalAllocated > DPU_CAPACITY) { + PRINT_ERROR + (" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", + allocator->totalAllocated, DPU_CAPACITY); + exit(0); + } + return ret; } -static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) { - DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size))); +static void copyToDPU(struct dpu_set_t dpu, uint8_t *hostPtr, uint32_t mramIdx, + uint32_t size) +{ + DPU_ASSERT(dpu_copy_to + (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, + ROUND_UP_TO_MULTIPLE_OF_8(size))); } -static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) { - DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size))); +static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, + uint8_t *hostPtr, uint32_t size) +{ + DPU_ASSERT(dpu_copy_from + (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, + ROUND_UP_TO_MULTIPLE_OF_8(size))); } #endif - diff --git a/BFS/support/common.h b/BFS/support/common.h index ced324c..5f2aa0d 100644 --- a/BFS/support/common.h +++ b/BFS/support/common.h @@ -9,18 +9,17 @@ #define isSet(val, idx) ((val) & (1 << (idx))) struct DPUParams { - uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */ - uint32_t numNodes; /* Total number of nodes in the graph */ - uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU */ - uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */ - uint32_t level; /* The current BFS level */ - uint32_t dpuNodePtrs_m; - uint32_t dpuNeighborIdxs_m; - uint32_t dpuNodeLevel_m; - uint32_t dpuVisited_m; - uint32_t dpuCurrentFrontier_m; - uint32_t 
dpuNextFrontier_m; + uint32_t dpuNumNodes; /* The number of nodes assigned to this DPU */ + uint32_t numNodes; /* Total number of nodes in the graph */ + uint32_t dpuStartNodeIdx; /* The index of the first node assigned to this DPU */ + uint32_t dpuNodePtrsOffset; /* Offset of the node pointers */ + uint32_t level; /* The current BFS level */ + uint32_t dpuNodePtrs_m; + uint32_t dpuNeighborIdxs_m; + uint32_t dpuNodeLevel_m; + uint32_t dpuVisited_m; + uint32_t dpuCurrentFrontier_m; + uint32_t dpuNextFrontier_m; }; #endif - diff --git a/BFS/support/graph.h b/BFS/support/graph.h index f89ff5c..2a19f67 100644 --- a/BFS/support/graph.h +++ b/BFS/support/graph.h @@ -9,108 +9,125 @@ #include "utils.h" struct COOGraph { - uint32_t numNodes; - uint32_t numEdges; - uint32_t* nodeIdxs; - uint32_t* neighborIdxs; + uint32_t numNodes; + uint32_t numEdges; + uint32_t *nodeIdxs; + uint32_t *neighborIdxs; }; struct CSRGraph { - uint32_t numNodes; - uint32_t numEdges; - uint32_t* nodePtrs; - uint32_t* neighborIdxs; + uint32_t numNodes; + uint32_t numEdges; + uint32_t *nodePtrs; + uint32_t *neighborIdxs; }; -static struct COOGraph readCOOGraph(const char* fileName) { - - struct COOGraph cooGraph; - - // Initialize fields - FILE* fp = fopen(fileName, "r"); - uint32_t numNodes, numCols; - assert(fscanf(fp, "%u", &numNodes)); - assert(fscanf(fp, "%u", &numCols)); - if(numNodes == numCols) { - cooGraph.numNodes = numNodes; - } else { - PRINT_WARNING(" Adjacency matrix is not square. Padding matrix to be square."); - cooGraph.numNodes = (numNodes > numCols)? 
numNodes : numCols; - } - if(cooGraph.numNodes%64 != 0) { - PRINT_WARNING(" Adjacency matrix dimension is %u which is not a multiple of 64 nodes.", cooGraph.numNodes); - cooGraph.numNodes += (64 - cooGraph.numNodes%64); - PRINT_WARNING(" Padding to %u which is a multiple of 64 nodes.", cooGraph.numNodes); - } - assert(fscanf(fp, "%u", &cooGraph.numEdges)); - cooGraph.nodeIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t)); - cooGraph.neighborIdxs = (uint32_t*) malloc(cooGraph.numEdges*sizeof(uint32_t)); - - // Read the neighborIdxs - for(uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) { - uint32_t nodeIdx; - assert(fscanf(fp, "%u", &nodeIdx)); - cooGraph.nodeIdxs[edgeIdx] = nodeIdx; - uint32_t neighborIdx; - assert(fscanf(fp, "%u", &neighborIdx)); - cooGraph.neighborIdxs[edgeIdx] = neighborIdx; - } - - return cooGraph; +static struct COOGraph readCOOGraph(const char *fileName) +{ + + struct COOGraph cooGraph; + + // Initialize fields + FILE *fp = fopen(fileName, "r"); + uint32_t numNodes, numCols; + assert(fscanf(fp, "%u", &numNodes)); + assert(fscanf(fp, "%u", &numCols)); + if (numNodes == numCols) { + cooGraph.numNodes = numNodes; + } else { + PRINT_WARNING + (" Adjacency matrix is not square. Padding matrix to be square."); + cooGraph.numNodes = (numNodes > numCols) ? 
numNodes : numCols; + } + if (cooGraph.numNodes % 64 != 0) { + PRINT_WARNING + (" Adjacency matrix dimension is %u which is not a multiple of 64 nodes.", + cooGraph.numNodes); + cooGraph.numNodes += (64 - cooGraph.numNodes % 64); + PRINT_WARNING + (" Padding to %u which is a multiple of 64 nodes.", + cooGraph.numNodes); + } + assert(fscanf(fp, "%u", &cooGraph.numEdges)); + cooGraph.nodeIdxs = + (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t)); + cooGraph.neighborIdxs = + (uint32_t *) malloc(cooGraph.numEdges * sizeof(uint32_t)); + + // Read the neighborIdxs + for (uint32_t edgeIdx = 0; edgeIdx < cooGraph.numEdges; ++edgeIdx) { + uint32_t nodeIdx; + assert(fscanf(fp, "%u", &nodeIdx)); + cooGraph.nodeIdxs[edgeIdx] = nodeIdx; + uint32_t neighborIdx; + assert(fscanf(fp, "%u", &neighborIdx)); + cooGraph.neighborIdxs[edgeIdx] = neighborIdx; + } + + return cooGraph; } -static void freeCOOGraph(struct COOGraph cooGraph) { - free(cooGraph.nodeIdxs); - free(cooGraph.neighborIdxs); +static void freeCOOGraph(struct COOGraph cooGraph) +{ + free(cooGraph.nodeIdxs); + free(cooGraph.neighborIdxs); } -static struct CSRGraph coo2csr(struct COOGraph cooGraph) { - - struct CSRGraph csrGraph; - - // Initialize fields - csrGraph.numNodes = cooGraph.numNodes; - csrGraph.numEdges = cooGraph.numEdges; - csrGraph.nodePtrs = (uint32_t*) calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1), sizeof(uint32_t)); - csrGraph.neighborIdxs = (uint32_t*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(csrGraph.numEdges*sizeof(uint32_t))); - - // Histogram nodeIdxs - for(uint32_t i = 0; i < cooGraph.numEdges; ++i) { - uint32_t nodeIdx = cooGraph.nodeIdxs[i]; - csrGraph.nodePtrs[nodeIdx]++; - } - - // Prefix sum nodePtrs - uint32_t sumBeforeNextNode = 0; - for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) { - uint32_t sumBeforeNode = sumBeforeNextNode; - sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx]; - csrGraph.nodePtrs[nodeIdx] = sumBeforeNode; - } - csrGraph.nodePtrs[csrGraph.numNodes] 
= sumBeforeNextNode; - - // Bin the neighborIdxs - for(uint32_t i = 0; i < cooGraph.numEdges; ++i) { - uint32_t nodeIdx = cooGraph.nodeIdxs[i]; - uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++; - csrGraph.neighborIdxs[neighborListIdx] = cooGraph.neighborIdxs[i]; - } - - // Restore nodePtrs - for(uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) { - csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1]; - } - csrGraph.nodePtrs[0] = 0; - - return csrGraph; +static struct CSRGraph coo2csr(struct COOGraph cooGraph) +{ + + struct CSRGraph csrGraph; + + // Initialize fields + csrGraph.numNodes = cooGraph.numNodes; + csrGraph.numEdges = cooGraph.numEdges; + csrGraph.nodePtrs = + (uint32_t *) + calloc(ROUND_UP_TO_MULTIPLE_OF_2(csrGraph.numNodes + 1), + sizeof(uint32_t)); + csrGraph.neighborIdxs = + (uint32_t *) + malloc(ROUND_UP_TO_MULTIPLE_OF_8 + (csrGraph.numEdges * sizeof(uint32_t))); + + // Histogram nodeIdxs + for (uint32_t i = 0; i < cooGraph.numEdges; ++i) { + uint32_t nodeIdx = cooGraph.nodeIdxs[i]; + csrGraph.nodePtrs[nodeIdx]++; + } + + // Prefix sum nodePtrs + uint32_t sumBeforeNextNode = 0; + for (uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) { + uint32_t sumBeforeNode = sumBeforeNextNode; + sumBeforeNextNode += csrGraph.nodePtrs[nodeIdx]; + csrGraph.nodePtrs[nodeIdx] = sumBeforeNode; + } + csrGraph.nodePtrs[csrGraph.numNodes] = sumBeforeNextNode; + + // Bin the neighborIdxs + for (uint32_t i = 0; i < cooGraph.numEdges; ++i) { + uint32_t nodeIdx = cooGraph.nodeIdxs[i]; + uint32_t neighborListIdx = csrGraph.nodePtrs[nodeIdx]++; + csrGraph.neighborIdxs[neighborListIdx] = + cooGraph.neighborIdxs[i]; + } + + // Restore nodePtrs + for (uint32_t nodeIdx = csrGraph.numNodes - 1; nodeIdx > 0; --nodeIdx) { + csrGraph.nodePtrs[nodeIdx] = csrGraph.nodePtrs[nodeIdx - 1]; + } + csrGraph.nodePtrs[0] = 0; + + return csrGraph; } -static void freeCSRGraph(struct CSRGraph csrGraph) { - free(csrGraph.nodePtrs); - 
free(csrGraph.neighborIdxs); +static void freeCSRGraph(struct CSRGraph csrGraph) +{ + free(csrGraph.nodePtrs); + free(csrGraph.neighborIdxs); } #endif - diff --git a/BFS/support/params.h b/BFS/support/params.h index f4f12e7..f9169bc 100644 --- a/BFS/support/params.h +++ b/BFS/support/params.h @@ -5,42 +5,63 @@ #include "common.h" #include "utils.h" -static void usage() { - PRINT( "\nUsage: ./program [options]" - "\n" - "\nBenchmark-specific options:" - "\n -f <F> input matrix file name (default=data/roadNet-CA.txt)" - "\n" - "\nGeneral options:" - "\n -v <V> verbosity" - "\n -h help" - "\n\n"); +static void usage() +{ + PRINT("\nUsage: ./program [options]" + "\n" + "\nBenchmark-specific options:" + "\n -f <F> input matrix file name (default=data/roadNet-CA.txt)" + "\n" + "\nGeneral options:" + "\n -v <V> verbosity" "\n -h help" "\n\n"); } typedef struct Params { - const char* fileName; - unsigned int verbosity; + const char *fileName; + unsigned int verbosity; +#if NUMA + struct bitmask *bitmask_in; + int numa_node_cpu; +#endif } Params; -static struct Params input_params(int argc, char **argv) { - struct Params p; - p.fileName = "data/roadNet-CA.txt"; - p.verbosity = 0; - int opt; - while((opt = getopt(argc, argv, "f:v:h")) >= 0) { - switch(opt) { - case 'f': p.fileName = optarg; break; - case 'v': p.verbosity = atoi(optarg); break; - case 'h': usage(); exit(0); - default: - PRINT_ERROR("Unrecognized option!"); - usage(); - exit(0); - } - } +static struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.fileName = "data/roadNet-CA.txt"; + p.verbosity = 0; +#if NUMA + p.bitmask_in = NULL; + p.numa_node_cpu = -1; +#endif + int opt; + while ((opt = getopt(argc, argv, "f:v:hA:C:")) >= 0) { + switch (opt) { + case 'f': + p.fileName = optarg; + break; + case 'v': + p.verbosity = atoi(optarg); + break; +#if NUMA + case 'A': + p.bitmask_in = numa_parse_nodestring(optarg); + break; + case 'C': + p.numa_node_cpu = atoi(optarg); + break; +#endif + case 
'h': + usage(); + exit(0); + default: + PRINT_ERROR("Unrecognized option!"); + usage(); + exit(0); + } + } - return p; + return p; } #endif - diff --git a/BFS/support/timer.h b/BFS/support/timer.h index 80719cf..63b5567 100644 --- a/BFS/support/timer.h +++ b/BFS/support/timer.h @@ -6,29 +6,26 @@ #include <sys/time.h> typedef struct Timer { - struct timeval startTime[5]; - struct timeval stopTime[5]; - double time[5]; + struct timeval startTime[8]; + struct timeval stopTime[8]; + double time[8]; } Timer; -static void startTimer(Timer *timer, int i, int rep) { - if(rep == 0) { - timer->time[i] = 0.0; - } - gettimeofday(&timer->startTime[i], NULL); +static void startTimer(Timer *timer, int i, int rep) +{ + if (rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); } -static void stopTimer(Timer *timer, int i) { - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); -} - -static void printAll(Timer *timer, int maxt) { - for (int i = 0; i <= maxt; i++) { - printf(" timer%d_us=%f", i, timer->time[i]); - } - printf("\n"); +static void stopTimer(Timer *timer, int i) +{ + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += + (timer->stopTime[i].tv_sec - + timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); } #endif diff --git a/BFS/support/utils.h b/BFS/support/utils.h index ddb1e2c..ccd8fbd 100644 --- a/BFS/support/utils.h +++ b/BFS/support/utils.h @@ -8,4 +8,3 @@ #define PRINT(fmt, ...) 
printf(fmt "\n", ##__VA_ARGS__) #endif - diff --git a/BS/baselines/cpu/Makefile b/BS/baselines/cpu/Makefile index b67602f..4c30f65 100644 --- a/BS/baselines/cpu/Makefile +++ b/BS/baselines/cpu/Makefile @@ -1,16 +1,30 @@ -NUMA ?= 0 -NUMA_MEMCPY ?= 0 -FLAGS = +benchmark ?= 1 +debug ?= 0 +native ?= 1 +nop_sync ?= 0 +numa ?= 0 +numa_memcpy ?= 0 + +CFLAGS = +LDFLAGS = + +ifeq (${debug}, 1) + CFLAGS += -g +endif + +ifeq (${native}, 1) + CFLAGS += -march=native +endif -ifeq (${NUMA}, 1) - FLAGS += -lnuma +ifeq (${numa}, 1) + LDFLAGS += -lnuma endif .PHONY: all all: bs_omp bs_omp: bs_omp.c - gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} bs_omp.c -o bs_omp -fopenmp ${FLAGS} + gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} bs_omp.c -o bs_omp -fopenmp ${LDFLAGS} bs_omp_O0: bs_omp.c gcc bs_omp.c -o bs_omp_O0 -fopenmp diff --git a/BS/baselines/cpu/bs_omp.c b/BS/baselines/cpu/bs_omp.c index 874299b..2e4c300 100644 --- a/BS/baselines/cpu/bs_omp.c +++ b/BS/baselines/cpu/bs_omp.c @@ -7,265 +7,286 @@ #include <assert.h> #include <time.h> #include <stdint.h> + +#if WITH_BENCHMARK #include "timer.h" +#else +#define start(...) +#define stop(...) 
+#endif #if NUMA #include <numaif.h> #include <numa.h> -void* mp_pages[1]; +void *mp_pages[1]; int mp_status[1]; int mp_nodes[1]; -struct bitmask* bitmask_in; +struct bitmask *bitmask_in; int numa_node_in = -1; int numa_node_cpu = -1; #endif - #if NUMA_MEMCPY -struct bitmask* bitmask_cpu; +struct bitmask *bitmask_cpu; int numa_node_cpu_memcpy = -1; int numa_node_local = -1; int numa_node_in_is_local = 0; #endif - #define DTYPE uint64_t /* * @brief creates a "test file" by filling a bufferwith values */ -void create_test_file(DTYPE * input, uint64_t nr_elements, DTYPE * querys, uint64_t n_querys) { +void create_test_file(DTYPE *input, uint64_t nr_elements, DTYPE *querys, + uint64_t n_querys) +{ - srand(time(NULL)); + srand(time(NULL)); - input[0] = 1; - for (uint64_t i = 1; i < nr_elements; i++) { - input[i] = input[i - 1] + (rand() % 10) + 1; - } + input[0] = 1; + for (uint64_t i = 1; i < nr_elements; i++) { + input[i] = input[i - 1] + (rand() % 10) + 1; + } - for(uint64_t i = 0; i < n_querys; i++) - { - querys[i] = input[rand() % (nr_elements - 2)]; - } + for (uint64_t i = 0; i < n_querys; i++) { + querys[i] = input[rand() % nr_elements]; + } } /** * @brief compute output in the host */ -uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigned n_querys) +uint64_t binarySearch(DTYPE *input, uint64_t input_size, DTYPE *querys, + unsigned n_querys) { uint64_t found = -1; uint64_t q, r, l, m; - - #pragma omp parallel for private(q,r,l,m) - for(q = 0; q < n_querys; q++) - { + +#pragma omp parallel for private(q,r,l,m) + for (q = 0; q < n_querys; q++) { l = 0; r = input_size; - while (l <= r) - { - m = l + (r - l) / 2; - - // Check if x is present at mid - if (input[m] == querys[q]) - { - found += m; + while (l <= r) { + m = l + (r - l) / 2; + + // Check if x is present at mid + if (input[m] == querys[q]) { + found += m; break; } - // If x greater, ignore left half - if (input[m] < querys[q]) - l = m + 1; + // If x greater, ignore left half + if 
(input[m] < querys[q]) + l = m + 1; - // If x is smaller, ignore right half + // If x is smaller, ignore right half else - r = m - 1; - + r = m - 1; + } - } + } - return found; + return found; } /** * @brief Main of the Host Application. */ - int main(int argc, char **argv) { - (void)argc; - Timer timer; - uint64_t input_size = atol(argv[1]); - uint64_t n_querys = atol(argv[2]); +int main(int argc, char **argv) +{ + (void)argc; +#if WITH_BENCHMARK + Timer timer; +#endif + uint64_t input_size = atol(argv[1]); + uint64_t n_querys = atol(argv[2]); #if NUMA - bitmask_in = numa_parse_nodestring(argv[3]); - numa_node_cpu = atoi(argv[4]); + bitmask_in = numa_parse_nodestring(argv[3]); + numa_node_cpu = atoi(argv[4]); #endif #if NUMA_MEMCPY - bitmask_cpu = numa_parse_nodestring(argv[5]); - numa_node_cpu_memcpy = atoi(argv[6]); + bitmask_cpu = numa_parse_nodestring(argv[5]); + numa_node_cpu_memcpy = atoi(argv[6]); #endif - printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys); + printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys); #if NUMA - if (bitmask_in) { - numa_set_membind(bitmask_in); - numa_free_nodemask(bitmask_in); - } - DTYPE * input = numa_alloc((input_size) * sizeof(DTYPE)); - DTYPE * querys = numa_alloc((n_querys) * sizeof(DTYPE)); + if (bitmask_in) { + numa_set_membind(bitmask_in); + numa_free_nodemask(bitmask_in); + } + DTYPE *input = numa_alloc((input_size) * sizeof(DTYPE)); + DTYPE *querys = numa_alloc((n_querys) * sizeof(DTYPE)); #else - DTYPE * input = malloc((input_size) * sizeof(DTYPE)); - DTYPE * querys = malloc((n_querys) * sizeof(DTYPE)); + DTYPE *input = malloc((input_size) * sizeof(DTYPE)); + DTYPE *querys = malloc((n_querys) * sizeof(DTYPE)); #endif #if NUMA #if NUMA_MEMCPY - if (bitmask_cpu) { - numa_set_membind(bitmask_cpu); - numa_free_nodemask(bitmask_cpu); - } + if (bitmask_cpu) { + numa_set_membind(bitmask_cpu); + numa_free_nodemask(bitmask_cpu); + } #else - struct bitmask *bitmask_all = 
numa_allocate_nodemask(); - numa_bitmask_setall(bitmask_all); - numa_set_membind(bitmask_all); - numa_free_nodemask(bitmask_all); -#endif // NUMA_MEMCPY + struct bitmask *bitmask_all = numa_allocate_nodemask(); + numa_bitmask_setall(bitmask_all); + numa_set_membind(bitmask_all); + numa_free_nodemask(bitmask_all); +#endif // NUMA_MEMCPY #endif - DTYPE result_host = -1; + DTYPE result_host = -1; - // Create an input file with arbitrary data. - create_test_file(input, input_size, querys, n_querys); + // Create an input file with arbitrary data. + create_test_file(input, input_size, querys, n_querys); #if NUMA - mp_pages[0] = input; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(A)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_in = mp_status[0]; - } - - if (numa_node_cpu != -1) { - if (numa_run_on_node(numa_node_cpu) == -1) { - perror("numa_run_on_node"); - numa_node_cpu = -1; - } - } + mp_pages[0] = input; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_in = mp_status[0]; + } + + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } #endif #if NUMA_MEMCPY - numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1; + numa_node_in_is_local = ((numa_node_cpu == numa_node_in) + || (numa_node_cpu + 8 == numa_node_in)) * 1; #endif #if NUMA_MEMCPY - DTYPE *input_local = input; - DTYPE *querys_local = querys; - start(&timer, 1, 0); - if (!numa_node_in_is_local) { - input_local = numa_alloc((input_size) * sizeof(DTYPE)); - querys_local = numa_alloc((n_querys) * sizeof(DTYPE)); - } - stop(&timer, 1); - if (!numa_node_in_is_local) { - if (numa_node_cpu_memcpy != -1) { - if (numa_run_on_node(numa_node_cpu_memcpy) == 
-1) { - perror("numa_run_on_node"); - numa_node_cpu_memcpy = -1; - } - } - } - start(&timer, 2, 0); - if (!numa_node_in_is_local) { - memcpy(input_local, input, input_size * sizeof(DTYPE)); - memcpy(querys_local, querys, n_querys * sizeof(DTYPE)); - } else { - input_local = input; - querys_local = querys; - } - stop(&timer, 2); - if (numa_node_cpu != -1) { - if (numa_run_on_node(numa_node_cpu) == -1) { - perror("numa_run_on_node"); - numa_node_cpu = -1; - } - } - mp_pages[0] = input_local; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(input_local)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_local = mp_status[0]; - } + DTYPE *input_local = input; + DTYPE *querys_local = querys; + start(&timer, 1, 0); + if (!numa_node_in_is_local) { + input_local = numa_alloc((input_size) * sizeof(DTYPE)); + querys_local = numa_alloc((n_querys) * sizeof(DTYPE)); + } + stop(&timer, 1); + if (!numa_node_in_is_local) { + if (numa_node_cpu_memcpy != -1) { + if (numa_run_on_node(numa_node_cpu_memcpy) == -1) { + perror("numa_run_on_node"); + numa_node_cpu_memcpy = -1; + } + } + } + start(&timer, 2, 0); + if (!numa_node_in_is_local) { + memcpy(input_local, input, input_size * sizeof(DTYPE)); + memcpy(querys_local, querys, n_querys * sizeof(DTYPE)); + } else { + input_local = input; + querys_local = querys; + } + stop(&timer, 2); + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } + mp_pages[0] = input_local; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(input_local)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_local = mp_status[0]; + } +#endif + +#if NOP_SYNC + for (int rep = 0; rep < 200000; rep++) { + asm volatile ("nop"::); + } #endif - start(&timer, 0, 0); + start(&timer, 0, 0); #if NUMA_MEMCPY - 
result_host = binarySearch(input_local, input_size - 1, querys_local, n_querys); + result_host = + binarySearch(input_local, input_size - 1, querys_local, n_querys); #else - result_host = binarySearch(input, input_size - 1, querys, n_querys); + result_host = binarySearch(input, input_size - 1, querys, n_querys); +#endif + stop(&timer, 0); + +#if NOP_SYNC + for (int rep = 0; rep < 200000; rep++) { + asm volatile ("nop"::); + } #endif - stop(&timer, 0); #if NUMA_MEMCPY - start(&timer, 3, 0); - if (!numa_node_in_is_local) { - numa_free(input_local, input_size * sizeof(DTYPE)); - numa_free(querys_local, n_querys * sizeof(DTYPE)); - } - stop(&timer, 3); + start(&timer, 3, 0); + if (!numa_node_in_is_local) { + numa_free(input_local, input_size * sizeof(DTYPE)); + numa_free(querys_local, n_querys * sizeof(DTYPE)); + } + stop(&timer, 3); #endif - unsigned int nr_threads = 0; + int status = (result_host); +#if WITH_BENCHMARK + unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic - nr_threads++; + nr_threads++; - int status = (result_host); - if (status) { + if (status) { #if NUMA_MEMCPY - printf("[::] BS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%lu" - " numa_node_in=%d numa_node_local=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d" - " | throughput_MBps=%f throughput_MOpps=%f" - " latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", - nr_threads, "uint64_t", input_size, - numa_node_in, numa_node_local, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), - n_querys * sizeof(DTYPE) / timer.time[0], n_querys / timer.time[0], - timer.time[0], timer.time[1], timer.time[2], timer.time[3], - timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]); + printf + ("[::] BS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%lu" + " numa_node_in=%d numa_node_local=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d" + " | throughput_MBps=%f 
throughput_MOpps=%f" + " latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", + nr_threads, "uint64_t", input_size, numa_node_in, + numa_node_local, numa_node_cpu, numa_node_cpu_memcpy, + numa_distance(numa_node_in, numa_node_cpu), + n_querys * sizeof(DTYPE) / timer.time[0], + n_querys / timer.time[0], timer.time[0], timer.time[1], + timer.time[2], timer.time[3], + timer.time[0] + timer.time[1] + timer.time[2] + + timer.time[3]); #else - printf("[::] BS-CPU | n_threads=%d e_type=%s n_elements=%lu" + printf("[::] BS-CPU | n_threads=%d e_type=%s n_elements=%lu" #if NUMA - " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d" + " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d" #endif - " | throughput_MBps=%f", - nr_threads, "uint64_t", input_size, + " | throughput_MBps=%f", + nr_threads, "uint64_t", input_size, #if NUMA - numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), + numa_node_in, numa_node_cpu, numa_distance(numa_node_in, + numa_node_cpu), #endif - n_querys * sizeof(DTYPE) / timer.time[0]); - printf(" throughput_MOpps=%f latency_us=%f\n", - n_querys / timer.time[0], timer.time[0]); + n_querys * sizeof(DTYPE) / timer.time[0]); + printf(" throughput_MOpps=%f latency_us=%f\n", + n_querys / timer.time[0], timer.time[0]); #endif - } else { - printf("[ERROR]\n"); - } + } else { + printf("[ERROR]\n"); + } +#endif // WITH_BENCHMARK #if NUMA - numa_free(input, input_size * sizeof(DTYPE)); - numa_free(querys, n_querys * sizeof(DTYPE)); + numa_free(input, input_size * sizeof(DTYPE)); + numa_free(querys, n_querys * sizeof(DTYPE)); #else - free(input); - free(querys); + free(input); + free(querys); #endif - - return status ? 0 : 1; + return status ? 
0 : 1; } - diff --git a/BS/baselines/cpu/run-perf.sh b/BS/baselines/cpu/run-perf.sh new file mode 100755 index 0000000..5b671e0 --- /dev/null +++ b/BS/baselines/cpu/run-perf.sh @@ -0,0 +1,6 @@ +#!/bin/zsh + +make -B numa=1 + +OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./bs_omp $((2**29)) 16777216 4 4 +OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./bs_omp $((2**29)) 16777216 4 4 diff --git a/BS/dimes-hetsim-hbm.sh b/BS/dimes-hetsim-hbm.sh index 4e1500d..4a775ae 100755 --- a/BS/dimes-hetsim-hbm.sh +++ b/BS/dimes-hetsim-hbm.sh @@ -1,7 +1,7 @@ #!/bin/bash cd baselines/cpu -make -B NUMA=1 +make -B numa=1 mkdir -p log/$(hostname) fn=log/$(hostname)/dimes-hetsim-hbm diff --git a/BS/dimes-hetsim-nmc.sh b/BS/dimes-hetsim-nmc.sh index 195334b..fa697bf 100755 --- a/BS/dimes-hetsim-nmc.sh +++ b/BS/dimes-hetsim-nmc.sh @@ -3,6 +3,8 @@ mkdir -p log/$(hostname) baselines/cpu/log/$(hostname) fn=log/$(hostname)/dimes-hetsim-nmc +source /opt/upmem/upmem-2024.1.0-Linux-x86_64/upmem_env.sh + # upstream DPU version uses 2048576 * uint64 ≈ 16 MiB (DPU max: 64 MiB) # upstream DPU version uses 2 queries input_size_upstream=2048576 @@ -11,6 +13,8 @@ num_queries_upstream=2 input_size_dpu=$(perl -E 'say 2 ** 22') num_queries_dpu=1048576 +# Make sure that num_queries > input_size! 
+ run_benchmark_nmc() { local "$@" set -e @@ -69,7 +73,7 @@ cd baselines/cpu ( -make -B NUMA=1 NUMA_MEMCPY=1 +make -B numa=1 numa_memcpy=1 echo "CPU single-node upstream-ref with memcpy, copy node == input node (1/6)" >&2 @@ -97,7 +101,7 @@ parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ :::+ cpu 0 1 \ ::: nr_threads 1 2 4 8 12 16 -make -B NUMA=1 +make -B numa=1 echo "CPU single-node upstream-ref (3/6)" >&2 diff --git a/BS/dpu/task.c b/BS/dpu/task.c index acf66f2..5881dd1 100644 --- a/BS/dpu/task.c +++ b/BS/dpu/task.c @@ -17,140 +17,168 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS; __host dpu_results_t DPU_RESULTS[NR_TASKLETS]; // Search -static DTYPE search(DTYPE *bufferA, DTYPE searching_for, size_t search_size) { - DTYPE found = -2; - if(bufferA[0] <= searching_for) - { - found = -1; - for (uint32_t i = 0; i < search_size / sizeof(DTYPE); i++){ - if(bufferA[i] == searching_for) - { - found = i; - break; - } - } - } - return found; +static DTYPE search(DTYPE *bufferA, DTYPE searching_for, size_t search_size) +{ + DTYPE found = -2; + if (bufferA[0] <= searching_for) { + found = -1; + for (uint32_t i = 0; i < search_size / sizeof(DTYPE); i++) { + if (bufferA[i] == searching_for) { + found = i; + break; + } + } + } + return found; } BARRIER_INIT(my_barrier, NR_TASKLETS); extern int main_kernel1(void); -int(*kernels[nr_kernels])(void) = {main_kernel1}; +int (*kernels[nr_kernels])(void) = { main_kernel1 }; -int main(void){ - // Kernel - return kernels[DPU_INPUT_ARGUMENTS.kernel](); +int main(void) +{ + // Kernel + return kernels[DPU_INPUT_ARGUMENTS.kernel] (); } // main_kernel1 -int main_kernel1() { - unsigned int tasklet_id = me(); - #if PRINT - printf("tasklet_id = %u\n", tasklet_id); - #endif - if(tasklet_id == 0){ - mem_reset(); // Reset the heap - } - // Barrier - barrier_wait(&my_barrier); - - DTYPE searching_for, found; - uint64_t input_size = DPU_INPUT_ARGUMENTS.input_size; - - // Address of the current processing block in MRAM - uint32_t 
start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER; - uint32_t start_mram_block_addr_aux = start_mram_block_addr_A; - uint32_t end_mram_block_addr_A = start_mram_block_addr_A + sizeof(DTYPE) * input_size; - uint32_t current_mram_block_addr_query = end_mram_block_addr_A + tasklet_id * (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS) * sizeof(DTYPE); - - // Initialize a local cache to store the MRAM block - DTYPE *cache_A = (DTYPE *) mem_alloc(BLOCK_SIZE); - DTYPE *cache_aux_A = (DTYPE *) mem_alloc(BLOCK_SIZE); - DTYPE *cache_aux_B = (DTYPE *) mem_alloc(BLOCK_SIZE); - - dpu_results_t *result = &DPU_RESULTS[tasklet_id]; - - for(uint64_t targets = 0; targets < (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS); targets++) - { - found = -1; - - mram_read((__mram_ptr void const *) current_mram_block_addr_query, &searching_for, 8); - current_mram_block_addr_query += 8; - - // Initialize input vector boundaries - start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER; - start_mram_block_addr_aux = start_mram_block_addr_A; - end_mram_block_addr_A = start_mram_block_addr_A + sizeof(DTYPE) * input_size; - - uint32_t current_mram_block_addr_A = start_mram_block_addr_A; - - // Bring first and last values to WRAM - mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_aux_A, BLOCK_SIZE); - mram_read((__mram_ptr void const *) (end_mram_block_addr_A - BLOCK_SIZE * sizeof(DTYPE)), cache_aux_B, BLOCK_SIZE); - - while(1) - { - // Locate the address of the mid mram block - current_mram_block_addr_A = (start_mram_block_addr_A + end_mram_block_addr_A) / 2; - current_mram_block_addr_A &= WORD_MASK; - - // Boundary check - if(current_mram_block_addr_A < (start_mram_block_addr_A + BLOCK_SIZE)) - { - // Search inside (start_mram_block_addr_A, start_mram_block_addr_A + BLOCK_SIZE) - mram_read((__mram_ptr void const *) start_mram_block_addr_A, cache_A, BLOCK_SIZE); - found = search(cache_A, searching_for, BLOCK_SIZE); - - if(found > -1) - { - result->found = found + 
(start_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); - } - // Search inside (start_mram_block_addr_A + BLOCK_SIZE, end_mram_block_addr_A) - else - { - size_t remain_bytes_to_search = end_mram_block_addr_A - (start_mram_block_addr_A + BLOCK_SIZE); - mram_read((__mram_ptr void const *) start_mram_block_addr_A + BLOCK_SIZE, cache_A, remain_bytes_to_search); - found = search(cache_A, searching_for, remain_bytes_to_search); - - if(found > -1) - { - result->found = found + (start_mram_block_addr_A + BLOCK_SIZE - start_mram_block_addr_aux) / sizeof(DTYPE); - } - else - { - printf("%lld NOT found\n", searching_for); - } +int main_kernel1() +{ + unsigned int tasklet_id = me(); +#if PRINT + printf("tasklet_id = %u\n", tasklet_id); +#endif + if (tasklet_id == 0) { + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + + DTYPE searching_for, found; + uint64_t input_size = DPU_INPUT_ARGUMENTS.input_size; + + // Address of the current processing block in MRAM + uint32_t start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER; + uint32_t start_mram_block_addr_aux = start_mram_block_addr_A; + uint32_t end_mram_block_addr_A = + start_mram_block_addr_A + sizeof(DTYPE) * input_size; + uint32_t current_mram_block_addr_query = + end_mram_block_addr_A + + tasklet_id * (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS) * + sizeof(DTYPE); + + // Initialize a local cache to store the MRAM block + DTYPE *cache_A = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_aux_A = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_aux_B = (DTYPE *) mem_alloc(BLOCK_SIZE); + + dpu_results_t *result = &DPU_RESULTS[tasklet_id]; + + for (uint64_t targets = 0; + targets < (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS); + targets++) { + found = -1; + + mram_read((__mram_ptr void const *) + current_mram_block_addr_query, &searching_for, 8); + current_mram_block_addr_query += 8; + + // Initialize input vector boundaries + start_mram_block_addr_A = (uint32_t) 
DPU_MRAM_HEAP_POINTER; + start_mram_block_addr_aux = start_mram_block_addr_A; + end_mram_block_addr_A = + start_mram_block_addr_A + sizeof(DTYPE) * input_size; + + uint32_t current_mram_block_addr_A = start_mram_block_addr_A; + + // Bring first and last values to WRAM + mram_read((__mram_ptr void const *)current_mram_block_addr_A, + cache_aux_A, BLOCK_SIZE); + mram_read((__mram_ptr void const *)(end_mram_block_addr_A - + BLOCK_SIZE * sizeof(DTYPE)), + cache_aux_B, BLOCK_SIZE); + + while (1) { + // Locate the address of the mid mram block + current_mram_block_addr_A = + (start_mram_block_addr_A + + end_mram_block_addr_A) / 2; + current_mram_block_addr_A &= WORD_MASK; + + // Boundary check + if (current_mram_block_addr_A < + (start_mram_block_addr_A + BLOCK_SIZE)) { + // Search inside (start_mram_block_addr_A, start_mram_block_addr_A + BLOCK_SIZE) + mram_read((__mram_ptr void const *) + start_mram_block_addr_A, cache_A, + BLOCK_SIZE); + found = + search(cache_A, searching_for, BLOCK_SIZE); + + if (found > -1) { + result->found = + found + (start_mram_block_addr_A - + start_mram_block_addr_aux) + / sizeof(DTYPE); + } + // Search inside (start_mram_block_addr_A + BLOCK_SIZE, end_mram_block_addr_A) + else { + size_t remain_bytes_to_search = + end_mram_block_addr_A - + (start_mram_block_addr_A + + BLOCK_SIZE); + mram_read((__mram_ptr void const *) + start_mram_block_addr_A + + BLOCK_SIZE, cache_A, + remain_bytes_to_search); + found = + search(cache_A, searching_for, + remain_bytes_to_search); + + if (found > -1) { + result->found = + found + + (start_mram_block_addr_A + + BLOCK_SIZE - + start_mram_block_addr_aux) + / sizeof(DTYPE); + } else { + printf("%lld NOT found\n", + searching_for); + } + } + break; + } + // Load cache with current MRAM block + mram_read((__mram_ptr void const *) + current_mram_block_addr_A, cache_A, + BLOCK_SIZE); + + // Search inside block + found = search(cache_A, searching_for, BLOCK_SIZE); + + // If found > -1, we found the searching_for query 
+ if (found > -1) { + result->found = + found + (current_mram_block_addr_A - + start_mram_block_addr_aux) / + sizeof(DTYPE); + break; + } + // If found == -2, we need to discard right part of the input vector + if (found == -2) { + end_mram_block_addr_A = + current_mram_block_addr_A; + } + // If found == -1, we need to discard left part of the input vector + else if (found == -1) { + start_mram_block_addr_A = + current_mram_block_addr_A; + } + } } - break; - } - - // Load cache with current MRAM block - mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_A, BLOCK_SIZE); - - // Search inside block - found = search(cache_A, searching_for, BLOCK_SIZE); - - // If found > -1, we found the searching_for query - if(found > -1) - { - result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE); - break; - } - - // If found == -2, we need to discard right part of the input vector - if(found == -2) - { - end_mram_block_addr_A = current_mram_block_addr_A; - } - - // If found == -1, we need to discard left part of the input vector - else if (found == -1) - { - start_mram_block_addr_A = current_mram_block_addr_A; - } - } - } - return 0; + return 0; } diff --git a/BS/host/app.c b/BS/host/app.c index 10d76f1..217ea99 100644 --- a/BS/host/app.c +++ b/BS/host/app.c @@ -31,24 +31,28 @@ #define DPU_BINARY "./bin/bs_dpu" // Create input arrays -void create_test_file(DTYPE * input, DTYPE * querys, uint64_t nr_elements, uint64_t nr_querys) { +void create_test_file(DTYPE *input, DTYPE *querys, uint64_t nr_elements, + uint64_t nr_querys) +{ + + srand(time(NULL)); input[0] = 1; for (uint64_t i = 1; i < nr_elements; i++) { - input[i] = input[i - 1] + 1; + input[i] = input[i - 1] + (rand() % 10) + 1; } for (uint64_t i = 0; i < nr_querys; i++) { - querys[i] = i; + querys[i] = input[rand() % nr_elements]; } } // Compute output in the host -int64_t binarySearch(DTYPE * input, DTYPE * querys, DTYPE input_size, uint64_t num_querys) +int64_t 
binarySearch(DTYPE *input, DTYPE *querys, DTYPE input_size, + uint64_t num_querys) { uint64_t result = -1; DTYPE r; - for(uint64_t q = 0; q < num_querys; q++) - { + for (uint64_t q = 0; q < num_querys; q++) { DTYPE l = 0; r = input_size; while (l <= r) { @@ -57,92 +61,96 @@ int64_t binarySearch(DTYPE * input, DTYPE * querys, DTYPE input_size, uint64_t n // XXX shouldn't this short-circuit? // Check if x is present at mid if (input[m] == querys[q]) - result = m; + result = m; // If x greater, ignore left half if (input[m] < querys[q]) - l = m + 1; + l = m + 1; // If x is smaller, ignore right half else - r = m - 1; + r = m - 1; } } return result; } - // Main of the Host Application -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ struct Params p = input_params(argc, argv); struct dpu_set_t dpu_set, dpu; uint32_t nr_of_dpus; - uint32_t nr_of_ranks; + uint32_t nr_of_ranks; uint64_t input_size = INPUT_SIZE; uint64_t num_querys = p.num_querys; DTYPE result_host = -1; - DTYPE result_dpu = -1; + DTYPE result_dpu = -1; - // Timer declaration - Timer timer; + // Timer declaration + Timer timer; - int numa_node_rank = -2; + int numa_node_rank = -2; - // Allocate DPUs and load binary + // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD - DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - timer.time[0] = 0; // alloc + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + timer.time[0] = 0; // alloc #endif #if !WITH_LOAD_OVERHEAD - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); - assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); + assert(nr_of_dpus == NR_DPUS); + timer.time[1] = 0; // load #endif #if !WITH_FREE_OVERHEAD - timer.time[6] = 0; // free + timer.time[6] = 0; // free 
#endif - #if ENERGY +#if ENERGY struct dpu_probe_t probe; DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); - #endif +#endif // Query number adjustement for proper partitioning - if(num_querys % (NR_DPUS * NR_TASKLETS)) - num_querys = num_querys + (NR_DPUS * NR_TASKLETS - num_querys % (NR_DPUS * NR_TASKLETS)); + if (num_querys % (NR_DPUS * NR_TASKLETS)) + num_querys = + num_querys + (NR_DPUS * NR_TASKLETS - + num_querys % (NR_DPUS * NR_TASKLETS)); - assert(num_querys % (NR_DPUS * NR_TASKLETS) == 0 && "Input dimension"); // Allocate input and querys vectors + assert(num_querys % (NR_DPUS * NR_TASKLETS) == 0 && "Input dimension"); // Allocate input and querys vectors - DTYPE * input = malloc((input_size) * sizeof(DTYPE)); - DTYPE * querys = malloc((num_querys) * sizeof(DTYPE)); + DTYPE *input = malloc((input_size) * sizeof(DTYPE)); + DTYPE *querys = malloc((num_querys) * sizeof(DTYPE)); // Create an input file with arbitrary data create_test_file(input, querys, input_size, num_querys); // Create kernel arguments - uint64_t slice_per_dpu = num_querys / NR_DPUS; - dpu_arguments_t input_arguments = {input_size, slice_per_dpu, 0}; + uint64_t slice_per_dpu = num_querys / NR_DPUS; + dpu_arguments_t input_arguments = { input_size, slice_per_dpu, 0 }; for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { // Perform input transfers uint64_t i = 0; #if WITH_ALLOC_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 0, 0); } DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 0); } #endif #if WITH_DPUINFO printf("DPUs:"); - DPU_FOREACH (dpu_set, dpu) { - int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK; + DPU_FOREACH(dpu_set, dpu) { + int rank = + dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & + DPU_TARGET_MASK; int slice = dpu_get_slice_id(dpu_from_set(dpu)); int member = dpu_get_member_id(dpu_from_set(dpu)); printf(" %d(%d.%d)", rank, slice, 
member); @@ -150,11 +158,11 @@ int main(int argc, char **argv) { printf("\n"); #endif #if WITH_LOAD_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 1, 0); } DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 1); } DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); @@ -164,27 +172,35 @@ int main(int argc, char **argv) { // int prev_rank_id = -1; int rank_id = -1; - DPU_FOREACH (dpu_set, dpu) { - rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK; - if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) { + DPU_FOREACH(dpu_set, dpu) { + rank_id = + dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & + DPU_TARGET_MASK; + if ((numa_node_rank != -2) + && numa_node_rank != + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu)))) { numa_node_rank = -1; } else { - numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu))); + numa_node_rank = + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu))); } /* - if (rank_id != prev_rank_id) { - printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); - prev_rank_id = rank_id; - } - */ + if (rank_id != prev_rank_id) { + printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); + prev_rank_id = rank_id; + } + */ } // Compute host solution - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 2, 0); } - result_host = binarySearch(input, querys, input_size - 1, num_querys); - if(rep >= p.n_warmup) { + result_host = + binarySearch(input, querys, input_size - 1, num_querys); + if (rep >= p.n_warmup) { stop(&timer, 2); } @@ -192,103 +208,110 @@ int main(int argc, char **argv) { start(&timer, 3, 0); } - DPU_FOREACH(dpu_set, dpu, i) - { + DPU_FOREACH(dpu_set, dpu, i) { DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, 
sizeof(input_arguments), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, + sizeof(input_arguments), DPU_XFER_DEFAULT)); i = 0; - DPU_FOREACH(dpu_set, dpu, i) - { + DPU_FOREACH(dpu_set, dpu, i) { DPU_ASSERT(dpu_prepare_xfer(dpu, input)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size * sizeof(DTYPE), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, 0, + input_size * sizeof(DTYPE), DPU_XFER_DEFAULT)); i = 0; - DPU_FOREACH(dpu_set, dpu, i) - { - DPU_ASSERT(dpu_prepare_xfer(dpu, querys + slice_per_dpu * i)); + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer + (dpu, querys + slice_per_dpu * i)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size * sizeof(DTYPE), slice_per_dpu * sizeof(DTYPE), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, + input_size * sizeof(DTYPE), + slice_per_dpu * sizeof(DTYPE), DPU_XFER_DEFAULT)); if (rep >= p.n_warmup) { stop(&timer, 3); } - // Run kernel on DPUs - if (rep >= p.n_warmup) - { + if (rep >= p.n_warmup) { start(&timer, 4, 0); - #if ENERGY +#if ENERGY DPU_ASSERT(dpu_probe_start(&probe)); - #endif +#endif } DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - if (rep >= p.n_warmup) - { + if (rep >= p.n_warmup) { stop(&timer, 4); - #if ENERGY +#if ENERGY DPU_ASSERT(dpu_probe_stop(&probe)); - #endif +#endif } // Print logs if required - #if PRINT +#if PRINT unsigned int each_dpu = 0; printf("Display DPU Logs\n"); - DPU_FOREACH(dpu_set, dpu) - { + DPU_FOREACH(dpu_set, dpu) { printf("DPU#%d:\n", each_dpu); DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout)); each_dpu++; } - #endif +#endif // Retrieve results - dpu_results_t* results_retrieve[NR_DPUS]; + dpu_results_t *results_retrieve[NR_DPUS]; if (rep >= p.n_warmup) { start(&timer, 5, 0); } i = 0; - DPU_FOREACH(dpu_set, 
dpu, i) - { - results_retrieve[i] = (dpu_results_t*)malloc(NR_TASKLETS * sizeof(dpu_results_t)); + DPU_FOREACH(dpu_set, dpu, i) { + results_retrieve[i] = + (dpu_results_t *) malloc(NR_TASKLETS * + sizeof(dpu_results_t)); DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i])); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_results_t), DPU_XFER_DEFAULT)); - - DPU_FOREACH(dpu_set, dpu, i) - { - for(unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++) - { - if(results_retrieve[i][each_tasklet].found > result_dpu) - { - result_dpu = results_retrieve[i][each_tasklet].found; + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, + NR_TASKLETS * sizeof(dpu_results_t), + DPU_XFER_DEFAULT)); + + DPU_FOREACH(dpu_set, dpu, i) { + for (unsigned int each_tasklet = 0; + each_tasklet < NR_TASKLETS; each_tasklet++) { + if (results_retrieve[i][each_tasklet].found > + result_dpu) { + result_dpu = + results_retrieve[i][each_tasklet]. 
+ found; } } free(results_retrieve[i]); } - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 5); } - #if WITH_ALLOC_OVERHEAD #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 6, 0); } #endif DPU_ASSERT(dpu_free(dpu_set)); #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 6); } #endif @@ -296,58 +319,91 @@ int main(int argc, char **argv) { int status = (result_dpu == result_host); if (status) { - printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n"); + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET + "] results are equal\n"); if (rep >= p.n_warmup) { - printf("[::] BS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu", - NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(DTYPE), BLOCK_SIZE, input_size); - printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", - WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", - timer.time[0], - timer.time[1], - timer.time[2], - timer.time[3], - timer.time[4], - timer.time[5], - timer.time[6]); - printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", - num_querys * sizeof(DTYPE) / timer.time[2], - num_querys * sizeof(DTYPE) / (timer.time[4]), - num_querys * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); - printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", - num_querys * sizeof(DTYPE) / (timer.time[3] + timer.time[4] + timer.time[5]), - num_querys * sizeof(DTYPE) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), - num_querys * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[3] + 
timer.time[4] + timer.time[5])); - printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", - num_querys / timer.time[2], - num_querys / (timer.time[4]), - num_querys / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); - printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", - num_querys / (timer.time[3] + timer.time[4] + timer.time[5]), - num_querys / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), - num_querys / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); + printf + ("[::] BS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu", + NR_DPUS, nr_of_ranks, NR_TASKLETS, + XSTR(DTYPE), BLOCK_SIZE, input_size); + printf + (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", + WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, + WITH_FREE_OVERHEAD, numa_node_rank); + printf + ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", + timer.time[0], timer.time[1], + timer.time[2], timer.time[3], + timer.time[4], timer.time[5], + timer.time[6]); + printf + (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", + num_querys * sizeof(DTYPE) / timer.time[2], + num_querys * sizeof(DTYPE) / + (timer.time[4]), + num_querys * sizeof(DTYPE) / + (timer.time[0] + timer.time[1] + + timer.time[3] + timer.time[4] + + timer.time[5] + timer.time[6])); + printf + (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", + num_querys * sizeof(DTYPE) / + (timer.time[3] + timer.time[4] + + timer.time[5]), + num_querys * sizeof(DTYPE) / + (timer.time[1] + timer.time[3] + + timer.time[4] + timer.time[5]), + num_querys * sizeof(DTYPE) / + (timer.time[0] + timer.time[1] + + 
timer.time[3] + timer.time[4] + + timer.time[5])); + printf + (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", + num_querys / timer.time[2], + num_querys / (timer.time[4]), + num_querys / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5] + + timer.time[6])); + printf + (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", + num_querys / (timer.time[3] + + timer.time[4] + + timer.time[5]), + num_querys / (timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5]), + num_querys / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5])); } } else { - printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] results differ!\n"); + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET + "] results differ!\n"); } } // Print timing results /* - printf("CPU Version Time (ms): "); - print(&timer, 0, p.n_reps); - printf("CPU-DPU Time (ms): "); - print(&timer, 1, p.n_reps); - printf("DPU Kernel Time (ms): "); - print(&timer, 2, p.n_reps); - printf("DPU-CPU Time (ms): "); - print(&timer, 3, p.n_reps); - */ - - #if ENERGY + printf("CPU Version Time (ms): "); + print(&timer, 0, p.n_reps); + printf("CPU-DPU Time (ms): "); + print(&timer, 1, p.n_reps); + printf("DPU Kernel Time (ms): "); + print(&timer, 2, p.n_reps); + printf("DPU-CPU Time (ms): "); + print(&timer, 3, p.n_reps); + */ + +#if ENERGY double energy; DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); printf("DPU Energy (J): %f\t", energy * num_iterations); - #endif +#endif free(input); #if !WITH_ALLOC_OVERHEAD diff --git a/BS/support/common.h b/BS/support/common.h index dbd050c..54adc39 100755 --- a/BS/support/common.h +++ b/BS/support/common.h @@ -38,7 +38,7 @@ typedef struct { // Structures used by both the host and the dpu to communicate information typedef struct { - DTYPE found; + DTYPE found; } dpu_results_t; #ifndef ENERGY diff --git 
a/BS/support/params.h b/BS/support/params.h index 02bd750..c91202f 100644 --- a/BS/support/params.h +++ b/BS/support/params.h @@ -4,49 +4,56 @@ #include "common.h" typedef struct Params { - long num_querys; - unsigned n_warmup; - unsigned n_reps; -}Params; + long num_querys; + unsigned n_warmup; + unsigned n_reps; +} Params; -void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -w <W> # of untimed warmup iterations (default=1)" - "\n -e <E> # of timed repetition iterations (default=3)" - "\n" - "\nBenchmark-specific options:" - "\n -i <I> problem size (default=2 queries)" - "\n"); - } +void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n" + "\nBenchmark-specific options:" + "\n -i <I> problem size (default=2 queries)" "\n"); +} - struct Params input_params(int argc, char **argv) { - struct Params p; - p.num_querys = PROBLEM_SIZE; - p.n_warmup = 1; - p.n_reps = 3; +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.num_querys = PROBLEM_SIZE; + p.n_warmup = 1; + p.n_reps = 3; - int opt; - while((opt = getopt(argc, argv, "h:i:w:e:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'i': p.num_querys = atol(optarg); break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(NR_DPUS > 0 && "Invalid # of dpus!"); + int opt; + while ((opt = getopt(argc, argv, "h:i:w:e:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'i': + p.num_querys = atol(optarg); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + 
usage(); + exit(0); + } + } + assert(NR_DPUS > 0 && "Invalid # of dpus!"); - return p; - } - #endif + return p; +} +#endif diff --git a/BS/support/timer.h b/BS/support/timer.h index ff1ae1b..256447a 100755 --- a/BS/support/timer.h +++ b/BS/support/timer.h @@ -1,66 +1,71 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + */ + +#include <sys/time.h> +typedef struct Timer { + struct timeval startTime[7]; + struct timeval stopTime[7]; + double time[7]; +} Timer; + +void start(Timer *timer, int i, int rep) +{ + if (rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) +{ + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += + (timer->stopTime[i].tv_sec - + timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); +} + +void print(Timer *timer, int i, int REP) +{ + printf("%f\t", timer->time[i] / (1000 * REP)); +} + +void printall(Timer *timer, int maxt) +{ + for (int i = 0; i <= maxt; i++) { + printf(" timer%d_us=%f", i, timer->time[i]); + } + printf("\n"); +} diff --git a/COUNT/baselines/cpu/Makefile b/COUNT/baselines/cpu/Makefile index 4608944..ede0498 100644 --- a/COUNT/baselines/cpu/Makefile +++ b/COUNT/baselines/cpu/Makefile @@ -1,8 +1,23 @@ -NUMA ?= 0 -FLAGS = +benchmark ?= 1 +debug ?= 0 +native ?= 1 +nop_sync ?= 0 +numa ?= 0 +numa_memcpy ?= 0 -ifeq (${NUMA}, 1) - FLAGS += -lnuma +CFLAGS = +LDFLAGS = + +ifeq (${debug}, 1) + CFLAGS += -g +endif + +ifeq (${native}, 1) + CFLAGS += -march=native +endif + +ifeq (${numa}, 1) + LDFLAGS += -lnuma endif .PHONY: all @@ -11,7 +26,7 @@ all: count TYPE ?= uint64_t count: app_baseline.c - gcc -Wall -Wextra -pedantic -march=native -O2 -o count -fopenmp -DT=${TYPE} -DNUMA=${NUMA} app_baseline.c ${FLAGS} + gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} -o count -DT=${TYPE} -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} app_baseline.c -fopenmp ${LDFLAGS} .PHONY: run run: count @@ -19,4 +34,4 @@ run: count .PHONY: clean clean: - rm -f count count_O0 count_O2 + rm -f count diff --git a/COUNT/baselines/cpu/app_baseline.c b/COUNT/baselines/cpu/app_baseline.c index d52257a..13e3f51 100644 --- a/COUNT/baselines/cpu/app_baseline.c +++ b/COUNT/baselines/cpu/app_baseline.c @@ -12,13 +12,19 @@ #include <assert.h> 
#include <stdint.h> #include <omp.h> + +#if WITH_BENCHMARK #include "../../support/timer.h" +#else +#define start(...) +#define stop(...) +#endif #if NUMA #include <numaif.h> #include <numa.h> -void* mp_pages[1]; +void *mp_pages[1]; int mp_status[1]; int mp_nodes[1]; int numa_node_in = -1; @@ -37,71 +43,70 @@ volatile int total_count; // Params --------------------------------------------------------------------- typedef struct Params { - char* dpu_type; - int input_size; - int n_warmup; - int n_reps; - int n_threads; + char *dpu_type; + int input_size; + int n_warmup; + int n_reps; + int n_threads; #if NUMA - struct bitmask* bitmask_in; - struct bitmask* bitmask_out; - int numa_node_cpu; + struct bitmask *bitmask_in; + struct bitmask *bitmask_out; + int numa_node_cpu; #endif -}Params; +} Params; struct Params p; static T *A; -bool pred(const T x){ - return (x % 2) == 0; +bool pred(const T x) +{ + return (x % 2) == 0; } - -void create_test_file(unsigned int nr_elements) { - //srand(0); +void create_test_file(unsigned int nr_elements) +{ + //srand(0); #if NUMA - if (p.bitmask_in) { - numa_set_membind(p.bitmask_in); - numa_free_nodemask(p.bitmask_in); - } - A = (T*) numa_alloc(nr_elements * sizeof(T)); + if (p.bitmask_in) { + numa_set_membind(p.bitmask_in); + numa_free_nodemask(p.bitmask_in); + } + A = (T *) numa_alloc(nr_elements * sizeof(T)); #else - A = (T*) malloc(nr_elements * sizeof(T)); + A = (T *) malloc(nr_elements * sizeof(T)); #endif #if NUMA - struct bitmask *bitmask_all = numa_allocate_nodemask(); - numa_bitmask_setall(bitmask_all); - numa_set_membind(bitmask_all); - numa_free_nodemask(bitmask_all); + struct bitmask *bitmask_all = numa_allocate_nodemask(); + numa_bitmask_setall(bitmask_all); + numa_set_membind(bitmask_all); + numa_free_nodemask(bitmask_all); #endif - for (unsigned int i = 0; i < nr_elements; i++) { - //A[i] = (unsigned int) (rand()); - A[i] = i+1; - } + for (unsigned int i = 0; i < nr_elements; i++) { + //A[i] = (unsigned int) (rand()); 
+ A[i] = i + 1; + } #if NUMA - mp_pages[0] = A; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(A)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_in = mp_status[0]; - } - - numa_node_cpu = p.numa_node_cpu; - if (numa_node_cpu != -1) { - if (numa_run_on_node(numa_node_cpu) == -1) { - perror("numa_run_on_node"); - numa_node_cpu = -1; - } - } + mp_pages[0] = A; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_in = mp_status[0]; + } + + numa_node_cpu = p.numa_node_cpu; + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } #endif } @@ -109,116 +114,152 @@ void create_test_file(unsigned int nr_elements) { /** * @brief compute output in the host */ -static int count_host(int size, int t) { - int count = 0; - - omp_set_num_threads(t); - #pragma omp parallel for reduction(+:count) - for(int my = 0; my < size; my++) { - if(!pred(A[my])) { - count++; - } - } - return count; +static int count_host(int size, int t) +{ + int count = 0; + + omp_set_num_threads(t); +#pragma omp parallel for reduction(+:count) + for (int my = 0; my < size; my++) { + if (!pred(A[my])) { + count++; + } + } + return count; } -void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -d <D> DPU type (default=fsim)" - "\n -t <T> # of threads (default=8)" - "\n -w <W> # of untimed warmup iterations (default=2)" - "\n -e <E> # of timed repetition iterations (default=5)" - "\n" - "\nBenchmark-specific options:" - "\n -i <I> input size (default=8M elements)" - "\n"); +void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -d <D> DPU type (default=fsim)" + "\n -t <T> # 
of threads (default=8)" + "\n -w <W> # of untimed warmup iterations (default=2)" + "\n -e <E> # of timed repetition iterations (default=5)" + "\n" + "\nBenchmark-specific options:" + "\n -i <I> input size (default=8M elements)" "\n"); } -void input_params(int argc, char **argv) { - p.input_size = 16 << 20; - p.n_warmup = 1; - p.n_reps = 3; - p.n_threads = 5; +void input_params(int argc, char **argv) +{ + p.input_size = 16 << 20; + p.n_warmup = 1; + p.n_reps = 3; + p.n_threads = 5; #if NUMA - p.bitmask_in = NULL; - p.bitmask_out = NULL; - p.numa_node_cpu = -1; + p.bitmask_in = NULL; + p.bitmask_out = NULL; + p.numa_node_cpu = -1; #endif - int opt; - while((opt = getopt(argc, argv, "hi:w:e:t:a:b:c:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'i': p.input_size = atoi(optarg); break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - case 't': p.n_threads = atoi(optarg); break; + int opt; + while ((opt = getopt(argc, argv, "hi:w:e:t:a:b:c:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'i': + p.input_size = atoi(optarg); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + case 't': + p.n_threads = atoi(optarg); + break; #if NUMA - case 'a': p.bitmask_in = numa_parse_nodestring(optarg); break; - case 'b': p.bitmask_out = numa_parse_nodestring(optarg); break; - case 'c': p.numa_node_cpu = atoi(optarg); break; + case 'a': + p.bitmask_in = numa_parse_nodestring(optarg); + break; + case 'b': + p.bitmask_out = numa_parse_nodestring(optarg); + break; + case 'c': + p.numa_node_cpu = atoi(optarg); + break; #endif - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(p.n_threads > 0 && "Invalid # of ranks!"); + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(p.n_threads > 0 && "Invalid # of ranks!"); } /** * @brief Main of the Host 
Application. */ -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ - input_params(argc, argv); + input_params(argc, argv); - const unsigned int file_size = p.input_size; + const unsigned int file_size = p.input_size; - // Create an input file with arbitrary data. - create_test_file(file_size); + // Create an input file with arbitrary data. + create_test_file(file_size); - Timer timer; +#if WITH_BENCHMARK + Timer timer; +#endif + +#if NOP_SYNC + for (int rep = 0; rep < 200000; rep++) { + asm volatile ("nop"::); + } +#endif - for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { - start(&timer, 0, 0); - total_count = count_host(file_size, p.n_threads); - stop(&timer, 0); + for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + start(&timer, 0, 0); + total_count = count_host(file_size, p.n_threads); + stop(&timer, 0); - unsigned int nr_threads = 0; +#if WITH_BENCHMARK + unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic - nr_threads++; + nr_threads++; - if (rep >= p.n_warmup) { - printf("[::] COUNT-CPU | n_threads=%d e_type=%s n_elements=%d" + if (rep >= p.n_warmup) { + printf + ("[::] COUNT-CPU | n_threads=%d e_type=%s n_elements=%d" #if NUMA - " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" + " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" #endif - " | throughput_MBps=%f", - nr_threads, XSTR(T), file_size, + " | throughput_MBps=%f", + nr_threads, XSTR(T), file_size, #if NUMA - numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), + numa_node_in, numa_node_out, numa_node_cpu, + numa_distance(numa_node_in, numa_node_cpu), + numa_distance(numa_node_cpu, numa_node_out), +#endif + file_size * 2 * sizeof(T) / timer.time[0]); + printf(" throughput_MOpps=%f", + file_size / timer.time[0]); + printall(&timer, 0); + } +#endif // WITH_BENCHMARK + } + 
+#if NOP_SYNC + for (int rep = 0; rep < 200000; rep++) { + asm volatile ("nop"::); + } #endif - file_size * 2 * sizeof(T) / timer.time[0]); - printf(" throughput_MOpps=%f", - file_size / timer.time[0]); - printall(&timer, 0); - } - } #if NUMA - numa_free(A, file_size * sizeof(T)); + numa_free(A, file_size * sizeof(T)); #else - free(A); + free(A); #endif - return 0; + return 0; } diff --git a/COUNT/dpu/task.c b/COUNT/dpu/task.c index b2ed79b..8ba6aaf 100644 --- a/COUNT/dpu/task.c +++ b/COUNT/dpu/task.c @@ -21,33 +21,36 @@ uint32_t message[NR_TASKLETS]; uint32_t message_partial_count; // COUNT in each tasklet -static unsigned int count(T *input){ - unsigned int cnt = 0; - #pragma unroll - for(unsigned int j = 0; j < REGS; j++) { - if(!pred(input[j])) { - cnt++; - } - } - return cnt; +static unsigned int count(T *input) +{ + unsigned int cnt = 0; +#pragma unroll + for (unsigned int j = 0; j < REGS; j++) { + if (!pred(input[j])) { + cnt++; + } + } + return cnt; } // Handshake with adjacent tasklets -static unsigned int handshake_sync(unsigned int l_count, unsigned int tasklet_id){ - unsigned int p_count; - // Wait and read message - if(tasklet_id != 0){ - handshake_wait_for(tasklet_id - 1); - p_count = message[tasklet_id]; - } else { - p_count = 0; - } - // Write message and notify - if(tasklet_id < NR_TASKLETS - 1){ - message[tasklet_id + 1] = p_count + l_count; - handshake_notify(); - } - return p_count; +static unsigned int handshake_sync(unsigned int l_count, + unsigned int tasklet_id) +{ + unsigned int p_count; + // Wait and read message + if (tasklet_id != 0) { + handshake_wait_for(tasklet_id - 1); + p_count = message[tasklet_id]; + } else { + p_count = 0; + } + // Write message and notify + if (tasklet_id < NR_TASKLETS - 1) { + message[tasklet_id + 1] = p_count + l_count; + handshake_notify(); + } + return p_count; } // Barrier @@ -55,63 +58,70 @@ BARRIER_INIT(my_barrier, NR_TASKLETS); extern int main_kernel1(void); -int (*kernels[nr_kernels])(void) = 
{main_kernel1}; +int (*kernels[nr_kernels])(void) = { main_kernel1 }; -int main(void) { - // Kernel - return kernels[DPU_INPUT_ARGUMENTS.kernel](); +int main(void) +{ + // Kernel + return kernels[DPU_INPUT_ARGUMENTS.kernel] (); } // main_kernel1 -int main_kernel1() { - unsigned int tasklet_id = me(); +int main_kernel1() +{ + unsigned int tasklet_id = me(); #if PRINT - printf("tasklet_id = %u\n", tasklet_id); + printf("tasklet_id = %u\n", tasklet_id); #endif - if (tasklet_id == 0){ // Initialize once the cycle counter - mem_reset(); // Reset the heap - } - // Barrier - barrier_wait(&my_barrier); + if (tasklet_id == 0) { // Initialize once the cycle counter + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); - dpu_results_t *result = &DPU_RESULTS[tasklet_id]; + dpu_results_t *result = &DPU_RESULTS[tasklet_id]; - uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; + uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; - // Address of the current processing block in MRAM - uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; - uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER; + // Address of the current processing block in MRAM + uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; + uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER; - // Initialize a local cache to store the MRAM block - T *cache_A = (T *) mem_alloc(BLOCK_SIZE); + // Initialize a local cache to store the MRAM block + T *cache_A = (T *) mem_alloc(BLOCK_SIZE); - // Initialize shared variable - if(tasklet_id == NR_TASKLETS - 1) - message_partial_count = 0; - // Barrier - barrier_wait(&my_barrier); + // Initialize shared variable + if (tasklet_id == NR_TASKLETS - 1) + message_partial_count = 0; + // Barrier + barrier_wait(&my_barrier); - for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){ + for (unsigned int byte_index = base_tasklet; + byte_index < input_size_dpu_bytes; + 
byte_index += BLOCK_SIZE * NR_TASKLETS) { - // Load cache with current MRAM block - mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, BLOCK_SIZE); + // Load cache with current MRAM block + mram_read((__mram_ptr void const *)(mram_base_addr_A + + byte_index), cache_A, + BLOCK_SIZE); - // COUNT in each tasklet - uint32_t l_count = count(cache_A); + // COUNT in each tasklet + uint32_t l_count = count(cache_A); - // Sync with adjacent tasklets - uint32_t p_count = handshake_sync(l_count, tasklet_id); + // Sync with adjacent tasklets + uint32_t p_count = handshake_sync(l_count, tasklet_id); - // Barrier - barrier_wait(&my_barrier); + // Barrier + barrier_wait(&my_barrier); - // Total count in this DPU - if(tasklet_id == NR_TASKLETS - 1){ - result->t_count = message_partial_count + p_count + l_count; - message_partial_count = result->t_count; - } + // Total count in this DPU + if (tasklet_id == NR_TASKLETS - 1) { + result->t_count = + message_partial_count + p_count + l_count; + message_partial_count = result->t_count; + } - } + } - return 0; + return 0; } diff --git a/COUNT/host/app.c b/COUNT/host/app.c index 7708f6d..dad674f 100644 --- a/COUNT/host/app.c +++ b/COUNT/host/app.c @@ -33,287 +33,350 @@ #include <dpu_target_macros.h> // Pointer declaration -static T* A; +static T *A; // Create input arrays -static void read_input(T* A, unsigned int nr_elements, unsigned int nr_elements_round) { - //srand(0); - printf("nr_elements\t%u\t", nr_elements); - for (unsigned int i = 0; i < nr_elements; i++) { - //A[i] = (T) (rand()); - A[i] = i + 1; - } - for (unsigned int i = nr_elements; i < nr_elements_round; i++) { // Complete with removable elements - A[i] = 0; - } +static void read_input(T *A, unsigned int nr_elements, + unsigned int nr_elements_round) +{ + //srand(0); + printf("nr_elements\t%u\t", nr_elements); + for (unsigned int i = 0; i < nr_elements; i++) { + //A[i] = (T) (rand()); + A[i] = i + 1; + } + for (unsigned int i = nr_elements; i < 
nr_elements_round; i++) { // Complete with removable elements + A[i] = 0; + } } // Compute output in the host -static unsigned int count_host(T* A, unsigned int nr_elements) { - unsigned int count = 0; - for (unsigned int i = 0; i < nr_elements; i++) { - if(!pred(A[i])) { - count++; - } - } - return count; +static unsigned int count_host(T *A, unsigned int nr_elements) +{ + unsigned int count = 0; + for (unsigned int i = 0; i < nr_elements; i++) { + if (!pred(A[i])) { + count++; + } + } + return count; } // Main of the Host Application -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ - struct Params p = input_params(argc, argv); + struct Params p = input_params(argc, argv); - struct dpu_set_t dpu_set, dpu; - uint32_t nr_of_dpus; - uint32_t nr_of_ranks; + struct dpu_set_t dpu_set, dpu; + uint32_t nr_of_dpus; + uint32_t nr_of_ranks; - // Timer declaration - Timer timer; + // Timer declaration + Timer timer; - int numa_node_rank = -2; + int numa_node_rank = -2; - // Allocate DPUs and load binary + // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD - DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - timer.time[0] = 0; // alloc + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + timer.time[TMR_ALLOC] = 0; // alloc #endif #if !WITH_LOAD_OVERHEAD - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); - assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); + assert(nr_of_dpus == NR_DPUS); + timer.time[TMR_LOAD] = 0; // load #endif #if !WITH_FREE_OVERHEAD - timer.time[6] = 0; // free + timer.time[TMR_FREE] = 0; // free #endif #if ENERGY - struct dpu_probe_t probe; - DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); + struct dpu_probe_t probe; + 
DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); #endif - unsigned int i = 0; - uint32_t accum = 0; - uint32_t total_count = 0; + unsigned int i = 0; + uint32_t accum = 0; + uint32_t total_count = 0; - const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size; // Total input size (weak or strong scaling) - const unsigned int input_size_dpu_ = divceil(input_size, NR_DPUS); // Input size per DPU (max.) - const unsigned int input_size_dpu_round = - (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_; // Input size per DPU (max.), 8-byte aligned + const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size; // Total input size (weak or strong scaling) + const unsigned int input_size_dpu_ = divceil(input_size, NR_DPUS); // Input size per DPU (max.) + const unsigned int input_size_dpu_round = (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_; // Input size per DPU (max.), 8-byte aligned - // Input allocation - A = malloc(input_size_dpu_round * NR_DPUS * sizeof(T)); - T *bufferA = A; + // Input allocation + A = malloc(input_size_dpu_round * NR_DPUS * sizeof(T)); + T *bufferA = A; - dpu_results_t* results_retrieve[NR_DPUS]; - for (i = 0; i < NR_DPUS; i++) { - results_retrieve[i] = (dpu_results_t*)malloc(NR_TASKLETS * sizeof(dpu_results_t)); - } + dpu_results_t *results_retrieve[NR_DPUS]; + for (i = 0; i < NR_DPUS; i++) { + results_retrieve[i] = + (dpu_results_t *) malloc(NR_TASKLETS * + sizeof(dpu_results_t)); + } - // Create an input file with arbitrary data - read_input(A, input_size, input_size_dpu_round * NR_DPUS); + // Create an input file with arbitrary data + read_input(A, input_size, input_size_dpu_round * NR_DPUS); - printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL); + printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL); - // Loop over main kernel - for(int rep = 0; rep < p.n_warmup + 
p.n_reps; rep++) { + // Loop over main kernel + for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { #if WITH_ALLOC_OVERHEAD - if(rep >= p.n_warmup) { - start(&timer, 0, 0); - } - DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - if(rep >= p.n_warmup) { - stop(&timer, 0); - } + if (rep >= p.n_warmup) { + start(&timer, TMR_ALLOC, 0); + } + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + if (rep >= p.n_warmup) { + stop(&timer, TMR_ALLOC); + } #endif #if WITH_LOAD_OVERHEAD - if(rep >= p.n_warmup) { - start(&timer, 1, 0); - } - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - if(rep >= p.n_warmup) { - stop(&timer, 1); - } - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); - assert(nr_of_dpus == NR_DPUS); + if (rep >= p.n_warmup) { + start(&timer, TMR_LOAD, 0); + } + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + if (rep >= p.n_warmup) { + stop(&timer, TMR_LOAD); + } + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); + assert(nr_of_dpus == NR_DPUS); #endif - // int prev_rank_id = -1; - int rank_id = -1; - DPU_FOREACH (dpu_set, dpu) { - rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK; - if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) { - numa_node_rank = -1; - } else { - numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu))); - } - /* - if (rank_id != prev_rank_id) { - printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); - prev_rank_id = rank_id; - } - */ - } - - // Compute output on CPU (performance comparison and verification purposes) - if(rep >= p.n_warmup) - start(&timer, 2, 0); - total_count = count_host(A, input_size); - if(rep >= p.n_warmup) - stop(&timer, 2); - - printf("Load input data\n"); - if(rep >= p.n_warmup) - start(&timer, 3, 0); - // Input arguments - const unsigned int input_size_dpu = input_size_dpu_round; - 
unsigned int kernel = 0; - dpu_arguments_t input_arguments = {input_size_dpu * sizeof(T), kernel}; - // Copy input arrays - i = 0; - DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments)); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT)); - DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu * i)); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) - stop(&timer, 3); - - printf("Run program on DPU(s) \n"); - // Run DPU kernel - if(rep >= p.n_warmup) { - start(&timer, 4, 0); - #if ENERGY - DPU_ASSERT(dpu_probe_start(&probe)); - #endif - } - DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - if(rep >= p.n_warmup) { - stop(&timer, 4); - #if ENERGY - DPU_ASSERT(dpu_probe_stop(&probe)); - #endif - } - + // int prev_rank_id = -1; + int rank_id = -1; + DPU_FOREACH(dpu_set, dpu) { + rank_id = + dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & + DPU_TARGET_MASK; + if ((numa_node_rank != -2) + && numa_node_rank != + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu)))) { + numa_node_rank = -1; + } else { + numa_node_rank = + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu))); + } + /* + if (rank_id != prev_rank_id) { + printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); + prev_rank_id = rank_id; + } + */ + } + + // Compute output on CPU (performance comparison and verification purposes) + if (rep >= p.n_warmup) + start(&timer, TMR_CPU, 0); + total_count = count_host(A, input_size); + if (rep >= p.n_warmup) + stop(&timer, TMR_CPU); + + printf("Load input data\n"); + if (rep >= p.n_warmup) + start(&timer, TMR_WRITE, 0); + // Input arguments + const unsigned int input_size_dpu = input_size_dpu_round; + unsigned int kernel = 0; + dpu_arguments_t input_arguments = + { input_size_dpu * sizeof(T), 
kernel }; + // Copy input arrays + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments)); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, + sizeof(input_arguments), DPU_XFER_DEFAULT)); + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer + (dpu, bufferA + input_size_dpu * i)); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, 0, + input_size_dpu * sizeof(T), DPU_XFER_DEFAULT)); + if (rep >= p.n_warmup) + stop(&timer, TMR_WRITE); + + printf("Run program on DPU(s) \n"); + // Run DPU kernel + if (rep >= p.n_warmup) { + start(&timer, TMR_KERNEL, 0); +#if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); +#endif + } + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + if (rep >= p.n_warmup) { + stop(&timer, TMR_KERNEL); +#if ENERGY + DPU_ASSERT(dpu_probe_stop(&probe)); +#endif + } #if PRINT - { - unsigned int each_dpu = 0; - printf("Display DPU Logs\n"); - DPU_FOREACH (dpu_set, dpu) { - printf("DPU#%d:\n", each_dpu); - DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout)); - each_dpu++; - } - } + { + unsigned int each_dpu = 0; + printf("Display DPU Logs\n"); + DPU_FOREACH(dpu_set, dpu) { + printf("DPU#%d:\n", each_dpu); + DPU_ASSERT(dpulog_read_for_dpu + (dpu.dpu, stdout)); + each_dpu++; + } + } #endif - printf("Retrieve results\n"); - dpu_results_t results[NR_DPUS]; - i = 0; - accum = 0; - - if(rep >= p.n_warmup) - start(&timer, 5, 0); - // PARALLEL RETRIEVE TRANSFER - - DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i])); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_results_t), DPU_XFER_DEFAULT)); - - DPU_FOREACH(dpu_set, dpu, i) { - // Retrieve tasklet timings - for (unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++) { - // Count of this DPU - if(each_tasklet == NR_TASKLETS - 1){ - results[i].t_count = 
results_retrieve[i][each_tasklet].t_count; - } - } - // Sequential scan - accum += results[i].t_count; - } - if(rep >= p.n_warmup) - stop(&timer, 5); - - i = 0; + printf("Retrieve results\n"); + dpu_results_t results[NR_DPUS]; + i = 0; + accum = 0; + + if (rep >= p.n_warmup) + start(&timer, TMR_READ, 0); + // PARALLEL RETRIEVE TRANSFER + + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i])); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, + NR_TASKLETS * sizeof(dpu_results_t), + DPU_XFER_DEFAULT)); + + DPU_FOREACH(dpu_set, dpu, i) { + // Retrieve tasklet timings + for (unsigned int each_tasklet = 0; + each_tasklet < NR_TASKLETS; each_tasklet++) { + // Count of this DPU + if (each_tasklet == NR_TASKLETS - 1) { + results[i].t_count = + results_retrieve[i][each_tasklet]. + t_count; + } + } + // Sequential scan + accum += results[i].t_count; + } + if (rep >= p.n_warmup) + stop(&timer, TMR_READ); + + i = 0; #if WITH_ALLOC_OVERHEAD #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { - start(&timer, 8, 0); - } + if (rep >= p.n_warmup) { + start(&timer, TMR_FREE, 0); + } #endif - DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_free(dpu_set)); #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { - stop(&timer, 8); - } + if (rep >= p.n_warmup) { + stop(&timer, TMR_FREE); + } #endif #endif - // Check output - bool status = true; - if(accum != total_count) status = false; - if (status) { - printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); - if (rep >= p.n_warmup) { - printf("[::] COUNT-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d", - NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size, input_size_dpu_round); - printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", - WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf("| 
latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", - timer.time[0], - timer.time[1], - timer.time[2], - timer.time[3], // write - timer.time[4], // kernel - timer.time[5], // read - timer.time[8]); - printf(" latency_total_us=%f", - timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[8]); - printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", - input_size * sizeof(T) / timer.time[2], - input_size * sizeof(T) / timer.time[4], - input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[8]) - ); - printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", - input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]), - input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), - input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); - printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", - input_size / timer.time[2], - input_size / timer.time[4], - input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[8]) - ); - printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", - input_size / (timer.time[3] + timer.time[4] + timer.time[5]), - input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), - input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); - } - } else { - printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); - } - } - - #if ENERGY - double energy; - DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); - printf("DPU Energy (J): %f\t", energy); - #endif - - // 
Deallocation - free(A); + // Check output + bool status = true; + if (accum != total_count) + status = false; + if (status) { + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET + "] Outputs are equal\n"); + if (rep >= p.n_warmup) { + printf + ("[::] COUNT-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d", + NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), + BLOCK_SIZE, input_size, + input_size_dpu_round); + printf + (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", + WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, + WITH_FREE_OVERHEAD, numa_node_rank); + printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", timer.time[0], timer.time[1], timer.time[2], timer.time[3], // write + timer.time[4], // kernel + timer.time[5], // read + timer.time[8]); + printf(" latency_total_us=%f", + timer.time[0] + timer.time[1] + + timer.time[3] + timer.time[4] + + timer.time[5] + timer.time[8]); + printf + (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", + input_size * sizeof(T) / timer.time[2], + input_size * sizeof(T) / timer.time[4], + input_size * sizeof(T) / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5] + + timer.time[8]) + ); + printf + (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", + input_size * sizeof(T) / (timer.time[3] + + timer.time[4] + + timer.time[5]), + input_size * sizeof(T) / (timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5]), + input_size * sizeof(T) / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5])); + printf + (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", + input_size / timer.time[2], + input_size / timer.time[4], + input_size / (timer.time[0] + + 
timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5] + + timer.time[8]) + ); + printf + (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", + input_size / (timer.time[3] + + timer.time[4] + + timer.time[5]), + input_size / (timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5]), + input_size / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5])); + } + } else { + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET + "] Outputs differ!\n"); + } + } + +#if ENERGY + double energy; + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); + printf("DPU Energy (J): %f\t", energy); +#endif + + // Deallocation + free(A); #if !WITH_ALLOC_OVERHEAD - DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_free(dpu_set)); #endif - return 0; + return 0; } diff --git a/COUNT/support/common.h b/COUNT/support/common.h index 72270b0..afd5b2d 100755 --- a/COUNT/support/common.h +++ b/COUNT/support/common.h @@ -3,15 +3,15 @@ // Structures used by both the host and the dpu to communicate information typedef struct { - uint32_t size; + uint32_t size; enum kernels { - kernel1 = 0, - nr_kernels = 1, + kernel1 = 0, + nr_kernels = 1, } kernel; } dpu_arguments_t; typedef struct { - uint32_t t_count; + uint32_t t_count; } dpu_results_t; // Transfer size between MRAM and WRAM @@ -26,11 +26,12 @@ typedef struct { // Data type #define T uint64_t -#define REGS (BLOCK_SIZE >> 3) // 64 bits +#define REGS (BLOCK_SIZE >> 3) // 64 bits // Sample predicate -bool pred(const T x){ - return (x % 2) == 0; +bool pred(const T x) +{ + return (x % 2) == 0; } #ifndef ENERGY diff --git a/COUNT/support/params.h b/COUNT/support/params.h index bb86211..dd1505e 100644 --- a/COUNT/support/params.h +++ b/COUNT/support/params.h @@ -4,53 +4,62 @@ #include "common.h" typedef struct Params { - unsigned int input_size; - int n_warmup; - int n_reps; - int exp; -}Params; + unsigned int input_size; + int 
n_warmup; + int n_reps; + int exp; +} Params; -static void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -w <W> # of untimed warmup iterations (default=1)" - "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1) scaling (default=0)" - "\n" - "\nBenchmark-specific options:" - "\n -i <I> input size (default=3932160 elements)" - "\n"); +static void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n -x <X> Weak (0) or strong (1) scaling (default=0)" + "\n" + "\nBenchmark-specific options:" + "\n -i <I> input size (default=3932160 elements)" "\n"); } -struct Params input_params(int argc, char **argv) { - struct Params p; - p.input_size = 3932160; - p.n_warmup = 1; - p.n_reps = 3; - p.exp = 0; +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.input_size = 3932160; + p.n_warmup = 1; + p.n_reps = 3; + p.exp = 0; - int opt; - while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'i': p.input_size = atoi(optarg); break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - case 'x': p.exp = atoi(optarg); break; - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(NR_DPUS > 0 && "Invalid # of dpus!"); + int opt; + while ((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'i': + p.input_size = atoi(optarg); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + case 'x': + p.exp = atoi(optarg); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(NR_DPUS > 0 
&& "Invalid # of dpus!"); - return p; + return p; } #endif diff --git a/COUNT/support/timer.h b/COUNT/support/timer.h index 3ec6d87..76fbcff 100755 --- a/COUNT/support/timer.h +++ b/COUNT/support/timer.h @@ -1,66 +1,80 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[9];
- struct timeval stopTime[9];
- double time[9];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + */ + +#include <sys/time.h> + +typedef struct Timer { + struct timeval startTime[7]; + struct timeval stopTime[7]; + double time[7]; +} Timer; + +#define TMR_ALLOC 0 +#define TMR_LOAD 1 +#define TMR_CPU 2 +#define TMR_WRITE 3 +#define TMR_KERNEL 4 +#define TMR_READ 5 +#define TMR_FREE 6 + +void start(Timer *timer, int i, int rep) +{ + if (rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) +{ + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += + (timer->stopTime[i].tv_sec - + timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); +} + +void print(Timer *timer, int i, int REP) +{ + printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); +} + +void printall(Timer *timer, int maxt) +{ + for (int i = 0; i <= maxt; i++) { + printf(" timer%d_us=%f", i, timer->time[i]); + } + printf("\n"); +} diff --git a/COUNT/vamos25.sh b/COUNT/vamos25.sh new file mode 100755 index 0000000..a518c67 --- /dev/null +++ b/COUNT/vamos25.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +mkdir -p log/$(hostname) baselines/cpu/log/$(hostname) +fn=log/$(hostname)/dimes-hetsim-nmc + +# 2^24 elem == 128 MiB +# 2^28 elem == 2 GiB +# (upstream version uses 1.875 GiB) +# upstrem DPU and upstream CPU use uint64_t + +source /opt/upmem/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + +run_benchmark_nmc() { + local "$@" + if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1 PARALLEL_READ=1; then + bin/host_code -w 0 -e 40 -i ${input_size} -x 1 + fi + return $? 
+} + +export -f run_benchmark_nmc + +( + +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: input_size $(( 2 ** 24 )) $(( 2 ** 25 )) $(( 2 ** 26 )) $(( 2 ** 27 )) $(( 2 ** 28 )) + +) >> ${fn}.txt + +cd baselines/cpu + +make -B NUMA=1 + +( +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + ./count -i {input_size} -a {ram} -c {cpu} -t {nr_threads} -w 0 -e 20 \ + ::: ram 0 1 \ + ::: cpu 0 1 \ + ::: nr_threads 1 2 4 8 12 16 \ + ::: input_size $(( 2 ** 24 )) $(( 2 ** 25 )) $(( 2 ** 26 )) $(( 2 ** 27 )) $(( 2 ** 28 )) +) >> ${fn}.txt diff --git a/GEMV/baselines/cpu/Makefile b/GEMV/baselines/cpu/Makefile index 016d561..60c662c 100644 --- a/GEMV/baselines/cpu/Makefile +++ b/GEMV/baselines/cpu/Makefile @@ -1,17 +1,24 @@ -NUMA ?= 0 -NUMA_MEMCPY ?= 0 -FLAGS = +native ?= 1 +numa ?= 0 +numa_memcpy ?= 0 + +CFLAGS = +LDFLAGS = TYPE ?= double -ifeq (${NUMA}, 1) - FLAGS += -lnuma +ifeq (${native}, 1) + CFLAGS += -march=native +endif + +ifeq (${numa}, 1) + LDFLAGS += -lnuma endif .PHONY: all all: gemv gemv: gemv_openmp.c - gcc -ggdb -Wall -Wextra -pedantic -march=native -O2 -o gemv -fopenmp -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} -DT=${TYPE} -DTYPE_${TYPE} gemv_openmp.c ${FLAGS} + gcc -ggdb -Wall -Wextra -pedantic ${CFLAGS} -O3 -o gemv -fopenmp -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DT=${TYPE} -DTYPE_${TYPE} gemv_openmp.c ${LDFLAGS} gemv_O0: gemv_openmp.c gcc -o gemv_O0 -fopenmp gemv_openmp.c diff --git a/GEMV/baselines/cpu/gemv_openmp.c b/GEMV/baselines/cpu/gemv_openmp.c index 21e24cb..99bba55 100644 --- a/GEMV/baselines/cpu/gemv_openmp.c +++ b/GEMV/baselines/cpu/gemv_openmp.c @@ -10,10 +10,10 @@ #include <numaif.h> #include <numa.h> -struct bitmask* bitmask_in; -struct bitmask* bitmask_out; +struct bitmask *bitmask_in; +struct bitmask *bitmask_out; -void* mp_pages[1]; +void *mp_pages[1]; int 
mp_status[1]; int mp_nodes[1]; int numa_node_in = -1; @@ -22,7 +22,7 @@ int numa_node_cpu = -1; #endif #if NUMA_MEMCPY -struct bitmask* bitmask_cpu; +struct bitmask *bitmask_cpu; int numa_node_cpu_memcpy = -1; int numa_node_local = -1; int numa_node_in_is_local = 0; @@ -35,284 +35,292 @@ int numa_node_in_is_local = 0; int main(int argc, char *argv[]) { - (void) argc; + (void)argc; /* // upstream config: const size_t rows = 20480; const size_t cols = 8192; */ - // DPU config: 163840 -n 4096 - const size_t rows = 163840; - const size_t cols = 4096; + // DPU config: 163840 -n 4096 + const size_t rows = 163840; + const size_t cols = 4096; - T **A, *b, *x; + T **A, *b, *x; - T **A_local, *x_local; + T **A_local, *x_local; #if NUMA - bitmask_in = numa_parse_nodestring(argv[1]); - bitmask_out = numa_parse_nodestring(argv[2]); - numa_node_cpu = atoi(argv[3]); + bitmask_in = numa_parse_nodestring(argv[1]); + bitmask_out = numa_parse_nodestring(argv[2]); + numa_node_cpu = atoi(argv[3]); #if NUMA_MEMCPY - bitmask_cpu = numa_parse_nodestring(argv[4]); - numa_node_cpu_memcpy = atoi(argv[5]); -#endif // NUMA_MEMCPY + bitmask_cpu = numa_parse_nodestring(argv[4]); + numa_node_cpu_memcpy = atoi(argv[5]); +#endif // NUMA_MEMCPY #else - (void) argv; -#endif // NUMA + (void)argv; +#endif // NUMA #if NUMA - if (bitmask_out) { - numa_set_membind(bitmask_out); - numa_free_nodemask(bitmask_out); - } - b = (T*) numa_alloc(sizeof(T)*rows); + if (bitmask_out) { + numa_set_membind(bitmask_out); + numa_free_nodemask(bitmask_out); + } + b = (T *) numa_alloc(sizeof(T) * rows); #else - b = (T*) malloc(sizeof(T)*rows); + b = (T *) malloc(sizeof(T) * rows); #endif #if NUMA - if (bitmask_in) { - numa_set_membind(bitmask_in); - // no free yet, re-used in allocate_dense - } - x = (T*) numa_alloc(sizeof(T)*cols); + if (bitmask_in) { + numa_set_membind(bitmask_in); + // no free yet, re-used in allocate_dense + } + x = (T *) numa_alloc(sizeof(T) * cols); #else - x = (T*) malloc(sizeof(T)*cols); + x = (T 
*) malloc(sizeof(T) * cols); #endif - allocate_dense(rows, cols, &A); + allocate_dense(rows, cols, &A); #if NUMA - if (bitmask_in) { - numa_free_nodemask(bitmask_in); - } + if (bitmask_in) { + numa_free_nodemask(bitmask_in); + } #endif - make_hilbert_mat(rows,cols, &A); + make_hilbert_mat(rows, cols, &A); #if NUMA #if NUMA_MEMCPY - if (bitmask_cpu) { - numa_set_membind(bitmask_cpu); - numa_free_nodemask(bitmask_cpu); - } + if (bitmask_cpu) { + numa_set_membind(bitmask_cpu); + numa_free_nodemask(bitmask_cpu); + } #else - struct bitmask *bitmask_all = numa_allocate_nodemask(); - numa_bitmask_setall(bitmask_all); - numa_set_membind(bitmask_all); - numa_free_nodemask(bitmask_all); -#endif // NUMA_MEMCPY -#endif // NUMA + struct bitmask *bitmask_all = numa_allocate_nodemask(); + numa_bitmask_setall(bitmask_all); + numa_set_membind(bitmask_all); + numa_free_nodemask(bitmask_all); +#endif // NUMA_MEMCPY +#endif // NUMA - A_local = A; - x_local = x; + A_local = A; + x_local = x; #if NUMA - mp_pages[0] = A; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(A)"); - } - else if (mp_status[0] < 0) { - printf("move_pages(A) error: %d", mp_status[0]); - } - else { - numa_node_in = mp_status[0]; - } - - mp_pages[0] = b; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(b)"); - } - else if (mp_status[0] < 0) { - printf("move_pages(b) error: %d", mp_status[0]); - } - else { - numa_node_out = mp_status[0]; - } - - if (numa_node_cpu != -1) { - if (numa_run_on_node(numa_node_cpu) == -1) { - perror("numa_run_on_node"); - numa_node_cpu = -1; - } - } + mp_pages[0] = A; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A)"); + } else if (mp_status[0] < 0) { + printf("move_pages(A) error: %d", mp_status[0]); + } else { + numa_node_in = mp_status[0]; + } + + mp_pages[0] = b; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(b)"); + } else if (mp_status[0] < 0) 
{ + printf("move_pages(b) error: %d", mp_status[0]); + } else { + numa_node_out = mp_status[0]; + } + + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } #endif #if NUMA_MEMCPY - numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1; + numa_node_in_is_local = ((numa_node_cpu == numa_node_in) + || (numa_node_cpu + 8 == numa_node_in)) * 1; #endif - Timer timer; - for (int i = 0; i < 20; i++) { + Timer timer; + for (int i = 0; i < 20; i++) { #pragma omp parallel - { + { #pragma omp for - for (size_t i = 0; i < cols; i++) { - x[i] = (T) i+1 ; - } + for (size_t i = 0; i < cols; i++) { + x[i] = (T) i + 1; + } #pragma omp for - for (size_t i = 0; i < rows; i++) { - b[i] = (T) 0; - } - } + for (size_t i = 0; i < rows; i++) { + b[i] = (T) 0; + } + } #if NUMA_MEMCPY - start(&timer, 1, 0); - if (!numa_node_in_is_local) { - x_local = (T*) numa_alloc(sizeof(T)*cols); - allocate_dense(rows, cols, &A_local); - } - stop(&timer, 1); - - if (x_local == NULL) { - return 1; - } - if (A_local == NULL) { - return 1; - } - - if (!numa_node_in_is_local) { - if (numa_node_cpu_memcpy != -1) { - if (numa_run_on_node(numa_node_cpu_memcpy) == -1) { - perror("numa_run_on_node"); - numa_node_cpu_memcpy = -1; - } - } - } - - start(&timer, 2, 0); - if (!numa_node_in_is_local) { - //for (size_t i=0; i < rows; i++ ) { - // memcpy(A_local[i], A[i], cols * sizeof(T)); - //} - memcpy(*A_local, *A, rows * cols * sizeof(T)); - memcpy(x_local, x, cols * sizeof(T)); - } else { - A_local = A; - x_local = x; - } - stop(&timer, 2); - - if (numa_node_cpu != -1) { - if (numa_run_on_node(numa_node_cpu) == -1) { - perror("numa_run_on_node"); - numa_node_cpu = -1; - } - } - - mp_pages[0] = A_local; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(A_local)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { 
- numa_node_local = mp_status[0]; - } + start(&timer, 1, 0); + if (!numa_node_in_is_local) { + x_local = (T *) numa_alloc(sizeof(T) * cols); + allocate_dense(rows, cols, &A_local); + } + stop(&timer, 1); + + if (x_local == NULL) { + return 1; + } + if (A_local == NULL) { + return 1; + } + + if (!numa_node_in_is_local) { + if (numa_node_cpu_memcpy != -1) { + if (numa_run_on_node(numa_node_cpu_memcpy) == + -1) { + perror("numa_run_on_node"); + numa_node_cpu_memcpy = -1; + } + } + } + + start(&timer, 2, 0); + if (!numa_node_in_is_local) { + //for (size_t i=0; i < rows; i++ ) { + // memcpy(A_local[i], A[i], cols * sizeof(T)); + //} + memcpy(*A_local, *A, rows * cols * sizeof(T)); + memcpy(x_local, x, cols * sizeof(T)); + } else { + A_local = A; + x_local = x; + } + stop(&timer, 2); + + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } + + mp_pages[0] = A_local; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A_local)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_local = mp_status[0]; + } #endif - unsigned int nr_threads = 0; + unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic - nr_threads++; + nr_threads++; - start(&timer, 0, 0); - gemv(A_local, x_local, rows, cols, &b); - stop(&timer, 0); + start(&timer, 0, 0); + gemv(A_local, x_local, rows, cols, &b); + stop(&timer, 0); #if NUMA_MEMCPY - start(&timer, 3, 0); - if (!numa_node_in_is_local) { - numa_free(x_local, sizeof(T) * cols); - numa_free(*A_local, sizeof(T) * rows * cols); - numa_free(A_local, sizeof(void*) * rows); - } - stop(&timer, 3); + start(&timer, 3, 0); + if (!numa_node_in_is_local) { + numa_free(x_local, sizeof(T) * cols); + numa_free(*A_local, sizeof(T) * rows * cols); + numa_free(A_local, sizeof(void *) * rows); + } + stop(&timer, 3); #endif #if NUMA_MEMCPY - printf("[::] GEMV-CPU-MEMCPY | n_threads=%d 
e_type=%s n_elements=%ld" - " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" - " | throughput_MBps=%f throughput_MOpps=%f", - nr_threads, XSTR(T), rows * cols, - numa_node_in, numa_node_out, numa_node_cpu, numa_node_local, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), - rows * cols * sizeof(T) / timer.time[0], - rows * cols / timer.time[0]); - printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", - timer.time[0], timer.time[1], timer.time[2], timer.time[3], - timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]); + printf + ("[::] GEMV-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld" + " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_node_local=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" + " | throughput_MBps=%f throughput_MOpps=%f", nr_threads, + XSTR(T), rows * cols, numa_node_in, numa_node_out, + numa_node_cpu, numa_node_local, numa_node_cpu_memcpy, + numa_distance(numa_node_in, numa_node_cpu), + numa_distance(numa_node_cpu, numa_node_out), + rows * cols * sizeof(T) / timer.time[0], + rows * cols / timer.time[0]); + printf + (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", + timer.time[0], timer.time[1], timer.time[2], timer.time[3], + timer.time[0] + timer.time[1] + timer.time[2] + + timer.time[3]); #else - printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld" + printf("[::] GEMV-CPU | n_threads=%d e_type=%s n_elements=%ld" #if NUMA - " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" + " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" #endif - " | throughput_MBps=%f", - nr_threads, XSTR(T), rows * cols, + " | 
throughput_MBps=%f", + nr_threads, XSTR(T), rows * cols, #if NUMA - numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), + numa_node_in, numa_node_out, numa_node_cpu, + numa_distance(numa_node_in, numa_node_cpu), + numa_distance(numa_node_cpu, numa_node_out), #endif - rows * cols * sizeof(T) / timer.time[0]); - printf(" throughput_MOpps=%f latency_us=%f\n", - rows * cols / timer.time[0], timer.time[0]); + rows * cols * sizeof(T) / timer.time[0]); + printf(" throughput_MOpps=%f latency_us=%f\n", + rows * cols / timer.time[0], timer.time[0]); #endif - } - + } #if 0 - print_vec(x, rows); - print_mat(A, rows, cols); - print_vec(b, rows); + print_vec(x, rows); + print_mat(A, rows, cols); + print_vec(b, rows); #endif #if TYPE_double || TYPE_float - printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x,cols), sum_vec(b,rows)); + printf("sum(x) = %f, sum(Ax) = %f\n", sum_vec(x, cols), + sum_vec(b, rows)); #else - printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x,cols), sum_vec(b,rows)); + printf("sum(x) = %d, sum(Ax) = %d\n", sum_vec(x, cols), + sum_vec(b, rows)); #endif #if NUMA - numa_free(b, sizeof(T)*rows); - numa_free(x, sizeof(T)*cols); - numa_free(*A, sizeof(T)*rows*cols); - numa_free(A, sizeof(void*)*rows); + numa_free(b, sizeof(T) * rows); + numa_free(x, sizeof(T) * cols); + numa_free(*A, sizeof(T) * rows * cols); + numa_free(A, sizeof(void *) * rows); #else - free(b); - free(x); - free(*A); - free(A); + free(b); + free(x); + free(*A); + free(A); #endif - return 0; + return 0; } -void gemv(T** A, T* x, size_t rows, size_t cols, T** b) { +void gemv(T **A, T *x, size_t rows, size_t cols, T **b) +{ #pragma omp parallel for - for (size_t i = 0; i < rows; i ++ ) - for (size_t j = 0; j < cols; j ++ ) { - (*b)[i] = (*b)[i] + A[i][j]*x[j]; - } + for (size_t i = 0; i < rows; i++) + for (size_t j = 0; j < cols; j++) { + (*b)[i] = (*b)[i] + A[i][j] * x[j]; + } } -void make_hilbert_mat(size_t rows, size_t 
cols, T*** A) { +void make_hilbert_mat(size_t rows, size_t cols, T ***A) +{ #pragma omp parallel for - for (size_t i = 0; i < rows; i++) { - for (size_t j = 0; j < cols; j++) { + for (size_t i = 0; i < rows; i++) { + for (size_t j = 0; j < cols; j++) { #if TYPE_double || TYPE_float - (*A)[i][j] = 1.0/( (T) i + (T) j + 1.0); + (*A)[i][j] = 1.0 / ((T) i + (T) j + 1.0); #else - (*A)[i][j] = (T)(((i+j)%10)); + (*A)[i][j] = (T) (((i + j) % 10)); #endif - } - } + } + } } -T sum_vec(T* vec, size_t rows) { - T sum = 0; +T sum_vec(T *vec, size_t rows) +{ + T sum = 0; #pragma omp parallel for reduction(+:sum) - for (int i = 0; i < rows; i++) sum = sum + vec[i]; - return sum; + for (int i = 0; i < rows; i++) + sum = sum + vec[i]; + return sum; } diff --git a/GEMV/baselines/cpu/run-perf.sh b/GEMV/baselines/cpu/run-perf.sh new file mode 100755 index 0000000..5eae822 --- /dev/null +++ b/GEMV/baselines/cpu/run-perf.sh @@ -0,0 +1,6 @@ +#!/bin/zsh + +make -B numa=1 + +OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./gemv 4 4 4 +OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./gemv 4 4 4 diff --git a/GEMV/dpu/task.c b/GEMV/dpu/task.c index 0226437..3bf52e8 100644 --- a/GEMV/dpu/task.c +++ b/GEMV/dpu/task.c @@ -17,7 +17,8 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS; // GEMV -static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) { +static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) +{ for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) { bufferC[pos] += bufferA[i] * bufferB[i]; } @@ -28,13 +29,14 @@ static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) { BARRIER_INIT(my_barrier, NR_TASKLETS); // main -int main() { +int main() +{ unsigned int tasklet_id = me(); #if PRINT // printf("tasklet_id = %u\n", tasklet_id); #endif - if (tasklet_id == 0){ // Initialize once the cycle counter - mem_reset(); // Reset the 
heap + if (tasklet_id == 0) { // Initialize once the cycle counter + mem_reset(); // Reset the heap } // Barrier barrier_wait(&my_barrier); @@ -44,15 +46,15 @@ int main() { uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows; uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows; - unsigned int element_per_cacheC = 8/sizeof(T); + unsigned int element_per_cacheC = 8 / sizeof(T); unsigned int nrows = nr_rows; - unsigned int rows_per_tasklet; + unsigned int rows_per_tasklet; unsigned int start_row; unsigned int chunks = nrows / (NR_TASKLETS * element_per_cacheC); - unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks; + unsigned int dbl_chunks = chunks * element_per_cacheC; //chunks + chunks; rows_per_tasklet = dbl_chunks; - unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS); + unsigned int rest_rows = nrows % (NR_TASKLETS * element_per_cacheC); //(NR_TASKLETS + NR_TASKLETS); if ((tasklet_id * element_per_cacheC) < rest_rows) rows_per_tasklet += element_per_cacheC; @@ -60,22 +62,32 @@ int main() { if ((tasklet_id * element_per_cacheC) >= rest_rows) { // unsigned int hlf_rest_rows = rest_rows >> 1; if ((rest_rows % element_per_cacheC) != 0) - start_row = roundup(rest_rows, element_per_cacheC) + tasklet_id * dbl_chunks; - // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks; + start_row = + roundup(rest_rows, + element_per_cacheC) + + tasklet_id * dbl_chunks; + // start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks; else - start_row = rest_rows + tasklet_id * dbl_chunks; - // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks; - } else - start_row = tasklet_id * (dbl_chunks + element_per_cacheC); - // start_row = tasklet_id * (dbl_chunks + 2); + start_row = rest_rows + tasklet_id * dbl_chunks; + // start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * 
dbl_chunks; + } else + start_row = + tasklet_id * (dbl_chunks + element_per_cacheC); + // start_row = tasklet_id * (dbl_chunks + 2); } else { start_row = tasklet_id * (dbl_chunks); } // Address of the current row in MRAM - uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T)); - uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T)); - uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T)); + uint32_t mram_base_addr_A = + (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T)); + uint32_t mram_base_addr_B = + (uint32_t) (DPU_MRAM_HEAP_POINTER + + max_rows * n_size_pad * sizeof(T)); + uint32_t mram_base_addr_C = + (uint32_t) (DPU_MRAM_HEAP_POINTER + + max_rows * n_size_pad * sizeof(T) + + n_size_pad * sizeof(T) + start_row * sizeof(T)); uint32_t mram_temp_addr_A = mram_base_addr_A; uint32_t mram_temp_addr_B = mram_base_addr_B; @@ -87,55 +99,65 @@ int main() { int offset = 0; - #if PRINT - printf("id: %d, rows_per_tasklet = %d\n",tasklet_id, rows_per_tasklet); - printf("id: %d, start_row = %d\n",tasklet_id, start_row); - #endif +#if PRINT + printf("id: %d, rows_per_tasklet = %d\n", tasklet_id, rows_per_tasklet); + printf("id: %d, start_row = %d\n", tasklet_id, start_row); +#endif // Iterate over nr_rows // for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) { - for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += element_per_cacheC) { + for (unsigned int i = start_row; i < start_row + rows_per_tasklet; + i += element_per_cacheC) { - mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T)); + mram_temp_addr_A = + (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T)); mram_temp_addr_B = mram_base_addr_B; // cache_C[0] = 0; // cache_C[1] = 0; // clear the cache - for(unsigned int c = 0; c < element_per_cacheC; 
c++){ - cache_C[c] = 0; + for (unsigned int c = 0; c < element_per_cacheC; c++) { + cache_C[c] = 0; } // for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){ // for(unsigned int pos = 0; (pos < element_per_cacheC) && ((i + pos) < (start_row + rows_per_tasklet)); pos++){ // for(unsigned int pos = 0; pos < element_per_cacheC && i + pos < nr_rows; pos++){ - for(unsigned int pos = 0; pos < element_per_cacheC; pos++){ - if(i + pos >= nr_rows){ + for (unsigned int pos = 0; pos < element_per_cacheC; pos++) { + if (i + pos >= nr_rows) { // printf("id: %d, nrows: %d, error\n", tasklet_id, nrows); break; - } + } int n = 0, j; - for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T))) - { - - mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE); - mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE); - - if(offset) - { - - for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++) - { + for (n = 0; + n < (int32_t) (n_size - (BLOCK_SIZE / sizeof(T))); + n += (BLOCK_SIZE / sizeof(T))) { + + mram_read((__mram_ptr void const + *)(mram_temp_addr_A), cache_A, + BLOCK_SIZE); + mram_read((__mram_ptr void const + *)(mram_temp_addr_B), cache_B, + BLOCK_SIZE); + + if (offset) { + + for (unsigned int off = 0; + off < (BLOCK_SIZE / sizeof(T)) - 1; + off++) { cache_A[off] = cache_A[off + 1]; } - mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8); + mram_read((__mram_ptr void const + *)(mram_temp_addr_A + + BLOCK_SIZE), cache_A_aux, + 8); - cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0]; + cache_A[BLOCK_SIZE / sizeof(T) - 1] = + cache_A_aux[0]; } - // Compute GEMV gemv(cache_C, cache_A, cache_B, pos); @@ -144,53 +166,55 @@ int main() { mram_temp_addr_B += BLOCK_SIZE; } - mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE); - + mram_read((__mram_ptr void const *)(mram_temp_addr_A), + cache_A, BLOCK_SIZE); - if(offset) - { - 
for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++) - { + if (offset) { + for (unsigned int off = 0; + off < (BLOCK_SIZE / sizeof(T)) - 1; + off++) { cache_A[off] = cache_A[off + 1]; } - mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8); + mram_read((__mram_ptr void const + *)(mram_temp_addr_A + BLOCK_SIZE), + cache_A_aux, 8); - cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0]; + cache_A[BLOCK_SIZE / sizeof(T) - 1] = + cache_A_aux[0]; } + mram_read((__mram_ptr void const *)(mram_temp_addr_B), + cache_B, BLOCK_SIZE); - mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE); - - for (j = 0; j < (int) (n_size - n); j++) { + for (j = 0; j < (int)(n_size - n); j++) { // Compute GEMV - if(j >= (int)(BLOCK_SIZE / sizeof(T))){ + if (j >= (int)(BLOCK_SIZE / sizeof(T))) { printf("error\n"); break; } cache_C[pos] += cache_A[j] * cache_B[j]; } - - mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T)); + mram_temp_addr_A += + (BLOCK_SIZE - + ((BLOCK_SIZE / sizeof(T)) - + (n_size - n)) * sizeof(T)); mram_temp_addr_B = mram_base_addr_B; - if(mram_temp_addr_A % 8 != 0) - { + if (mram_temp_addr_A % 8 != 0) { offset = 1; - } - else - { + } else { offset = 0; } } // Write cache to current MRAM block - mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8); + mram_write(cache_C, (__mram_ptr void *)(mram_base_addr_C), 8); // Update memory address // mram_base_addr_C += 2 * sizeof(T); - mram_base_addr_C += 8; + mram_base_addr_C += 8; } diff --git a/GEMV/host/app.c b/GEMV/host/app.c index ebd0336..6553774 100644 --- a/GEMV/host/app.c +++ b/GEMV/host/app.c @@ -33,69 +33,69 @@ #define DPU_BINARY "./bin/gemv_dpu" #endif -static T* A; -static T* B; -static T* C; -static T* C_dpu; +static T *A; +static T *B; +static T *C; +static T *C_dpu; // Create input arrays -static void init_data(T* A, T* B, unsigned int m_size, unsigned int n_size) { +static void init_data(T *A, T *B, 
unsigned int m_size, unsigned int n_size) +{ srand(0); - for (unsigned int i = 0; i < m_size * n_size; i++) - { - A[i] = (unsigned int) (rand()%50); + for (unsigned int i = 0; i < m_size * n_size; i++) { + A[i] = (unsigned int)(rand() % 50); } - for (unsigned int i = 0; i < n_size; i++) - { - B[i] = (unsigned int) (rand()%50); + for (unsigned int i = 0; i < n_size; i++) { + B[i] = (unsigned int)(rand() % 50); } } // Compute output in the host -static void gemv_host(T* C, T* A, T* B, unsigned int m_size, unsigned int n_size) { - for (unsigned int i = 0; i < m_size; i++) - { +static void gemv_host(T *C, T *A, T *B, unsigned int m_size, + unsigned int n_size) +{ + for (unsigned int i = 0; i < m_size; i++) { C[i] = 0; } for (unsigned int m = 0; m < m_size; m++) { - for (unsigned int n = 0; n < n_size; n++) - { + for (unsigned int n = 0; n < n_size; n++) { C[m] += A[m * n_size + n] * B[n]; } } } // Main of the Host Application -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ struct Params p = input_params(argc, argv); struct dpu_set_t dpu_set, dpu; uint32_t nr_of_dpus; - uint32_t nr_of_ranks; + uint32_t nr_of_ranks; // Timer Timer timer; - int numa_node_rank = -2; + int numa_node_rank = -2; - // Allocate DPUs and load binary + // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD - DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - timer.time[0] = 0; // alloc + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + timer.time[0] = 0; // alloc #endif #if !WITH_LOAD_OVERHEAD - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); - assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); + assert(nr_of_dpus == NR_DPUS); + timer.time[1] = 0; // load #endif #if !WITH_FREE_OVERHEAD - 
timer.time[8] = 0; // free + timer.time[8] = 0; // free #endif #if ENERGY @@ -108,12 +108,13 @@ int main(int argc, char **argv) { unsigned int n_size = p.n_size; // Initialize help data - dpu_info = (struct dpu_info_t *) malloc(NR_DPUS * sizeof(struct dpu_info_t)); - dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(NR_DPUS * sizeof(dpu_arguments_t)); + dpu_info = + (struct dpu_info_t *)malloc(NR_DPUS * sizeof(struct dpu_info_t)); + dpu_arguments_t *input_args = + (dpu_arguments_t *) malloc(NR_DPUS * sizeof(dpu_arguments_t)); uint32_t max_rows_per_dpu = 0; uint32_t n_size_pad = n_size; - if(n_size % 2 == 1) - { + if (n_size % 2 == 1) { n_size_pad++; } @@ -127,7 +128,10 @@ int main(int argc, char **argv) { rows_per_dpu++; if (rest_rows > 0) { if (i >= rest_rows) - prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks; + prev_rows_dpu = + rest_rows * (chunks + 1) + (i - + rest_rows) * + chunks; else prev_rows_dpu = i * (chunks + 1); } else { @@ -136,7 +140,7 @@ int main(int argc, char **argv) { // Keep max rows for parallel transfers uint32_t rows_per_dpu_pad = rows_per_dpu; - if (rows_per_dpu_pad % 2 == 1) // 4-byte elements + if (rows_per_dpu_pad % 2 == 1) // 4-byte elements rows_per_dpu_pad++; if (rows_per_dpu_pad > max_rows_per_dpu) max_rows_per_dpu = rows_per_dpu_pad; @@ -163,20 +167,20 @@ int main(int argc, char **argv) { for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { #if WITH_ALLOC_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 0, 0); } DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 0); } #endif #if WITH_LOAD_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 1, 0); } DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 1); } DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); @@ -186,26 +190,33 @@ int main(int argc, char **argv) { // int 
prev_rank_id = -1; int rank_id = -1; - DPU_FOREACH (dpu_set, dpu) { - rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK; - if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) { + DPU_FOREACH(dpu_set, dpu) { + rank_id = + dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & + DPU_TARGET_MASK; + if ((numa_node_rank != -2) + && numa_node_rank != + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu)))) { numa_node_rank = -1; } else { - numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu))); + numa_node_rank = + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu))); } /* - if (rank_id != prev_rank_id) { - printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); - prev_rank_id = rank_id; - } - */ + if (rank_id != prev_rank_id) { + printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); + prev_rank_id = rank_id; + } + */ } - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 2, 0); } gemv_host(C, A, B, m_size, n_size); - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 2); } if (rep >= p.n_warmup) { @@ -220,23 +231,30 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, + sizeof(dpu_arguments_t), DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 3); } if (rep >= p.n_warmup) { start(&timer, 6, 0); } - // Copy input array and vector i = 0; DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, A + dpu_info[i].prev_rows_dpu * n_size)); + DPU_ASSERT(dpu_prepare_xfer + (dpu, + A + dpu_info[i].prev_rows_dpu * n_size)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * 
sizeof(T), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, 0, + max_rows_per_dpu * n_size_pad * sizeof(T), + DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 6); } if (rep >= p.n_warmup) { @@ -246,12 +264,15 @@ int main(int argc, char **argv) { DPU_FOREACH(dpu_set, dpu, i) { DPU_ASSERT(dpu_prepare_xfer(dpu, B)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, + max_rows_per_dpu * n_size_pad * sizeof(T), + n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); if (rep >= p.n_warmup) { stop(&timer, 7); } - // Run kernel on DPUs if (rep >= p.n_warmup) { start(&timer, 4, 0); @@ -280,89 +301,140 @@ int main(int argc, char **argv) { start(&timer, 5, 0); i = 0; DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu)); + DPU_ASSERT(dpu_prepare_xfer + (dpu, C_dpu + i * max_rows_per_dpu)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) { + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_FROM_DPU, + DPU_MRAM_HEAP_POINTER_NAME, + max_rows_per_dpu * n_size_pad * sizeof(T) + + n_size_pad * sizeof(T), + max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT)); + if (rep >= p.n_warmup) { stop(&timer, 5); } - #if WITH_ALLOC_OVERHEAD #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 8, 0); } #endif DPU_ASSERT(dpu_free(dpu_set)); #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 8); } #endif #endif - // Check output bool status = true; - unsigned int n,j; + unsigned int n, j; i = 0; for (n = 0; n < NR_DPUS; n++) 
{ for (j = 0; j < dpu_info[n].rows_per_dpu; j++) { - if(C[i] != C_dpu[n * max_rows_per_dpu + j]) { + if (C[i] != C_dpu[n * max_rows_per_dpu + j]) { status = false; #if PRINT - // printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]); + // printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]); #endif } i++; } } if (status) { - printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET + "] Outputs are equal\n"); if (rep >= p.n_warmup) { - printf("[::] GEMV-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d", - NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, n_size * m_size); - printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", - WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", - timer.time[0], - timer.time[1], - timer.time[2], - timer.time[3] + timer.time[6] + timer.time[7], - timer.time[4], - timer.time[5], - timer.time[8]); - printf(" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f", - timer.time[3], - timer.time[6], - timer.time[7] - ); - printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", - n_size * m_size * sizeof(T) / timer.time[2], - n_size * m_size * sizeof(T) / (timer.time[4]), - n_size * m_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5] + timer.time[8])); - printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", - n_size * m_size * sizeof(T) / (timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]), - n_size * m_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[6] + 
timer.time[7] + timer.time[4] + timer.time[5]), - n_size * m_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5])); - printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", - n_size * m_size / timer.time[2], - n_size * m_size / (timer.time[4]), - n_size * m_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5] + timer.time[8])); - printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", - n_size * m_size / (timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]), - n_size * m_size / (timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5]), - n_size * m_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5])); + printf + ("[::] GEMV-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d", + NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), + BLOCK_SIZE, n_size * m_size); + printf + (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", + WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, + WITH_FREE_OVERHEAD, numa_node_rank); + printf + ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", + timer.time[0], timer.time[1], + timer.time[2], + timer.time[3] + timer.time[6] + + timer.time[7], timer.time[4], + timer.time[5], timer.time[8]); + printf + (" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f", + timer.time[3], timer.time[6], timer.time[7] + ); + printf + (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", + n_size * m_size * sizeof(T) / + timer.time[2], + n_size * m_size * sizeof(T) / + (timer.time[4]), + n_size * 
m_size * sizeof(T) / + (timer.time[0] + timer.time[1] + + timer.time[3] + timer.time[6] + + timer.time[7] + timer.time[4] + + timer.time[5] + timer.time[8])); + printf + (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", + n_size * m_size * sizeof(T) / + (timer.time[3] + timer.time[6] + + timer.time[7] + timer.time[4] + + timer.time[5]), + n_size * m_size * sizeof(T) / + (timer.time[1] + timer.time[3] + + timer.time[6] + timer.time[7] + + timer.time[4] + timer.time[5]), + n_size * m_size * sizeof(T) / + (timer.time[0] + timer.time[1] + + timer.time[3] + timer.time[6] + + timer.time[7] + timer.time[4] + + timer.time[5])); + printf + (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", + n_size * m_size / timer.time[2], + n_size * m_size / (timer.time[4]), + n_size * m_size / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[6] + + timer.time[7] + + timer.time[4] + + timer.time[5] + + timer.time[8])); + printf + (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", + n_size * m_size / (timer.time[3] + + timer.time[6] + + timer.time[7] + + timer.time[4] + + timer.time[5]), + n_size * m_size / (timer.time[1] + + timer.time[3] + + timer.time[6] + + timer.time[7] + + timer.time[4] + + timer.time[5]), + n_size * m_size / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[6] + + timer.time[7] + + timer.time[4] + + timer.time[5])); } } else { - printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET + "] Outputs differ!\n"); } } #if ENERGY double acc_energy, avg_energy, acc_time, avg_time; - DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy)); + DPU_ASSERT(dpu_probe_get + (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy)); DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy)); 
DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time)); DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time)); @@ -370,15 +442,15 @@ int main(int argc, char **argv) { // Print timing results /* - printf("CPU Version Time (ms): "); - print(&timer, 0, 1); - printf("CPU-DPU Time (ms): "); - print(&timer, 1, p.n_reps); - printf("DPU Kernel Time (ms): "); - print(&timer, 2, p.n_reps); - printf("DPU-CPU Time (ms): "); - print(&timer, 3, p.n_reps); - */ + printf("CPU Version Time (ms): "); + print(&timer, 0, 1); + printf("CPU-DPU Time (ms): "); + print(&timer, 1, p.n_reps); + printf("DPU Kernel Time (ms): "); + print(&timer, 2, p.n_reps); + printf("DPU-CPU Time (ms): "); + print(&timer, 3, p.n_reps); + */ #if ENERGY printf("Energy (J): %f J\t", avg_energy); diff --git a/GEMV/support/common.h b/GEMV/support/common.h index 0deebcb..47a9628 100755 --- a/GEMV/support/common.h +++ b/GEMV/support/common.h @@ -3,17 +3,17 @@ // Structures used by both the host and the dpu to communicate information typedef struct { - uint32_t n_size; - uint32_t n_size_pad; - uint32_t nr_rows; - uint32_t max_rows; + uint32_t n_size; + uint32_t n_size_pad; + uint32_t nr_rows; + uint32_t max_rows; } dpu_arguments_t; // Specific information for each DPU struct dpu_info_t { - uint32_t rows_per_dpu; - uint32_t rows_per_dpu_pad; - uint32_t prev_rows_dpu; + uint32_t rows_per_dpu; + uint32_t rows_per_dpu_pad; + uint32_t prev_rows_dpu; }; struct dpu_info_t *dpu_info; diff --git a/GEMV/support/params.h b/GEMV/support/params.h index 526c71c..c72b0c1 100644 --- a/GEMV/support/params.h +++ b/GEMV/support/params.h @@ -4,53 +4,62 @@ #include "common.h" typedef struct Params { - unsigned int m_size; - unsigned int n_size; - unsigned int n_warmup; - unsigned int n_reps; -}Params; + unsigned int m_size; + unsigned int n_size; + unsigned int n_warmup; + unsigned int n_reps; +} Params; -static void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" 
- "\n -h help" - "\n -w <W> # of untimed warmup iterations (default=1)" - "\n -e <E> # of timed repetition iterations (default=3)" - "\n" - "\nBenchmark-specific options:" - "\n -m <I> m_size (default=8192 elements)" - "\n -n <I> n_size (default=8192 elements)" - "\n"); +static void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n" + "\nBenchmark-specific options:" + "\n -m <I> m_size (default=8192 elements)" + "\n -n <I> n_size (default=8192 elements)" "\n"); } -struct Params input_params(int argc, char **argv) { - struct Params p; - p.m_size = 8192; - p.n_size = 8192; - p.n_warmup = 1; - p.n_reps = 3; +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.m_size = 8192; + p.n_size = 8192; + p.n_warmup = 1; + p.n_reps = 3; - int opt; - while((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'm': p.m_size = atoi(optarg); break; - case 'n': p.n_size = atoi(optarg); break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(NR_DPUS > 0 && "Invalid # of dpus!"); + int opt; + while ((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'm': + p.m_size = atoi(optarg); + break; + case 'n': + p.n_size = atoi(optarg); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(NR_DPUS > 0 && "Invalid # of dpus!"); - return p; + return p; } #endif diff --git a/GEMV/support/timer.h b/GEMV/support/timer.h index 99d79f4..b2b9148 100755 --- a/GEMV/support/timer.h +++ 
b/GEMV/support/timer.h @@ -1,69 +1,74 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[9];
- struct timeval stopTime[9];
- double time[9];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
- //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
-
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + */ + +#include <sys/time.h> +typedef struct Timer { + struct timeval startTime[9]; + struct timeval stopTime[9]; + double time[9]; +} Timer; + +void start(Timer *timer, int i, int rep) +{ + if (rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) +{ + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += + (timer->stopTime[i].tv_sec - + timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); + + //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 + + // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000); +} + +void print(Timer *timer, int i, int REP) +{ + printf("%f\t", timer->time[i] / (1000 * REP)); +} + +void printall(Timer *timer, int maxt) +{ + for (int i = 0; i <= maxt; i++) { + printf(" timer%d_us=%f", i, timer->time[i]); + } + printf("\n"); +} diff --git a/HST-S/baselines/cpu/app_baseline.c b/HST-S/baselines/cpu/app_baseline.c index 745e384..bb4e28a 100644 --- a/HST-S/baselines/cpu/app_baseline.c +++ b/HST-S/baselines/cpu/app_baseline.c @@ -24,10 +24,10 @@ #include <numaif.h> #include <numa.h> -struct bitmask* bitmask_in; -struct bitmask* bitmask_out; +struct bitmask *bitmask_in; +struct bitmask *bitmask_out; -void* mp_pages[1]; +void *mp_pages[1]; int mp_status[1]; int mp_nodes[1]; int numa_node_in = -1; @@ -41,7 +41,6 @@ int numa_node_local = -1; int numa_node_in_is_local = 0; #endif - #include "../../support/common.h" #include "../../support/timer.h" @@ -49,364 +48,399 @@ int numa_node_in_is_local = 0; #define STR(x) #x // Pointer declaration -static T* A; +static T *A; static T *A_local; -static unsigned int* histo_host; +static unsigned int *histo_host; typedef struct Params { - unsigned int input_size; - unsigned int bins; - int n_warmup; - int n_reps; - const char *file_name; - int exp; - int n_threads; + unsigned int input_size; + unsigned int bins; + int 
n_warmup; + int n_reps; + const char *file_name; + int exp; + int n_threads; #if NUMA - struct bitmask* bitmask_in; - struct bitmask* bitmask_out; - int numa_node_cpu; + struct bitmask *bitmask_in; + struct bitmask *bitmask_out; + int numa_node_cpu; #endif #if NUMA_MEMCPY - int numa_node_cpu_memcpy; - struct bitmask* bitmask_cpu; + int numa_node_cpu_memcpy; + struct bitmask *bitmask_cpu; #endif -}Params; +} Params; /** * @brief creates input arrays * @param nr_elements how many elements in input arrays */ -static void read_input(T* A, const Params p) { - - char dctFileName[100]; - FILE *File = NULL; - - // Open input file - unsigned short temp; - sprintf(dctFileName, "%s", p.file_name); - if((File = fopen(dctFileName, "rb")) != NULL) { - for(unsigned int y = 0; y < p.input_size; y++) { - if (fread(&temp, sizeof(unsigned short), 1, File) == 1) { - A[y] = (unsigned int)ByteSwap16(temp); - if(A[y] >= 4096) - A[y] = 4095; - } else { - //printf("out of bounds read at offset %d -- seeking back to 0\n", y); - rewind(File); - } - } - fclose(File); - } else { - printf("%s does not exist\n", dctFileName); - exit(1); - } +static void read_input(T *A, const Params p) +{ + + char dctFileName[100]; + FILE *File = NULL; + + // Open input file + unsigned short temp; + sprintf(dctFileName, "%s", p.file_name); + if ((File = fopen(dctFileName, "rb")) != NULL) { + for (unsigned int y = 0; y < p.input_size; y++) { + if (fread(&temp, sizeof(unsigned short), 1, File) == 1) { + A[y] = (unsigned int)ByteSwap16(temp); + if (A[y] >= 4096) + A[y] = 4095; + } else { + //printf("out of bounds read at offset %d -- seeking back to 0\n", y); + rewind(File); + } + } + fclose(File); + } else { + printf("%s does not exist\n", dctFileName); + exit(1); + } } /** * @brief compute output in the host */ -static void histogram_host(unsigned int* histo, T* A, unsigned int bins, unsigned int nr_elements, int exp, unsigned int nr_of_dpus, int t) { - - omp_set_num_threads(t); - - if(!exp){ - #pragma omp 
parallel for - for (unsigned int i = 0; i < nr_of_dpus; i++) { - for (unsigned int j = 0; j < nr_elements; j++) { - T d = A[j]; - histo[i * bins + ((d * bins) >> DEPTH)] += 1; - } - } - } - else{ - #pragma omp parallel for - for (unsigned int j = 0; j < nr_elements; j++) { - T d = A[j]; - #pragma omp atomic update - histo[(d * bins) >> DEPTH] += 1; - } - } +static void histogram_host(unsigned int *histo, T *A, unsigned int bins, + unsigned int nr_elements, int exp, + unsigned int nr_of_dpus, int t) +{ + + omp_set_num_threads(t); + + if (!exp) { +#pragma omp parallel for + for (unsigned int i = 0; i < nr_of_dpus; i++) { + for (unsigned int j = 0; j < nr_elements; j++) { + T d = A[j]; + histo[i * bins + ((d * bins) >> DEPTH)] += 1; + } + } + } else { +#pragma omp parallel for + for (unsigned int j = 0; j < nr_elements; j++) { + T d = A[j]; +#pragma omp atomic update + histo[(d * bins) >> DEPTH] += 1; + } + } } // Params --------------------------------------------------------------------- -void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -w <W> # of untimed warmup iterations (default=1)" - "\n -e <E> # of timed repetition iterations (default=3)" - "\n -t <T> # of threads (default=8)" - "\n -x <X> Weak (0) or strong (1) scaling (default=0)" - "\n" - "\nBenchmark-specific options:" - "\n -i <I> input size (default=1536*1024 elements)" - "\n -b <B> histogram size (default=256 bins)" - "\n -f <F> input image file (default=../input/image_VanHateren.iml)" - "\n"); +void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n -t <T> # of threads (default=8)" + "\n -x <X> Weak (0) or strong (1) scaling (default=0)" + "\n" + "\nBenchmark-specific options:" + "\n -i <I> input size (default=1536*1024 elements)" + "\n -b <B> histogram size 
(default=256 bins)" + "\n -f <F> input image file (default=../input/image_VanHateren.iml)" + "\n"); } -struct Params input_params(int argc, char **argv) { - struct Params p; - p.input_size = 1536 * 1024; - p.bins = 256; - p.n_warmup = 1; - p.n_reps = 3; - p.n_threads = 8; - p.exp = 1; - p.file_name = "../../input/image_VanHateren.iml"; +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.input_size = 1536 * 1024; + p.bins = 256; + p.n_warmup = 1; + p.n_reps = 3; + p.n_threads = 8; + p.exp = 1; + p.file_name = "../../input/image_VanHateren.iml"; #if NUMA - p.bitmask_in = NULL; - p.bitmask_out = NULL; - p.numa_node_cpu = -1; + p.bitmask_in = NULL; + p.bitmask_out = NULL; + p.numa_node_cpu = -1; #endif #if NUMA_MEMCPY - p.numa_node_cpu_memcpy = -1; - p.bitmask_cpu = NULL; + p.numa_node_cpu_memcpy = -1; + p.bitmask_cpu = NULL; #endif - int opt; - while((opt = getopt(argc, argv, "hi:b:w:e:f:x:t:A:B:C:D:M:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'i': p.input_size = atoi(optarg); break; - case 'b': p.bins = atoi(optarg); break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - case 'f': p.file_name = optarg; break; - case 'x': p.exp = atoi(optarg); break; - case 't': p.n_threads = atoi(optarg); break; + int opt; + while ((opt = getopt(argc, argv, "hi:b:w:e:f:x:t:A:B:C:D:M:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'i': + p.input_size = atoi(optarg); + break; + case 'b': + p.bins = atoi(optarg); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + case 'f': + p.file_name = optarg; + break; + case 'x': + p.exp = atoi(optarg); + break; + case 't': + p.n_threads = atoi(optarg); + break; #if NUMA - case 'A': p.bitmask_in = numa_parse_nodestring(optarg); break; - case 'B': p.bitmask_out = numa_parse_nodestring(optarg); break; - case 'C': p.numa_node_cpu = atoi(optarg); break; + case 
'A': + p.bitmask_in = numa_parse_nodestring(optarg); + break; + case 'B': + p.bitmask_out = numa_parse_nodestring(optarg); + break; + case 'C': + p.numa_node_cpu = atoi(optarg); + break; #if NUMA_MEMCPY - case 'D': p.bitmask_cpu = numa_parse_nodestring(optarg); break; - case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break; -#endif // NUMA_MEMCPY -#endif // NUMA - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(p.n_threads > 0 && "Invalid # of ranks!"); - - return p; + case 'D': + p.bitmask_cpu = numa_parse_nodestring(optarg); + break; + case 'M': + p.numa_node_cpu_memcpy = atoi(optarg); + break; +#endif // NUMA_MEMCPY +#endif // NUMA + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(p.n_threads > 0 && "Invalid # of ranks!"); + + return p; } /** * @brief Main of the Host Application. */ -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ + + struct Params p = input_params(argc, argv); - struct Params p = input_params(argc, argv); + uint32_t nr_of_dpus; - uint32_t nr_of_dpus; - - const unsigned int input_size = p.input_size; // Size of input image - if(!p.exp) - assert(input_size % p.n_threads == 0 && "Input size!"); - else - assert(input_size % p.n_threads == 0 && "Input size!"); + const unsigned int input_size = p.input_size; // Size of input image + if (!p.exp) + assert(input_size % p.n_threads == 0 && "Input size!"); + else + assert(input_size % p.n_threads == 0 && "Input size!"); - // Input/output allocation + // Input/output allocation #if NUMA - if (p.bitmask_in) { - numa_set_membind(p.bitmask_in); - numa_free_nodemask(p.bitmask_in); - } - A = numa_alloc(input_size * sizeof(T)); + if (p.bitmask_in) { + numa_set_membind(p.bitmask_in); + numa_free_nodemask(p.bitmask_in); + } + A = numa_alloc(input_size * sizeof(T)); #else - A = malloc(input_size * sizeof(T)); + A = malloc(input_size * sizeof(T)); #endif - // Create an input file with arbitrary data. 
- read_input(A, p); + // Create an input file with arbitrary data. + read_input(A, p); #if NUMA - if (p.bitmask_out) { - numa_set_membind(p.bitmask_out); - numa_free_nodemask(p.bitmask_out); - } + if (p.bitmask_out) { + numa_set_membind(p.bitmask_out); + numa_free_nodemask(p.bitmask_out); + } #endif - if(!p.exp) { - // upstream code left nr_of_dpus uninitialized - nr_of_dpus = p.n_threads; + if (!p.exp) { + // upstream code left nr_of_dpus uninitialized + nr_of_dpus = p.n_threads; #if NUMA - histo_host = numa_alloc(nr_of_dpus * p.bins * sizeof(unsigned int)); + histo_host = + numa_alloc(nr_of_dpus * p.bins * sizeof(unsigned int)); #else - histo_host = malloc(nr_of_dpus * p.bins * sizeof(unsigned int)); + histo_host = malloc(nr_of_dpus * p.bins * sizeof(unsigned int)); #endif - } else { + } else { #if NUMA - histo_host = numa_alloc(p.bins * sizeof(unsigned int)); + histo_host = numa_alloc(p.bins * sizeof(unsigned int)); #else - histo_host = malloc(p.bins * sizeof(unsigned int)); + histo_host = malloc(p.bins * sizeof(unsigned int)); #endif - } + } #if NUMA #if NUMA_MEMCPY - if (p.bitmask_cpu) { - numa_set_membind(p.bitmask_cpu); - numa_free_nodemask(p.bitmask_cpu); - } + if (p.bitmask_cpu) { + numa_set_membind(p.bitmask_cpu); + numa_free_nodemask(p.bitmask_cpu); + } #else - struct bitmask *bitmask_all = numa_allocate_nodemask(); - numa_bitmask_setall(bitmask_all); - numa_set_membind(bitmask_all); - numa_free_nodemask(bitmask_all); -#endif // NUMA_MEMCPY -#endif // NUMA + struct bitmask *bitmask_all = numa_allocate_nodemask(); + numa_bitmask_setall(bitmask_all); + numa_set_membind(bitmask_all); + numa_free_nodemask(bitmask_all); +#endif // NUMA_MEMCPY +#endif // NUMA #if NUMA - mp_pages[0] = A; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(A)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_in = mp_status[0]; - } - - mp_pages[0] = histo_host; - if (move_pages(0, 1, 
mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(C)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_out = mp_status[0]; - } - - numa_node_cpu = p.numa_node_cpu; - if (numa_node_cpu != -1) { - if (numa_run_on_node(numa_node_cpu) == -1) { - perror("numa_run_on_node"); - numa_node_cpu = -1; - } - } + mp_pages[0] = A; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_in = mp_status[0]; + } + + mp_pages[0] = histo_host; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(C)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_out = mp_status[0]; + } + + numa_node_cpu = p.numa_node_cpu; + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } #endif #if NUMA_MEMCPY - numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1; + numa_node_in_is_local = ((numa_node_cpu == numa_node_in) + || (numa_node_cpu + 8 == numa_node_in)) * 1; #endif - Timer timer; + Timer timer; #if NUMA_MEMCPY - numa_node_cpu_memcpy = p.numa_node_cpu_memcpy; - start(&timer, 1, 0); - if (!numa_node_in_is_local) { - A_local = (T*) numa_alloc(input_size * sizeof(T)); - } - stop(&timer, 1); - if (!numa_node_in_is_local) { - if (p.numa_node_cpu_memcpy != -1) { - if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) { - perror("numa_run_on_node"); - numa_node_cpu_memcpy = -1; - } - } - } - start(&timer, 2, 0); - if (!numa_node_in_is_local) { - memcpy(A_local, A, input_size * sizeof(T)); - } else { - A_local = A; - } - stop(&timer, 2); - if (p.numa_node_cpu != -1) { - if (numa_run_on_node(p.numa_node_cpu) == -1) { - perror("numa_run_on_node"); - numa_node_cpu = -1; - } - } - mp_pages[0] = 
A_local; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(A_local)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_local = mp_status[0]; - } + numa_node_cpu_memcpy = p.numa_node_cpu_memcpy; + start(&timer, 1, 0); + if (!numa_node_in_is_local) { + A_local = (T *) numa_alloc(input_size * sizeof(T)); + } + stop(&timer, 1); + if (!numa_node_in_is_local) { + if (p.numa_node_cpu_memcpy != -1) { + if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) { + perror("numa_run_on_node"); + numa_node_cpu_memcpy = -1; + } + } + } + start(&timer, 2, 0); + if (!numa_node_in_is_local) { + memcpy(A_local, A, input_size * sizeof(T)); + } else { + A_local = A; + } + stop(&timer, 2); + if (p.numa_node_cpu != -1) { + if (numa_run_on_node(p.numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } + mp_pages[0] = A_local; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A_local)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_local = mp_status[0]; + } #else - A_local = A; + A_local = A; #endif - start(&timer, 0, 0); + start(&timer, 0, 0); - if(!p.exp) - memset(histo_host, 0, nr_of_dpus * p.bins * sizeof(unsigned int)); - else - memset(histo_host, 0, p.bins * sizeof(unsigned int)); + if (!p.exp) + memset(histo_host, 0, + nr_of_dpus * p.bins * sizeof(unsigned int)); + else + memset(histo_host, 0, p.bins * sizeof(unsigned int)); - histogram_host(histo_host, A_local, p.bins, input_size, p.exp, nr_of_dpus, p.n_threads); + histogram_host(histo_host, A_local, p.bins, input_size, p.exp, + nr_of_dpus, p.n_threads); - stop(&timer, 0); + stop(&timer, 0); #if NUMA_MEMCPY - start(&timer, 3, 0); - if (!numa_node_in_is_local) { - numa_free(A_local, input_size * sizeof(T)); - } - stop(&timer, 3); + start(&timer, 3, 0); + if (!numa_node_in_is_local) { + numa_free(A_local, input_size * 
sizeof(T)); + } + stop(&timer, 3); #endif - unsigned int nr_threads = 0; + unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic - nr_threads++; + nr_threads++; #if NUMA_MEMCPY - printf("[::] HST-S-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d n_bins=%d" - " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" - " | throughput_MBps=%f", - nr_threads, XSTR(T), input_size, p.exp ? p.bins : p.bins * nr_of_dpus, - numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), - input_size * sizeof(T) / timer.time[0]); - printf(" throughput_MOpps=%f", - input_size / timer.time[0]); - printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", - timer.time[0], timer.time[1], timer.time[2], timer.time[3], - timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]); + printf + ("[::] HST-S-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d n_bins=%d" + " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" + " | throughput_MBps=%f", nr_threads, XSTR(T), input_size, + p.exp ? 
p.bins : p.bins * nr_of_dpus, numa_node_in, + numa_node_local, numa_node_out, numa_node_cpu, + numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), + numa_distance(numa_node_cpu, numa_node_out), + input_size * sizeof(T) / timer.time[0]); + printf(" throughput_MOpps=%f", input_size / timer.time[0]); + printf + (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", + timer.time[0], timer.time[1], timer.time[2], timer.time[3], + timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]); #else - printf("[::] HST-S-CPU | n_threads=%d e_type=%s n_elements=%d n_bins=%d" + printf("[::] HST-S-CPU | n_threads=%d e_type=%s n_elements=%d n_bins=%d" #if NUMA - " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" + " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" #endif - " | throughput_MBps=%f", - nr_threads, XSTR(T), input_size, p.exp ? p.bins : p.bins * nr_of_dpus, + " | throughput_MBps=%f", + nr_threads, XSTR(T), input_size, + p.exp ? 
p.bins : p.bins * nr_of_dpus, #if NUMA - numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), + numa_node_in, numa_node_out, numa_node_cpu, + numa_distance(numa_node_in, numa_node_cpu), + numa_distance(numa_node_cpu, numa_node_out), #endif - input_size * sizeof(T) / timer.time[0]); - printf(" throughput_MOpps=%f latency_us=%f\n", - input_size / timer.time[0], timer.time[0]); -#endif // NUMA_MEMCPY + input_size * sizeof(T) / timer.time[0]); + printf(" throughput_MOpps=%f latency_us=%f\n", + input_size / timer.time[0], timer.time[0]); +#endif // NUMA_MEMCPY #if NUMA - numa_free(A, input_size * sizeof(T)); - if (!p.exp) { - numa_free(histo_host, nr_of_dpus * p.bins * sizeof(unsigned int)); - } else { - numa_free(histo_host, p.bins * sizeof(unsigned int)); - } + numa_free(A, input_size * sizeof(T)); + if (!p.exp) { + numa_free(histo_host, + nr_of_dpus * p.bins * sizeof(unsigned int)); + } else { + numa_free(histo_host, p.bins * sizeof(unsigned int)); + } #else - free(A); - free(histo_host); + free(A); + free(histo_host); #endif - return 0; + return 0; } diff --git a/HST-S/dpu/task.c b/HST-S/dpu/task.c index 135f0d1..0333072 100644 --- a/HST-S/dpu/task.c +++ b/HST-S/dpu/task.c @@ -15,102 +15,121 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS; // Array for communication between adjacent tasklets -uint32_t* message[NR_TASKLETS]; +uint32_t *message[NR_TASKLETS]; // DPU histogram -uint32_t* histo_dpu; +uint32_t *histo_dpu; // Barrier BARRIER_INIT(my_barrier, NR_TASKLETS); // Histogram in each tasklet -static void histogram(uint32_t* histo, uint32_t bins, T *input, unsigned int l_size){ - for(unsigned int j = 0; j < l_size; j++) { - T d = input[j]; - histo[(d * bins) >> DEPTH] += 1; - } +static void histogram(uint32_t *histo, uint32_t bins, T *input, + unsigned int l_size) +{ + for (unsigned int j = 0; j < l_size; j++) { + T d = input[j]; + histo[(d * bins) >> DEPTH] += 1; + } } extern int 
main_kernel1(void); -int (*kernels[nr_kernels])(void) = {main_kernel1}; +int (*kernels[nr_kernels])(void) = { main_kernel1 }; -int main(void) { - // Kernel - return kernels[DPU_INPUT_ARGUMENTS.kernel](); +int main(void) +{ + // Kernel + return kernels[DPU_INPUT_ARGUMENTS.kernel] (); } // main_kernel1 -int main_kernel1() { - unsigned int tasklet_id = me(); +int main_kernel1() +{ + unsigned int tasklet_id = me(); #if PRINT - printf("tasklet_id = %u\n", tasklet_id); + printf("tasklet_id = %u\n", tasklet_id); #endif - if (tasklet_id == 0){ // Initialize once the cycle counter - mem_reset(); // Reset the heap - } - // Barrier - barrier_wait(&my_barrier); - - uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; - uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes - uint32_t bins = DPU_INPUT_ARGUMENTS.bins; - - // Address of the current processing block in MRAM - uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; - uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER; - uint32_t mram_base_addr_histo = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer); - - // Initialize a local cache to store the MRAM block - T *cache_A = (T *) mem_alloc(BLOCK_SIZE); - - // Local histogram - uint32_t *histo = (uint32_t *) mem_alloc(bins * sizeof(uint32_t)); - - // Initialize local histogram - for(unsigned int i = 0; i < bins; i++){ - histo[i] = 0; - } - - // Compute histogram - for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){ - - // Bound checking - uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? 
(input_size_dpu_bytes - byte_index) : BLOCK_SIZE; - - // Load cache with current MRAM block - mram_read((const __mram_ptr void*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes); - - // Histogram in each tasklet - histogram(histo, bins, cache_A, l_size_bytes >> DIV); - - } - message[tasklet_id] = histo; - - // Barrier - barrier_wait(&my_barrier); - - uint32_t *histo_dpu = message[0]; - - for (unsigned int i = tasklet_id; i < bins; i += NR_TASKLETS){ - uint32_t b = 0; - for (unsigned int j = 0; j < NR_TASKLETS; j++){ - b += *(message[j] + i); - } - histo_dpu[i] = b; - } - - // Barrier - barrier_wait(&my_barrier); - - // Write dpu histogram to current MRAM block - if(tasklet_id == 0){ - if(bins * sizeof(uint32_t) <= 2048) - mram_write(histo_dpu, (__mram_ptr void*)(mram_base_addr_histo), bins * sizeof(uint32_t)); - else - for(unsigned int offset = 0; offset < ((bins * sizeof(uint32_t)) >> 11); offset++){ - mram_write(histo_dpu + (offset << 9), (__mram_ptr void*)(mram_base_addr_histo + (offset << 11)), 2048); - } - } - - return 0; + if (tasklet_id == 0) { // Initialize once the cycle counter + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + + uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; + uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes + uint32_t bins = DPU_INPUT_ARGUMENTS.bins; + + // Address of the current processing block in MRAM + uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; + uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER; + uint32_t mram_base_addr_histo = + (uint32_t) (DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer); + + // Initialize a local cache to store the MRAM block + T *cache_A = (T *) mem_alloc(BLOCK_SIZE); + + // Local histogram + uint32_t *histo = (uint32_t *) mem_alloc(bins * sizeof(uint32_t)); + + // Initialize local histogram + for (unsigned int i = 0; i < bins; i++) { + histo[i] = 0; + } + + // Compute 
histogram + for (unsigned int byte_index = base_tasklet; + byte_index < input_size_dpu_bytes; + byte_index += BLOCK_SIZE * NR_TASKLETS) { + + // Bound checking + uint32_t l_size_bytes = + (byte_index + BLOCK_SIZE >= + input_size_dpu_bytes) ? (input_size_dpu_bytes - + byte_index) : BLOCK_SIZE; + + // Load cache with current MRAM block + mram_read((const __mram_ptr void *)(mram_base_addr_A + + byte_index), cache_A, + l_size_bytes); + + // Histogram in each tasklet + histogram(histo, bins, cache_A, l_size_bytes >> DIV); + + } + message[tasklet_id] = histo; + + // Barrier + barrier_wait(&my_barrier); + + uint32_t *histo_dpu = message[0]; + + for (unsigned int i = tasklet_id; i < bins; i += NR_TASKLETS) { + uint32_t b = 0; + for (unsigned int j = 0; j < NR_TASKLETS; j++) { + b += *(message[j] + i); + } + histo_dpu[i] = b; + } + + // Barrier + barrier_wait(&my_barrier); + + // Write dpu histogram to current MRAM block + if (tasklet_id == 0) { + if (bins * sizeof(uint32_t) <= 2048) + mram_write(histo_dpu, + (__mram_ptr void *)(mram_base_addr_histo), + bins * sizeof(uint32_t)); + else + for (unsigned int offset = 0; + offset < ((bins * sizeof(uint32_t)) >> 11); + offset++) { + mram_write(histo_dpu + (offset << 9), + (__mram_ptr void + *)(mram_base_addr_histo + + (offset << 11)), 2048); + } + } + + return 0; } diff --git a/HST-S/host/app.c b/HST-S/host/app.c index 2c4e6a5..7f66f6e 100644 --- a/HST-S/host/app.c +++ b/HST-S/host/app.c @@ -40,362 +40,415 @@ #endif // Pointer declaration -static T* A; -static unsigned int* histo_host; -static unsigned int* histo; +static T *A; +static unsigned int *histo_host; +static unsigned int *histo; // Create input arrays -static void read_input(T* A, const Params p) { - - char dctFileName[100]; - FILE *File = NULL; - - // Open input file - unsigned short temp; - sprintf(dctFileName, "%s", p.file_name); - if((File = fopen(dctFileName, "rb")) != NULL) { - for(unsigned int y = 0; y < p.input_size; y++) { - if (fread(&temp, sizeof(unsigned 
short), 1, File) == 1) { - A[y] = (unsigned int)ByteSwap16(temp); - if(A[y] >= 4096) - A[y] = 4095; - } else { - //printf("out of bounds read at offset %d -- seeking back to 0\n", y); - rewind(File); - } - } - fclose(File); - } else { - printf("%s does not exist\n", dctFileName); - exit(1); - } +static void read_input(T *A, const Params p) +{ + + char dctFileName[100]; + FILE *File = NULL; + + // Open input file + unsigned short temp; + sprintf(dctFileName, "%s", p.file_name); + if ((File = fopen(dctFileName, "rb")) != NULL) { + for (unsigned int y = 0; y < p.input_size; y++) { + if (fread(&temp, sizeof(unsigned short), 1, File) == 1) { + A[y] = (unsigned int)ByteSwap16(temp); + if (A[y] >= 4096) + A[y] = 4095; + } else { + //printf("out of bounds read at offset %d -- seeking back to 0\n", y); + rewind(File); + } + } + fclose(File); + } else { + printf("%s does not exist\n", dctFileName); + exit(1); + } } // Compute output in the host -static void histogram_host(unsigned int* histo, T* A, unsigned int bins, unsigned int nr_elements, int exp, unsigned int nr_of_dpus) { - if(!exp){ - for (unsigned int i = 0; i < nr_of_dpus; i++) { - for (unsigned int j = 0; j < nr_elements; j++) { - T d = A[j]; - histo[i * bins + ((d * bins) >> DEPTH)] += 1; - } - } - } - else{ - for (unsigned int j = 0; j < nr_elements; j++) { - T d = A[j]; - histo[(d * bins) >> DEPTH] += 1; - } - } +static void histogram_host(unsigned int *histo, T *A, unsigned int bins, + unsigned int nr_elements, int exp, + unsigned int nr_of_dpus) +{ + if (!exp) { + for (unsigned int i = 0; i < nr_of_dpus; i++) { + for (unsigned int j = 0; j < nr_elements; j++) { + T d = A[j]; + histo[i * bins + ((d * bins) >> DEPTH)] += 1; + } + } + } else { + for (unsigned int j = 0; j < nr_elements; j++) { + T d = A[j]; + histo[(d * bins) >> DEPTH] += 1; + } + } } // Main of the Host Application -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ - struct Params p = input_params(argc, argv); + struct Params 
p = input_params(argc, argv); - struct dpu_set_t dpu_set, dpu; - uint32_t nr_of_dpus; - uint32_t nr_of_ranks; + struct dpu_set_t dpu_set, dpu; + uint32_t nr_of_dpus; + uint32_t nr_of_ranks; #if ENERGY - struct dpu_probe_t probe; - DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); + struct dpu_probe_t probe; + DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); #endif - // Timer declaration - Timer timer; + // Timer declaration + Timer timer; - int numa_node_rank = -2; + int numa_node_rank = -2; - // Allocate DPUs and load binary + // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD - DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set)); - timer.time[0] = 0; // alloc + DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set)); + timer.time[0] = 0; // alloc #endif #if !WITH_LOAD_OVERHEAD - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); - assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); + assert(nr_of_dpus == NR_DPUS); + timer.time[1] = 0; // load #endif #if !WITH_FREE_OVERHEAD - timer.time[6] = 0; // free + timer.time[6] = 0; // free #endif - unsigned int i = 0; - unsigned int input_size; // Size of input image - unsigned int dpu_s = p.dpu_s; - if(p.exp == 0) - input_size = p.input_size * NR_DPUS; // Size of input image - else if(p.exp == 1) - input_size = p.input_size; // Size of input image - else - input_size = p.input_size * dpu_s; // Size of input image - - const unsigned int input_size_8bytes = - ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned - const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.) 
- const unsigned int input_size_dpu_8bytes = - ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned - - // Input/output allocation - A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); - T *bufferA = A; - histo_host = malloc(p.bins * sizeof(unsigned int)); - histo = malloc(NR_DPUS * p.bins * sizeof(unsigned int)); - - // Create an input file with arbitrary data - read_input(A, p); - if(p.exp == 0){ - for(unsigned int j = 1; j < NR_DPUS; j++){ - memcpy(&A[j * input_size_dpu_8bytes], &A[0], input_size_dpu_8bytes * sizeof(T)); - } - } - else if(p.exp == 2){ - for(unsigned int j = 1; j < dpu_s; j++) - memcpy(&A[j * p.input_size], &A[0], p.input_size * sizeof(T)); - } - - // Loop over main kernel - for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { - memset(histo_host, 0, p.bins * sizeof(unsigned int)); - memset(histo, 0, NR_DPUS * p.bins * sizeof(unsigned int)); + unsigned int i = 0; + unsigned int input_size; // Size of input image + unsigned int dpu_s = p.dpu_s; + if (p.exp == 0) + input_size = p.input_size * NR_DPUS; // Size of input image + else if (p.exp == 1) + input_size = p.input_size; // Size of input image + else + input_size = p.input_size * dpu_s; // Size of input image + + const unsigned int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned + const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.) + const unsigned int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ? 
roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned + + // Input/output allocation + A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + T *bufferA = A; + histo_host = malloc(p.bins * sizeof(unsigned int)); + histo = malloc(NR_DPUS * p.bins * sizeof(unsigned int)); + + // Create an input file with arbitrary data + read_input(A, p); + if (p.exp == 0) { + for (unsigned int j = 1; j < NR_DPUS; j++) { + memcpy(&A[j * input_size_dpu_8bytes], &A[0], + input_size_dpu_8bytes * sizeof(T)); + } + } else if (p.exp == 2) { + for (unsigned int j = 1; j < dpu_s; j++) + memcpy(&A[j * p.input_size], &A[0], + p.input_size * sizeof(T)); + } + // Loop over main kernel + for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + memset(histo_host, 0, p.bins * sizeof(unsigned int)); + memset(histo, 0, NR_DPUS * p.bins * sizeof(unsigned int)); #if WITH_ALLOC_OVERHEAD - if(rep >= p.n_warmup) { - start(&timer, 0, 0); - } - DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set)); - if(rep >= p.n_warmup) { - stop(&timer, 0); - } + if (rep >= p.n_warmup) { + start(&timer, 0, 0); + } + DPU_ASSERT(dpu_alloc(NR_DPUS, DPU_ALLOC_PROFILE, &dpu_set)); + if (rep >= p.n_warmup) { + stop(&timer, 0); + } #endif #if WITH_DPUINFO - printf("DPUs:"); - DPU_FOREACH (dpu_set, dpu) { - int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK; - int slice = dpu_get_slice_id(dpu_from_set(dpu)); - int member = dpu_get_member_id(dpu_from_set(dpu)); - printf(" %d(%d.%d)", rank, slice, member); - } - printf("\n"); + printf("DPUs:"); + DPU_FOREACH(dpu_set, dpu) { + int rank = + dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & + DPU_TARGET_MASK; + int slice = dpu_get_slice_id(dpu_from_set(dpu)); + int member = dpu_get_member_id(dpu_from_set(dpu)); + printf(" %d(%d.%d)", rank, slice, member); + } + printf("\n"); #endif #if WITH_LOAD_OVERHEAD - if(rep >= p.n_warmup) { - start(&timer, 1, 0); - } - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - 
if(rep >= p.n_warmup) { - stop(&timer, 1); - } - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); - assert(nr_of_dpus == NR_DPUS); + if (rep >= p.n_warmup) { + start(&timer, 1, 0); + } + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + if (rep >= p.n_warmup) { + stop(&timer, 1); + } + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); + assert(nr_of_dpus == NR_DPUS); #endif - // int prev_rank_id = -1; - int rank_id = -1; - DPU_FOREACH (dpu_set, dpu) { - rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK; - if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) { - numa_node_rank = -1; - } else { - numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu))); - } - /* - if (rank_id != prev_rank_id) { - printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); - prev_rank_id = rank_id; - } - */ - } - - // Compute output on CPU (performance comparison and verification purposes) - if(rep >= p.n_warmup) { - start(&timer, 2, 0); - } - histogram_host(histo_host, A, p.bins, p.input_size, 1, NR_DPUS); - if(rep >= p.n_warmup) { - stop(&timer, 2); - } - - if(rep >= p.n_warmup) { - start(&timer, 3, 0); - } - // Input arguments - unsigned int kernel = 0; - i = 0; - dpu_arguments_t input_arguments[NR_DPUS]; - for(i=0; i<NR_DPUS-1; i++) { - input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); - input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T); - input_arguments[i].bins=p.bins; - input_arguments[i].kernel=kernel; - } - input_arguments[NR_DPUS-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); - input_arguments[NR_DPUS-1].transfer_size=input_size_dpu_8bytes * sizeof(T); - input_arguments[NR_DPUS-1].bins=p.bins; - input_arguments[NR_DPUS-1].kernel=kernel; - - // Copy input arrays - i = 0; - DPU_FOREACH(dpu_set, dpu, i) { - 
DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i])); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT)); - DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i)); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) { - stop(&timer, 3); - } - - // Run DPU kernel - if(rep >= p.n_warmup) { - start(&timer, 4, 0); - #if ENERGY - DPU_ASSERT(dpu_probe_start(&probe)); - #endif - } - - DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - if(rep >= p.n_warmup) { - stop(&timer, 4); - #if ENERGY - DPU_ASSERT(dpu_probe_stop(&probe)); - #endif - } + // int prev_rank_id = -1; + int rank_id = -1; + DPU_FOREACH(dpu_set, dpu) { + rank_id = + dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & + DPU_TARGET_MASK; + if ((numa_node_rank != -2) + && numa_node_rank != + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu)))) { + numa_node_rank = -1; + } else { + numa_node_rank = + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu))); + } + /* + if (rank_id != prev_rank_id) { + printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); + prev_rank_id = rank_id; + } + */ + } + + // Compute output on CPU (performance comparison and verification purposes) + if (rep >= p.n_warmup) { + start(&timer, 2, 0); + } + histogram_host(histo_host, A, p.bins, p.input_size, 1, NR_DPUS); + if (rep >= p.n_warmup) { + stop(&timer, 2); + } + + if (rep >= p.n_warmup) { + start(&timer, 3, 0); + } + // Input arguments + unsigned int kernel = 0; + i = 0; + dpu_arguments_t input_arguments[NR_DPUS]; + for (i = 0; i < NR_DPUS - 1; i++) { + input_arguments[i].size = + input_size_dpu_8bytes * sizeof(T); + input_arguments[i].transfer_size = + input_size_dpu_8bytes * sizeof(T); + input_arguments[i].bins = p.bins; + input_arguments[i].kernel = kernel; + } + 
input_arguments[NR_DPUS - 1].size = + (input_size_8bytes - + input_size_dpu_8bytes * (NR_DPUS - 1)) * sizeof(T); + input_arguments[NR_DPUS - 1].transfer_size = + input_size_dpu_8bytes * sizeof(T); + input_arguments[NR_DPUS - 1].bins = p.bins; + input_arguments[NR_DPUS - 1].kernel = kernel; + + // Copy input arrays + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i])); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, + sizeof(input_arguments[0]), DPU_XFER_DEFAULT)); + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer + (dpu, bufferA + input_size_dpu_8bytes * i)); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, 0, + input_size_dpu_8bytes * sizeof(T), + DPU_XFER_DEFAULT)); + if (rep >= p.n_warmup) { + stop(&timer, 3); + } + // Run DPU kernel + if (rep >= p.n_warmup) { + start(&timer, 4, 0); +#if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); +#endif + } + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + if (rep >= p.n_warmup) { + stop(&timer, 4); +#if ENERGY + DPU_ASSERT(dpu_probe_stop(&probe)); +#endif + } #if PRINT - { - unsigned int each_dpu = 0; - printf("Display DPU Logs\n"); - DPU_FOREACH (dpu_set, dpu) { - printf("DPU#%d:\n", each_dpu); - DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout)); - each_dpu++; - } - } + { + unsigned int each_dpu = 0; + printf("Display DPU Logs\n"); + DPU_FOREACH(dpu_set, dpu) { + printf("DPU#%d:\n", each_dpu); + DPU_ASSERT(dpulog_read_for_dpu + (dpu.dpu, stdout)); + each_dpu++; + } + } #endif - i = 0; - if(rep >= p.n_warmup) { - start(&timer, 5, 0); - } - // PARALLEL RETRIEVE TRANSFER - DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, histo + p.bins * i)); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), p.bins * sizeof(unsigned int), DPU_XFER_DEFAULT)); - - // Final histogram merging - for(i = 1; i < NR_DPUS; 
i++){ - for(unsigned int j = 0; j < p.bins; j++){ - histo[j] += histo[j + i * p.bins]; - } - } - if(rep >= p.n_warmup) { - stop(&timer, 5); - } - + i = 0; + if (rep >= p.n_warmup) { + start(&timer, 5, 0); + } + // PARALLEL RETRIEVE TRANSFER + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, histo + p.bins * i)); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_FROM_DPU, + DPU_MRAM_HEAP_POINTER_NAME, + input_size_dpu_8bytes * sizeof(T), + p.bins * sizeof(unsigned int), DPU_XFER_DEFAULT)); + + // Final histogram merging + for (i = 1; i < NR_DPUS; i++) { + for (unsigned int j = 0; j < p.bins; j++) { + histo[j] += histo[j + i * p.bins]; + } + } + if (rep >= p.n_warmup) { + stop(&timer, 5); + } #if WITH_ALLOC_OVERHEAD #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { - start(&timer, 6, 0); - } + if (rep >= p.n_warmup) { + start(&timer, 6, 0); + } #endif - DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_free(dpu_set)); #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { - stop(&timer, 6); - } + if (rep >= p.n_warmup) { + stop(&timer, 6); + } #endif #endif - if (rep >= p.n_warmup) { - printf("[::] HST-S-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d n_bins=%d", - nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T), input_size, p.bins); - printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", - WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", - timer.time[0], - timer.time[1], - timer.time[2], - timer.time[3], - timer.time[4], - timer.time[5], - timer.time[6]); - printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", - input_size * sizeof(T) / timer.time[2], - input_size * sizeof(T) / (timer.time[4]), - input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + 
timer.time[4] + timer.time[5] + timer.time[6])); - printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", - input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]), - input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), - input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); - printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", - input_size / timer.time[2], - input_size / (timer.time[4]), - input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); - printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", - input_size / (timer.time[3] + timer.time[4] + timer.time[5]), - input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), - input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); - } - - } - - #if ENERGY - double energy; - DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); - printf("DPU Energy (J): %f\t", energy); - #endif - - // Check output - bool status = true; - if(p.exp == 1) - for (unsigned int j = 0; j < p.bins; j++) { - if(histo_host[j] != histo[j]){ - status = false; + if (rep >= p.n_warmup) { + printf + ("[::] HST-S-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d n_bins=%d", + nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T), + input_size, p.bins); + printf + (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", + WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, + WITH_FREE_OVERHEAD, numa_node_rank); + printf + ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", + timer.time[0], timer.time[1], timer.time[2], + timer.time[3], timer.time[4], 
timer.time[5], + timer.time[6]); + printf + (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", + input_size * sizeof(T) / timer.time[2], + input_size * sizeof(T) / (timer.time[4]), + input_size * sizeof(T) / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5] + + timer.time[6])); + printf + (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", + input_size * sizeof(T) / (timer.time[3] + + timer.time[4] + + timer.time[5]), + input_size * sizeof(T) / (timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5]), + input_size * sizeof(T) / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5])); + printf + (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", + input_size / timer.time[2], + input_size / (timer.time[4]), + input_size / (timer.time[0] + timer.time[1] + + timer.time[3] + timer.time[4] + + timer.time[5] + timer.time[6])); + printf + (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", + input_size / (timer.time[3] + timer.time[4] + + timer.time[5]), + input_size / (timer.time[1] + timer.time[3] + + timer.time[4] + timer.time[5]), + input_size / (timer.time[0] + timer.time[1] + + timer.time[3] + timer.time[4] + + timer.time[5])); + } + + } + +#if ENERGY + double energy; + DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); + printf("DPU Energy (J): %f\t", energy); +#endif + + // Check output + bool status = true; + if (p.exp == 1) + for (unsigned int j = 0; j < p.bins; j++) { + if (histo_host[j] != histo[j]) { + status = false; #if PRINT - printf("%u - %u: %u -- %u\n", j, j, histo_host[j], histo[j]); + printf("%u - %u: %u -- %u\n", j, j, + histo_host[j], histo[j]); #endif - } - } - else if(p.exp == 2) - for (unsigned int j = 0; j < p.bins; j++) { - if(dpu_s * histo_host[j] != histo[j]){ - status = 
false; + } + } else if (p.exp == 2) + for (unsigned int j = 0; j < p.bins; j++) { + if (dpu_s * histo_host[j] != histo[j]) { + status = false; #if PRINT - printf("%u - %u: %u -- %u\n", j, j, dpu_s * histo_host[j], histo[j]); + printf("%u - %u: %u -- %u\n", j, j, + dpu_s * histo_host[j], histo[j]); #endif - } - } - else - for (unsigned int j = 0; j < p.bins; j++) { - if(NR_DPUS * histo_host[j] != histo[j]){ - status = false; + } + } else + for (unsigned int j = 0; j < p.bins; j++) { + if (NR_DPUS * histo_host[j] != histo[j]) { + status = false; #if PRINT - printf("%u - %u: %u -- %u\n", j, j, NR_DPUS * histo_host[j], histo[j]); + printf("%u - %u: %u -- %u\n", j, j, + NR_DPUS * histo_host[j], histo[j]); #endif - } - } - if (status) { - printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); - } else { - printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); - } - - // Deallocation - free(A); - free(histo_host); - free(histo); + } + } + if (status) { + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET + "] Outputs are equal\n"); + } else { + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET + "] Outputs differ!\n"); + } + + // Deallocation + free(A); + free(histo_host); + free(histo); #if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_free(dpu_set)); #endif - - return status ? 0 : -1; + + return status ? 
0 : -1; } diff --git a/HST-S/support/common.h b/HST-S/support/common.h index 30df40d..e0cacbb 100755 --- a/HST-S/support/common.h +++ b/HST-S/support/common.h @@ -13,8 +13,8 @@ // Data type #define T uint32_t -#define DIV 2 // Shift right to divide by sizeof(T) -#define REGS (BLOCK_SIZE >> 2) // 32 bits +#define DIV 2 // Shift right to divide by sizeof(T) +#define REGS (BLOCK_SIZE >> 2) // 32 bits // Pixel depth #define DEPTH 12 @@ -22,19 +22,19 @@ // Structures used by both the host and the dpu to communicate information typedef struct { - uint32_t size; - uint32_t transfer_size; - uint32_t bins; + uint32_t size; + uint32_t transfer_size; + uint32_t bins; enum kernels { - kernel1 = 0, - nr_kernels = 1, + kernel1 = 0, + nr_kernels = 1, } kernel; } dpu_arguments_t; #ifndef ENERGY #define ENERGY 0 #endif -#define PRINT 0 +#define PRINT 0 #define ANSI_COLOR_RED "\x1b[31m" #define ANSI_COLOR_GREEN "\x1b[32m" diff --git a/HST-S/support/params.h b/HST-S/support/params.h index e29449b..3028a50 100644 --- a/HST-S/support/params.h +++ b/HST-S/support/params.h @@ -4,64 +4,80 @@ #include "common.h" typedef struct Params { - unsigned int input_size; - unsigned int bins; - int n_warmup; - int n_reps; - const char *file_name; - int exp; - int dpu_s; -}Params; + unsigned int input_size; + unsigned int bins; + int n_warmup; + int n_reps; + const char *file_name; + int exp; + int dpu_s; +} Params; -static void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -w <W> # of untimed warmup iterations (default=1)" - "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1, 2) scaling (default=0)" - "\n" - "\nBenchmark-specific options:" - "\n -i <I> input size (default=1536*1024 elements)" - "\n -b <B> histogram size (default=256 bins)" - "\n -f <F> input image file (default=../input/image_VanHateren.iml)" - "\n"); +static void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + 
"\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n -x <X> Weak (0) or strong (1, 2) scaling (default=0)" + "\n" + "\nBenchmark-specific options:" + "\n -i <I> input size (default=1536*1024 elements)" + "\n -b <B> histogram size (default=256 bins)" + "\n -f <F> input image file (default=../input/image_VanHateren.iml)" + "\n"); } -struct Params input_params(int argc, char **argv) { - struct Params p; - p.input_size = 1536 * 1024; - p.bins = 256; - p.n_warmup = 1; - p.n_reps = 3; - p.exp = 0; - p.file_name = "./input/image_VanHateren.iml"; - p.dpu_s = 64; +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.input_size = 1536 * 1024; + p.bins = 256; + p.n_warmup = 1; + p.n_reps = 3; + p.exp = 0; + p.file_name = "./input/image_VanHateren.iml"; + p.dpu_s = 64; - int opt; - while((opt = getopt(argc, argv, "hi:b:w:e:f:x:z:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'i': p.input_size = atoi(optarg); break; - case 'b': p.bins = atoi(optarg); break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - case 'f': p.file_name = optarg; break; - case 'x': p.exp = atoi(optarg); break; - case 'z': p.dpu_s = atoi(optarg); break; - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(NR_DPUS > 0 && "Invalid # of dpus!"); + int opt; + while ((opt = getopt(argc, argv, "hi:b:w:e:f:x:z:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'i': + p.input_size = atoi(optarg); + break; + case 'b': + p.bins = atoi(optarg); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + case 'f': + p.file_name = optarg; + break; + case 'x': + p.exp = atoi(optarg); + break; + case 'z': + p.dpu_s = atoi(optarg); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); 
+ usage(); + exit(0); + } + } + assert(NR_DPUS > 0 && "Invalid # of dpus!"); - return p; + return p; } #endif diff --git a/HST-S/support/timer.h b/HST-S/support/timer.h index 4d597b9..df68334 100755 --- a/HST-S/support/timer.h +++ b/HST-S/support/timer.h @@ -1,66 +1,74 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + */ + +#include <sys/time.h> + +typedef struct Timer { + + struct timeval startTime[7]; + struct timeval stopTime[7]; + double time[7]; + +} Timer; + +void start(Timer *timer, int i, int rep) +{ + if (rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) +{ + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += + (timer->stopTime[i].tv_sec - + timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); +} + +void print(Timer *timer, int i, int REP) +{ + printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); +} + +void printall(Timer *timer, int maxt) +{ + for (int i = 0; i <= maxt; i++) { + printf(" timer%d_us=%f", i, timer->time[i]); + } + printf("\n"); +} diff --git a/MLP/baselines/cpu/Makefile b/MLP/baselines/cpu/Makefile index e2e6780..7eb5f00 100644 --- a/MLP/baselines/cpu/Makefile +++ b/MLP/baselines/cpu/Makefile @@ -1,7 +1,28 @@ +benchmark ?= 1 +debug ?= 0 +native ?= 1 +nop_sync ?= 0 +numa ?= 0 + +CFLAGS = +LDFLAGS = + +ifeq (${debug}, 1) + CFLAGS += -g +endif + +ifeq (${native}, 1) + CFLAGS += -march=native +endif + +ifeq (${numa}, 1) + LDFLAGS += -lnuma +endif + all: mlp_openmp mlp_openmp: mlp_openmp.c - gcc -O2 mlp_openmp.c -o mlp_openmp -fopenmp -std=c99 + gcc -Wall -Wextra -pedantic -O3 ${CFLAGS} mlp_openmp.c -o mlp_openmp -DNUMA=${numa} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -fopenmp -std=c99 ${LDFLAGS} mlp_openmp_O0: mlp_openmp.c gcc mlp_openmp.c -o mlp_openmp_O0 -fopenmp -std=c99 @@ -18,4 +39,7 @@ run_O0: mlp_openmp_O0 run_O2: mlp_openmp_O2 ./mlp_openmp_O2 -.PHONY: all run run_O0 run_O2 +clean: + rm -f mlp_openmp mlp_openmp_O0 mlp_openmp_O2 + +.PHONY: all run run_O0 run_O2 clean diff --git a/MLP/baselines/cpu/mlp_openmp.c b/MLP/baselines/cpu/mlp_openmp.c index 8f95e7c..b473d7a 100644 --- a/MLP/baselines/cpu/mlp_openmp.c +++ b/MLP/baselines/cpu/mlp_openmp.c @@ -11,173 +11,261 @@ #include <getopt.h> #include 
<assert.h> #include <stdint.h> -#include "../../support/timer.h" #include "../../support/common.h" +#if WITH_BENCHMARK +#include "../../support/timer.h" +#else +#define start(...) +#define stop(...) +#endif + +#if NUMA +#include <numaif.h> +#include <numa.h> + +void *mp_pages[1]; +int mp_status[1]; +int mp_nodes[1]; +int numa_node_data = -1; +int numa_node_cpu = -1; +#endif + #define XSTR(x) STR(x) #define STR(x) #x -T** A; -T* B; -T* C; +// weights +T **A; + +// input/output +T *B; + +// intermediate +T *C; // Create input arrays -static void init_data(T** A, T* B, unsigned int m_size, unsigned int n_size){ - for (unsigned int l = 0; l < NUM_LAYERS; l++) - for (unsigned int i = 0; i < m_size * n_size; i++){ - if(i % 100 < 98){ +static void init_data(T **A, unsigned int m_size, unsigned int n_size) +{ + for (unsigned int l = 0; l < NUM_LAYERS; l++) { + for (unsigned int i = 0; i < m_size * n_size; i++) { + if (i % 100 < 98) { A[l][i] = 0; - }else{ - A[l][i] = (l+i) % 2; + } else { + A[l][i] = (l + i) % 2; } } - for (unsigned int i = 0; i < n_size; i++){ - if(i % 50 < 48){ + } +} + +static void init_B(T *B, unsigned int n_size) +{ + for (unsigned int i = 0; i < n_size; i++) { + if (i % 50 < 48) { B[i] = 0; - } - else{ + } else { B[i] = i % 2; } } } // Compute output in the host -static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) { - for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){ - for (unsigned int m = 0; m < m_size; m++){ +static void mlp_host(T *C, T **A, T *B, unsigned int m_size, + unsigned int n_size) +{ + for (unsigned int nl = 0; nl < NUM_LAYERS; nl++) { + for (unsigned int m = 0; m < m_size; m++) { C[m] = 0; } - #pragma omp parallel for - for (unsigned int m = 0; m < m_size; m++){ - for (unsigned int n = 0; n < n_size; n++){ +#pragma omp parallel for + for (unsigned int m = 0; m < m_size; m++) { + for (unsigned int n = 0; n < n_size; n++) { C[m] += A[nl][m * n_size + n] * B[n]; } C[m] = max(0, C[m]); } - for (unsigned int n 
= 0; n < n_size; n++){ + for (unsigned int n = 0; n < n_size; n++) { B[n] = C[n]; } } } -static uint64_t mlp_host_sum(uint64_t n_size, uint64_t m_size) { - uint64_t sum = 0; - for (uint64_t m = 0; m < n_size; m++){ - sum += B[m]; - } - return sum; +static uint64_t mlp_host_sum(uint64_t n_size) +{ + uint64_t sum = 0; + for (uint64_t m = 0; m < n_size; m++) { + sum += B[m]; + } + return sum; } // Params --------------------------------------------------------------------- typedef struct Params { - char* dpu_type; - int nr_of_ranks; - int input_size_n; - int input_size_m; - int n_warmup; - int n_reps; -}Params; - -void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -d <D> DPU type (default=fsim)" - "\n -r <R> # of ranks (default=2)" - "\n" - "\nBenchmark-specific options:" - "\n -i <I> input size (default=8M elements)" - "\n"); - } - - struct Params input_params(int argc, char **argv) { - struct Params p; - p.dpu_type = "fsim"; - p.nr_of_ranks = 1; - p.input_size_n = 1 << 9; - p.input_size_m = 1 << 9; - p.n_warmup = 2; - p.n_reps = 3; - - int opt; - while((opt = getopt(argc, argv, "hd:r:i:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'd': p.dpu_type = optarg; break; - case 'r': p.nr_of_ranks = atoi(optarg); break; - case 'n': p.input_size_n = atoi(optarg); break; - case 'm': p.input_size_m = atoi(optarg); break; - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(p.nr_of_ranks > 0 && "Invalid # of ranks!"); - - return p; - } + int input_size_n; + int input_size_m; + int n_reps; +#if NUMA + struct bitmask *bitmask; + int numa_node_cpu; +#endif +} Params; + +void usage() +{ + fprintf(stderr, "\nUsage: ./program [options]" "\n"); +} + +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.input_size_n = 8192; + p.input_size_m = 20480; + p.n_reps = 100; +#if NUMA + p.bitmask = NULL; + p.numa_node_cpu = -1; 
+#endif + + int opt; + while ((opt = getopt(argc, argv, "e:n:m:A:C:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + case 'n': + p.input_size_n = atoi(optarg); + break; + case 'm': + p.input_size_m = atoi(optarg); + break; +#if NUMA + case 'A': + p.bitmask = numa_parse_nodestring(optarg); + break; + case 'C': + p.numa_node_cpu = atoi(optarg); + break; +#endif + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + + return p; +} /** * @brief Main of the Host Application. */ - int main(int argc, char **argv) { +int main(int argc, char **argv) +{ - struct Params p = input_params(argc, argv); - uint64_t n_size = 8192; - uint64_t m_size = 20480; + struct Params p = input_params(argc, argv); + uint64_t n_size = p.input_size_n; + uint64_t m_size = p.input_size_m; - Timer timer; - A = malloc(NUM_LAYERS * sizeof(T*)); - for(int l = 0; l < NUM_LAYERS; l++) - A[l] = malloc(n_size*m_size*sizeof(unsigned int)); - B = malloc(m_size*sizeof(unsigned int)); - C = malloc(m_size*sizeof(unsigned int)); +#if WITH_BENCHMARK + Timer timer; +#endif - for (int i = 0; i < 100; i++) { - // Create an input file with arbitrary data. 
- init_data(A, B, m_size, n_size); +#if NUMA + if (p.bitmask) { + numa_set_membind(p.bitmask); + numa_free_nodemask(p.bitmask); + } + A = numa_alloc(NUM_LAYERS * sizeof(T *)); + for (int l = 0; l < NUM_LAYERS; l++) { + A[l] = numa_alloc(n_size * m_size * sizeof(unsigned int)); + } + B = numa_alloc(m_size * sizeof(unsigned int)); + C = numa_alloc(m_size * sizeof(unsigned int)); + + mp_pages[0] = A; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_data = mp_status[0]; + } + + numa_node_cpu = p.numa_node_cpu; + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } +#else + A = malloc(NUM_LAYERS * sizeof(T *)); + for (int l = 0; l < NUM_LAYERS; l++) { + A[l] = malloc(n_size * m_size * sizeof(unsigned int)); + } + B = malloc(m_size * sizeof(unsigned int)); + C = malloc(m_size * sizeof(unsigned int)); +#endif + + // Create an input file with arbitrary data. 
+ init_data(A, m_size, n_size); + + for (int i = 0; i < p.n_reps; i++) { + init_B(B, n_size); - start(&timer, 0, 0); - mlp_host(C, A, B, n_size, m_size); - stop(&timer, 0); + start(&timer, 0, 0); + mlp_host(C, A, B, n_size, m_size); + stop(&timer, 0); - unsigned int nr_threads = 0; +#if WITH_BENCHMARK + unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic - nr_threads++; - - printf("[::] n_threads=%d e_type=%s n_elements=%lu " - "| throughput_cpu_omp_MBps=%f\n", - nr_threads, XSTR(T), n_size * m_size, - n_size * m_size * sizeof(T) / timer.time[0]); - printf("[::] n_threads=%d e_type=%s n_elements=%lu " - "| throughput_cpu_omp_MOpps=%f\n", - nr_threads, XSTR(T), n_size * m_size, - n_size * m_size / timer.time[0]); - printf("[::] n_threads=%d e_type=%s n_elements=%lu |", - nr_threads, XSTR(T), n_size * m_size); - printall(&timer, 0); - } - - uint32_t sum = mlp_host_sum(n_size, m_size); - - printf("Kernel "); - print(&timer, 0, 1); - printf("\n"); - - printf("SUM = %d \n", sum); - - for(int l = 0; l < NUM_LAYERS; l++) - free(A[l]); - free(A); - free(B); - free(C); - - return 0; + nr_threads++; + + printf("[::] MLP-CPU | n_threads=%d e_type=%s n_elements=%lu", + nr_threads, XSTR(T), n_size * m_size); +#if NUMA + printf + (" numa_node_data=%d numa_node_cpu=%d numa_distance_cpu_data=%d", + numa_node_data, numa_node_cpu, + numa_distance(numa_node_data, numa_node_cpu)); +#endif + printf(" | throughput_MBps=%f throughput_MOpps=%f", + n_size * m_size * sizeof(T) / timer.time[0], + n_size * m_size / timer.time[0]); + printf(" latency_us=%f\n", timer.time[0]); +#endif // WITH_BENCHMARK + } + +#if NOP_SYNC + for (int rep = 0; rep < 200000; rep++) { + asm volatile ("nop"::); + } +#endif + + uint32_t sum = mlp_host_sum(n_size); + + printf("SUM = %d \n", sum); + +#if NUMA + for (int l = 0; l < NUM_LAYERS; l++) { + numa_free(A[l], n_size * m_size * sizeof(unsigned int)); + } + numa_free(A, NUM_LAYERS * sizeof(T *)); + numa_free(B, m_size * sizeof(unsigned int)); + 
numa_free(C, m_size * sizeof(unsigned int)); +#else + for (int l = 0; l < NUM_LAYERS; l++) { + free(A[l]); + } + free(A); + free(B); + free(C); +#endif + + return 0; } diff --git a/MLP/dpu/task.c b/MLP/dpu/task.c index de3e554..4f85024 100644 --- a/MLP/dpu/task.c +++ b/MLP/dpu/task.c @@ -15,7 +15,8 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS; // GEMV -static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) { +static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) +{ for (unsigned int i = 0; i < BLOCK_SIZE / sizeof(T); i++) { bufferC[pos] += bufferA[i] * bufferB[i]; } @@ -26,13 +27,14 @@ static void gemv(T *bufferC, T *bufferA, T *bufferB, int pos) { BARRIER_INIT(my_barrier, NR_TASKLETS); // main -int main() { +int main() +{ unsigned int tasklet_id = me(); #if PRINT printf("tasklet_id = %u\n", tasklet_id); #endif - if (tasklet_id == 0){ // Initialize once the cycle counter - mem_reset(); // Reset the heap + if (tasklet_id == 0) { // Initialize once the cycle counter + mem_reset(); // Reset the heap } // Barrier barrier_wait(&my_barrier); @@ -42,12 +44,11 @@ int main() { uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows; uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows; - unsigned int nrows = nr_rows; - unsigned int rows_per_tasklet; + unsigned int rows_per_tasklet; unsigned int start_row; unsigned int chunks = nrows / (NR_TASKLETS + NR_TASKLETS); - unsigned int dbl_chunks = chunks + chunks; + unsigned int dbl_chunks = chunks + chunks; rows_per_tasklet = dbl_chunks; unsigned int rest_rows = nrows % (NR_TASKLETS + NR_TASKLETS); @@ -57,19 +58,30 @@ int main() { if ((tasklet_id + tasklet_id) >= rest_rows) { unsigned int hlf_rest_rows = rest_rows >> 1; if ((rest_rows & 1) == 1) - start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks; + start_row = + (hlf_rest_rows + 1) * (dbl_chunks + 2) + + (tasklet_id - 1 - + hlf_rest_rows) * dbl_chunks; else - start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - 
hlf_rest_rows) * dbl_chunks; - } else + start_row = + (hlf_rest_rows) * (dbl_chunks + 2) + + (tasklet_id - hlf_rest_rows) * dbl_chunks; + } else start_row = tasklet_id * (dbl_chunks + 2); } else { start_row = tasklet_id * (dbl_chunks); } // Address of the current row in MRAM - uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T)); - uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T)); - uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T)); + uint32_t mram_base_addr_A = + (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T)); + uint32_t mram_base_addr_B = + (uint32_t) (DPU_MRAM_HEAP_POINTER + + max_rows * n_size_pad * sizeof(T)); + uint32_t mram_base_addr_C = + (uint32_t) (DPU_MRAM_HEAP_POINTER + + max_rows * n_size_pad * sizeof(T) + + n_size_pad * sizeof(T) + start_row * sizeof(T)); uint32_t mram_temp_addr_A = mram_base_addr_A; uint32_t mram_temp_addr_B = mram_base_addr_B; @@ -82,34 +94,44 @@ int main() { int offset = 0; // Iterate over nr_rows - for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) { + for (unsigned int i = start_row; i < start_row + rows_per_tasklet; + i += 2) { - mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T)); + mram_temp_addr_A = + (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T)); mram_temp_addr_B = mram_base_addr_B; cache_C[0] = 0; cache_C[1] = 0; - for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){ + for (unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++) { int n = 0, j; - for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T))) - { - - mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE); - mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE); - - if(offset) - { - - for(unsigned int 
off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++) - { + for (n = 0; + n < (int32_t) (n_size - (BLOCK_SIZE / sizeof(T))); + n += (BLOCK_SIZE / sizeof(T))) { + + mram_read((__mram_ptr void const + *)(mram_temp_addr_A), cache_A, + BLOCK_SIZE); + mram_read((__mram_ptr void const + *)(mram_temp_addr_B), cache_B, + BLOCK_SIZE); + + if (offset) { + + for (unsigned int off = 0; + off < (BLOCK_SIZE / sizeof(T)) - 1; + off++) { cache_A[off] = cache_A[off + 1]; } - mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8); + mram_read((__mram_ptr void const + *)(mram_temp_addr_A + + BLOCK_SIZE), cache_A_aux, + 8); - cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0]; + cache_A[BLOCK_SIZE / sizeof(T) - 1] = + cache_A_aux[0]; } - // Compute GEMV gemv(cache_C, cache_A, cache_B, pos); @@ -118,49 +140,51 @@ int main() { mram_temp_addr_B += BLOCK_SIZE; } - mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE); + mram_read((__mram_ptr void const *)(mram_temp_addr_A), + cache_A, BLOCK_SIZE); - - if(offset) - { - for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++) - { + if (offset) { + for (unsigned int off = 0; + off < (BLOCK_SIZE / sizeof(T)) - 1; + off++) { cache_A[off] = cache_A[off + 1]; } - mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8); + mram_read((__mram_ptr void const + *)(mram_temp_addr_A + BLOCK_SIZE), + cache_A_aux, 8); - cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0]; + cache_A[BLOCK_SIZE / sizeof(T) - 1] = + cache_A_aux[0]; } + mram_read((__mram_ptr void const *)(mram_temp_addr_B), + cache_B, BLOCK_SIZE); - mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE); - - for (j = 0; j < (int) (n_size - n); j++) { + for (j = 0; j < (int)(n_size - n); j++) { // Compute GEMV - if(j >= (int)(BLOCK_SIZE / sizeof(T))){ + if (j >= (int)(BLOCK_SIZE / sizeof(T))) { printf("error\n"); break; } cache_C[pos] += cache_A[j] * cache_B[j]; } - - 
mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T)); + mram_temp_addr_A += + (BLOCK_SIZE - + ((BLOCK_SIZE / sizeof(T)) - + (n_size - n)) * sizeof(T)); mram_temp_addr_B = mram_base_addr_B; - if(mram_temp_addr_A % 8 != 0) - { + if (mram_temp_addr_A % 8 != 0) { offset = 1; - } - else - { + } else { offset = 0; } } // Write cache to current MRAM block - mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8); + mram_write(cache_C, (__mram_ptr void *)(mram_base_addr_C), 8); // Update memory address mram_base_addr_C += 2 * sizeof(T); diff --git a/MLP/host/app.c b/MLP/host/app.c index 952cb3f..24243bf 100644 --- a/MLP/host/app.c +++ b/MLP/host/app.c @@ -27,28 +27,29 @@ #define DPU_BINARY "./bin/mlp_dpu" #endif -static T** A; -static T* B; -static T* B_host; -static T* B_tmp; -static T* C; -static T* C_dpu; +static T **A; +static T *B; +static T *B_host; +static T *B_tmp; +static T *C; +static T *C_dpu; // Create input arrays -static void init_data(T** A, T* B, T* B_host, unsigned int m_size, unsigned int n_size) { +static void init_data(T **A, T *B, T *B_host, unsigned int m_size, + unsigned int n_size) +{ for (unsigned int l = 0; l < NUM_LAYERS; l++) - for (unsigned int i = 0; i < m_size * n_size; i++){ - if(i % 100 < 98){ + for (unsigned int i = 0; i < m_size * n_size; i++) { + if (i % 100 < 98) { A[l][i] = 0; - }else{ - A[l][i] = (l+i) % 2; + } else { + A[l][i] = (l + i) % 2; } } - for (unsigned int i = 0; i < n_size; i++){ - if(i % 50 < 48){ + for (unsigned int i = 0; i < n_size; i++) { + if (i % 50 < 48) { B[i] = 0; - } - else{ + } else { B[i] = i % 2; } B_host[i] = B[i]; @@ -56,26 +57,29 @@ static void init_data(T** A, T* B, T* B_host, unsigned int m_size, unsigned int } // Compute output in the host -static void mlp_host(T* C, T** A, T* B, unsigned int m_size, unsigned int n_size) { +static void mlp_host(T *C, T **A, T *B, unsigned int m_size, + unsigned int n_size) +{ - for (unsigned int nl = 0; nl < NUM_LAYERS; nl++){ - 
for (unsigned int m = 0; m < m_size; m++){ + for (unsigned int nl = 0; nl < NUM_LAYERS; nl++) { + for (unsigned int m = 0; m < m_size; m++) { C[m] = 0; } - for (unsigned int m = 0; m < m_size; m++){ - for (unsigned int n = 0; n < n_size; n++){ + for (unsigned int m = 0; m < m_size; m++) { + for (unsigned int n = 0; n < n_size; n++) { C[m] += A[nl][m * n_size + n] * B[n]; } C[m] = max(0, C[m]); } - for (unsigned int n = 0; n < n_size; n++){ + for (unsigned int n = 0; n < n_size; n++) { B[n] = C[n]; } } } // Main of the Host Application -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ struct Params p = input_params(argc, argv); @@ -97,14 +101,15 @@ int main(int argc, char **argv) { unsigned int n_size = p.n_size; // Initialize help data - dpu_info = (struct dpu_info_t *) malloc(nr_of_dpus * sizeof(struct dpu_info_t)); - dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t)); + dpu_info = + (struct dpu_info_t *)malloc(nr_of_dpus * sizeof(struct dpu_info_t)); + dpu_arguments_t *input_args = + (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t)); uint32_t max_rows_per_dpu = 0; uint32_t n_size_pad = n_size; - if(n_size % 2 == 1){ + if (n_size % 2 == 1) { n_size_pad++; } - // Timer Timer timer; i = 0; @@ -118,7 +123,10 @@ int main(int argc, char **argv) { rows_per_dpu++; if (rest_rows > 0) { if (i >= rest_rows) - prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks; + prev_rows_dpu = + rest_rows * (chunks + 1) + (i - + rest_rows) * + chunks; else prev_rows_dpu = i * (chunks + 1); } else { @@ -127,7 +135,7 @@ int main(int argc, char **argv) { // Keep max rows for parallel transfers uint32_t rows_per_dpu_pad = rows_per_dpu; - if (rows_per_dpu_pad % 2 == 1) // 4-byte elements + if (rows_per_dpu_pad % 2 == 1) // 4-byte elements rows_per_dpu_pad++; if (rows_per_dpu_pad > max_rows_per_dpu) max_rows_per_dpu = rows_per_dpu_pad; @@ -142,14 +150,15 @@ int main(int argc, char **argv) { 
input_args[i].nr_rows = rows_per_dpu; } - A = (T**)malloc(NUM_LAYERS * sizeof(T*)); - for(l = 0; l < NUM_LAYERS; l++) - A[l] = (T*)malloc( max_rows_per_dpu * nr_of_dpus * n_size_pad * sizeof(T)); + A = (T **) malloc(NUM_LAYERS * sizeof(T *)); + for (l = 0; l < NUM_LAYERS; l++) + A[l] = + (T *) malloc(max_rows_per_dpu * nr_of_dpus * n_size_pad * + sizeof(T)); - - B = (T*)malloc(n_size * sizeof(T)); - B_host = (T*)malloc(n_size * sizeof(T)); - C = (T*)malloc(m_size * sizeof(T)); + B = (T *) malloc(n_size * sizeof(T)); + B_host = (T *) malloc(n_size * sizeof(T)); + C = (T *) malloc(m_size * sizeof(T)); C_dpu = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T)); B_tmp = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T)); @@ -170,26 +179,36 @@ int main(int argc, char **argv) { input_args[i].max_rows = max_rows_per_dpu; DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT)); - + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, + sizeof(dpu_arguments_t), DPU_XFER_DEFAULT)); // Copy input array and vector i = 0; DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, A[0] + dpu_info[i].prev_rows_dpu * n_size)); + DPU_ASSERT(dpu_prepare_xfer + (dpu, + A[0] + dpu_info[i].prev_rows_dpu * n_size)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, 0, + max_rows_per_dpu * n_size_pad * sizeof(T), + DPU_XFER_DEFAULT)); i = 0; DPU_FOREACH(dpu_set, dpu, i) { DPU_ASSERT(dpu_prepare_xfer(dpu, B)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + 
DPU_MRAM_HEAP_POINTER_NAME, + max_rows_per_dpu * n_size_pad * sizeof(T), + n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); if (rep >= p.n_warmup) stop(&timer, 1); // Run kernel on DPUs - if (rep >= p.n_warmup) - { + if (rep >= p.n_warmup) { start(&timer, 2, rep - p.n_warmup); #if ENERGY DPU_ASSERT(dpu_probe_start(&probe)); @@ -198,31 +217,38 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - if (rep >= p.n_warmup) - { + if (rep >= p.n_warmup) { stop(&timer, 2); #if ENERGY DPU_ASSERT(dpu_probe_stop(&probe)); #endif } - for(int lay = 1; lay < NUM_LAYERS; lay++){ + for (int lay = 1; lay < NUM_LAYERS; lay++) { if (rep >= p.n_warmup) start(&timer, 4, rep - p.n_warmup); i = 0; // Copy C_dpu DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu)); + DPU_ASSERT(dpu_prepare_xfer + (dpu, C_dpu + i * max_rows_per_dpu)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_FROM_DPU, + DPU_MRAM_HEAP_POINTER_NAME, + max_rows_per_dpu * n_size_pad * sizeof(T) + + n_size_pad * sizeof(T), + max_rows_per_dpu * sizeof(T), + DPU_XFER_DEFAULT)); // B = C unsigned int n, j; i = 0; for (n = 0; n < nr_of_dpus; n++) { for (j = 0; j < dpu_info[n].rows_per_dpu; j++) { - B_tmp[i] = C_dpu[n * max_rows_per_dpu + j]; + B_tmp[i] = + C_dpu[n * max_rows_per_dpu + j]; i++; } } @@ -230,20 +256,31 @@ int main(int argc, char **argv) { DPU_FOREACH(dpu_set, dpu, i) { DPU_ASSERT(dpu_prepare_xfer(dpu, B_tmp)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, + max_rows_per_dpu * n_size_pad * sizeof(T), + n_size_pad * sizeof(T), 
DPU_XFER_DEFAULT)); // Copy next matrix of weights i = 0; DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, A[lay] + dpu_info[i].prev_rows_dpu * n_size)); + DPU_ASSERT(dpu_prepare_xfer + (dpu, + A[lay] + + dpu_info[i].prev_rows_dpu * + n_size)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, 0, + max_rows_per_dpu * n_size_pad * sizeof(T), + DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) + if (rep >= p.n_warmup) stop(&timer, 4); - if (rep >= p.n_warmup) - { + if (rep >= p.n_warmup) { start(&timer, 2, rep - p.n_warmup); #if ENERGY DPU_ASSERT(dpu_probe_start(&probe)); @@ -252,8 +289,7 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - if (rep >= p.n_warmup) - { + if (rep >= p.n_warmup) { stop(&timer, 2); #if ENERGY DPU_ASSERT(dpu_probe_stop(&probe)); @@ -273,16 +309,23 @@ int main(int argc, char **argv) { start(&timer, 3, rep - p.n_warmup); i = 0; DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu)); + DPU_ASSERT(dpu_prepare_xfer + (dpu, C_dpu + i * max_rows_per_dpu)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_FROM_DPU, + DPU_MRAM_HEAP_POINTER_NAME, + max_rows_per_dpu * n_size_pad * sizeof(T) + + n_size_pad * sizeof(T), + max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT)); + if (rep >= p.n_warmup) stop(&timer, 3); } #if ENERGY double acc_energy, avg_energy, acc_time, avg_time; - DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy)); + DPU_ASSERT(dpu_probe_get + (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy)); 
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy)); DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time)); DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time)); @@ -311,23 +354,26 @@ int main(int argc, char **argv) { i = 0; for (n = 0; n < nr_of_dpus; n++) { for (j = 0; j < dpu_info[n].rows_per_dpu; j++) { - if(C[i] != C_dpu[n * max_rows_per_dpu + j]) { + if (C[i] != C_dpu[n * max_rows_per_dpu + j]) { status = false; #if PRINT - printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]); + printf("%d: %d -- %d\n", i, C[i], + C_dpu[n * max_rows_per_dpu + j]); #endif } i++; } } if (status) { - printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET + "] Outputs are equal\n"); } else { - printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET + "] Outputs differ!\n"); } // Deallocation - for(i = 0; i < NUM_LAYERS; i++) + for (i = 0; i < NUM_LAYERS; i++) free(A[i]); free(A); free(B); diff --git a/MLP/support/common.h b/MLP/support/common.h index 53b2f1c..4b5031b 100755 --- a/MLP/support/common.h +++ b/MLP/support/common.h @@ -3,21 +3,21 @@ // Structures used by both the host and the dpu to communicate information typedef struct { - uint32_t n_size; - uint32_t n_size_pad; - uint32_t nr_rows; - uint32_t max_rows; + uint32_t n_size; + uint32_t n_size_pad; + uint32_t nr_rows; + uint32_t max_rows; } dpu_arguments_t; // Specific information for each DPU struct dpu_info_t { - uint32_t rows_per_dpu; - uint32_t rows_per_dpu_pad; - uint32_t prev_rows_dpu; + uint32_t rows_per_dpu; + uint32_t rows_per_dpu_pad; + uint32_t prev_rows_dpu; }; struct dpu_info_t *dpu_info; -#define NUM_LAYERS 3 +#define NUM_LAYERS 3 #define max(x, y) (x > y ? x : y) #define min(x, y) (x < y ? 
x : y) diff --git a/MLP/support/params.h b/MLP/support/params.h index f9e790e..4bfc2fc 100644 --- a/MLP/support/params.h +++ b/MLP/support/params.h @@ -4,53 +4,62 @@ #include "common.h" typedef struct Params { - unsigned int m_size; - unsigned int n_size; - unsigned int n_warmup; - unsigned int n_reps; -}Params; + unsigned int m_size; + unsigned int n_size; + unsigned int n_warmup; + unsigned int n_reps; +} Params; -static void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -w <W> # of untimed warmup iterations (default=1)" - "\n -e <E> # of timed repetition iterations (default=3)" - "\n" - "\nBenchmark-specific options:" - "\n -m <I> m_size (default=2048 elements)" - "\n -n <I> n_size (default=2048 elements)" - "\n"); +static void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n" + "\nBenchmark-specific options:" + "\n -m <I> m_size (default=2048 elements)" + "\n -n <I> n_size (default=2048 elements)" "\n"); } -struct Params input_params(int argc, char **argv) { - struct Params p; - p.m_size = 163840; - p.n_size = 4096; - p.n_warmup = 1; - p.n_reps = 3; +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.m_size = 163840; + p.n_size = 4096; + p.n_warmup = 1; + p.n_reps = 3; - int opt; - while((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'm': p.m_size = atoi(optarg); break; - case 'n': p.n_size = atoi(optarg); break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(NR_DPUS > 0 && "Invalid # of dpus!"); + int opt; + while ((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) { + switch (opt) { + case 'h': 
+ usage(); + exit(0); + break; + case 'm': + p.m_size = atoi(optarg); + break; + case 'n': + p.n_size = atoi(optarg); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(NR_DPUS > 0 && "Invalid # of dpus!"); - return p; + return p; } #endif diff --git a/MLP/support/timer.h b/MLP/support/timer.h index 886380a..961ed11 100755 --- a/MLP/support/timer.h +++ b/MLP/support/timer.h @@ -1,62 +1,69 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[5];
- struct timeval stopTime[5];
- double time[5];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
- //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000);
-
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
+/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + */ + +#include <sys/time.h> + +typedef struct Timer { + + struct timeval startTime[5]; + struct timeval stopTime[5]; + double time[5]; + +} Timer; + +void start(Timer *timer, int i, int rep) +{ + if (rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) +{ + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += + (timer->stopTime[i].tv_sec - + timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); + //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 + + // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000); + +} + +void print(Timer *timer, int i, int REP) +{ + printf("%f\t", timer->time[i] / (1000 * REP)); +} diff --git a/Microbenchmarks/CPU-DPU/nodmc25-alloc.sh b/Microbenchmarks/CPU-DPU/nodmc25-alloc.sh new file mode 100755 index 0000000..869ada3 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/nodmc25-alloc.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +mkdir -p log/$(hostname) +fn=log/$(hostname)/nodmc25-alloc + +source /opt/upmem/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + ./make-size.sh ${size} + n_nops=$((size * 256)) + if make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\' NUMA=1; then + for l in $(seq 1 20); do + bin/host_code -c ${numa_cpu} -w 1 -e 0 -x 1 -i 65536 -N $n_nops -I $(size -A bin/dpu_size | awk '($1 == ".text") {print $2/8}') + done + fi + return $? 
+} + +export -f run_benchmark_nmc + +( + +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_cpu={numa_cpu} size={size} \ + ::: i $(seq 1 5) \ + ::: numa_rank -1 \ + ::: numa_cpu 0 1 \ + ::: nr_ranks $(seq 1 40) \ + ::: size $(seq 0 15) + +) >> ${fn}.txt diff --git a/Microbenchmarks/CPU-DPU/nodmc25-transfer.sh b/Microbenchmarks/CPU-DPU/nodmc25-transfer.sh new file mode 100755 index 0000000..33bb12f --- /dev/null +++ b/Microbenchmarks/CPU-DPU/nodmc25-transfer.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +mkdir -p log/$(hostname) +fn=log/$(hostname)/nodmc25-transfer + +source /opt/upmem/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + +./make-size.sh 0 + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1 + bin/host_code -a $numa_in -b $numa_out -c $numa_cpu -w 0 -e 5 -x 0 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i ${input_size} + return $? +} + +export -f run_benchmark_nmc + +# The benchmark allocates 3 * 64 * nr_ranks * 8B * input_size (one array for input, one array for output). +# With 1048576 elements (8 MiB per DPU), this gives a maximum allocation of 60 GiB, which will fit comfortably into system memory (128 GiB). +# With 2097152 elements (16 MiB per DPU), we may encounter OoM conditions, since the UPMEM SDK also allocates some memory. 
+ +( + +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \ + ::: i $(seq 1 10) \ + ::: numa_rank -1 \ + ::: numa_in 0 1 \ + ::: numa_out 0 1 \ + ::: numa_cpu 0 1 \ + ::: nr_ranks $(seq 1 40) \ + ::: input_size 1 1048576 + +) >> ${fn}.txt diff --git a/SpMV/baselines/cpu/Makefile b/SpMV/baselines/cpu/Makefile index 64b20db..a24b764 100644 --- a/SpMV/baselines/cpu/Makefile +++ b/SpMV/baselines/cpu/Makefile @@ -1,7 +1,15 @@ +native ?= 1 + +CFLAGS = + +ifeq (${native}, 1) + CFLAGS += -march=native +endif + all: spmv spmv: app.c - gcc -O2 -o spmv -fopenmp app.c + gcc -Wall -Wextra -pedantic ${CFLAGS} -O3 -o spmv -fopenmp app.c spmv_O0: app.c gcc -o spmv_O0 -fopenmp app.c diff --git a/SpMV/baselines/cpu/run-perf.sh b/SpMV/baselines/cpu/run-perf.sh new file mode 100755 index 0000000..714498d --- /dev/null +++ b/SpMV/baselines/cpu/run-perf.sh @@ -0,0 +1,6 @@ +#!/bin/zsh + +make -B + +OMP_NUM_THREADS=1 perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run +OMP_NUM_THREADS=4 perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} make run diff --git a/TRNS/baselines/cpu/Makefile b/TRNS/baselines/cpu/Makefile index 236f7bb..438b9fb 100644 --- a/TRNS/baselines/cpu/Makefile +++ b/TRNS/baselines/cpu/Makefile @@ -32,16 +32,23 @@ # THE SOFTWARE. 
# -NUMA ?= 0 -NUMA_MEMCPY ?= 0 -FLAGS = +native ?= 1 +numa ?= 0 +numa_memcpy ?= 0 -ifeq (${NUMA}, 1) +CFLAGS = +LDFLAGS = + +ifeq (${native}, 1) + CFLAGS += -march=native +endif + +ifeq (${numa}, 1) FLAGS += -lnuma endif CXX=g++ -CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} +CXX_FLAGS=-std=c++11 -Wall -Wextra -pedantic -DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} LIB=-L/usr/lib/ -lm -pthread @@ -52,7 +59,7 @@ EXE=trns all: trns trns: ${SRC} - $(CXX) -O2 $(CXX_FLAGS) $(SRC) $(LIB) -o $(EXE) $(FLAGS) + $(CXX) -O3 $(CXX_FLAGS) ${CFLAGS} $(SRC) $(LIB) -o $(EXE) ${LDFLAGS} trns_O0: ${SRC} $(CXX) $(CXX_FLAGS) $(SRC) $(LIB) -o $(EXE)_O0 diff --git a/TRNS/baselines/cpu/run-perf.sh b/TRNS/baselines/cpu/run-perf.sh new file mode 100755 index 0000000..f16a3b1 --- /dev/null +++ b/TRNS/baselines/cpu/run-perf.sh @@ -0,0 +1,6 @@ +#!/bin/zsh + +make -B numa=1 + +perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./trns -w 0 -r 20 -p 2048 -o 2048 -m 16 -n 8 -t 1 -a 4 -c 4 +perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./trns -w 0 -r 20 -p 2048 -o 2048 -m 16 -n 8 -t 4 -a 4 -c 4 diff --git a/TRNS/dimes-hetsim-hbm.sh b/TRNS/dimes-hetsim-hbm.sh index e2efaee..cc5dc68 100755 --- a/TRNS/dimes-hetsim-hbm.sh +++ b/TRNS/dimes-hetsim-hbm.sh @@ -32,7 +32,7 @@ fn=log/$(hostname)/dimes-hetsim-hbm ( -make -B NUMA=1 NUMA_MEMCPY=1 +make -B numa=1 numa_memcpy=1 echo "CPU single-node operation with setup cost, memcpy node == input node, cpu node == output node (1/3)" >&2 @@ -43,10 +43,9 @@ parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ ::: ram_in $(seq 0 15) \ :::+ cpu_memcpy $(seq 0 7) $(seq 0 7) \ ::: ram_local $(seq 0 15) \ - :::+ cpu $(seq 0 7) $(seq 0 7) \ - ::: input_size 167772160 + :::+ cpu $(seq 0 7) $(seq 0 7) -make -B NUMA=1 +make -B numa=1 echo "CPU single-node operation (2/3)" >&2 diff --git a/TRNS/dimes-hetsim-nmc.sh 
b/TRNS/dimes-hetsim-nmc.sh index b5f6f13..80987e7 100755 --- a/TRNS/dimes-hetsim-nmc.sh +++ b/TRNS/dimes-hetsim-nmc.sh @@ -73,7 +73,7 @@ parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \ ) >> ${fn}.txt cd baselines/cpu -make -B NUMA=1 +make -B numa=1 ( @@ -97,7 +97,7 @@ parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ ) >> ${fn}.txt -make -B NUMA=1 NUMA_MEMCPY=1 +make -B numa=1 numa_memcpy=1 ( diff --git a/TS/baselines/cpu/mprofile.h b/TS/baselines/cpu/mprofile.h index 120c225..bfaf052 100644 --- a/TS/baselines/cpu/mprofile.h +++ b/TS/baselines/cpu/mprofile.h @@ -10,5 +10,7 @@ //#define HBM_ALOC //#define RANDOM_DIAGS -int loadTimeSeriesFromFile (std::string infilename, std::vector<DTYPE> &A, int &timeSeriesLength); -int saveProfileToFile(std::string outfilename, DTYPE * profile, int * profileIndex, int timeSeriesLength, int windowSize); +int loadTimeSeriesFromFile(std::string infilename, std::vector < DTYPE > &A, + int &timeSeriesLength); +int saveProfileToFile(std::string outfilename, DTYPE * profile, + int *profileIndex, int timeSeriesLength, int windowSize); diff --git a/TS/baselines/cpu/run-perf.sh b/TS/baselines/cpu/run-perf.sh new file mode 100755 index 0000000..453b64b --- /dev/null +++ b/TS/baselines/cpu/run-perf.sh @@ -0,0 +1,8 @@ +#!/bin/zsh + +make -B NUMA=1 + +for i in $(seq 1 20); do + OMP_NUM_THREADS=1 perf stat record -o t1.${i}.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./streamp_openmp inputs/randomlist10M.txt 256 4 4 + OMP_NUM_THREADS=4 perf stat record -o t4.${i}.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./streamp_openmp inputs/randomlist10M.txt 256 4 4 +done diff --git a/TS/dpu/task.c b/TS/dpu/task.c index d704160..5a756aa 100644 --- a/TS/dpu/task.c +++ b/TS/dpu/task.c @@ -18,18 +18,18 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS; __host dpu_result_t DPU_RESULTS[NR_TASKLETS]; // Dot product -static void dot_product(DTYPE *vectorA, DTYPE 
*vectorA_aux, DTYPE *vectorB, DTYPE * result) { - - for(uint32_t i = 0; i < BLOCK_SIZE / sizeof(DTYPE); i++) - { - for(uint32_t j = 0; j < DOTPIP; j++) - { - if((j + i) > BLOCK_SIZE / sizeof(DTYPE) - 1) - { - result[j] += vectorA_aux[(j + i) - BLOCK_SIZE / sizeof(DTYPE)] * vectorB[i]; - } - else - { +static void dot_product(DTYPE *vectorA, DTYPE *vectorA_aux, DTYPE *vectorB, + DTYPE *result) +{ + + for (uint32_t i = 0; i < BLOCK_SIZE / sizeof(DTYPE); i++) { + for (uint32_t j = 0; j < DOTPIP; j++) { + if ((j + i) > BLOCK_SIZE / sizeof(DTYPE) - 1) { + result[j] += + vectorA_aux[(j + i) - + BLOCK_SIZE / sizeof(DTYPE)] * + vectorB[i]; + } else { result[j] += vectorA[j + i] * vectorB[i]; } } @@ -40,43 +40,46 @@ BARRIER_INIT(my_barrier, NR_TASKLETS); extern int main_kernel1(void); -int(*kernels[nr_kernels])(void) = {main_kernel1}; +int (*kernels[nr_kernels])(void) = { main_kernel1 }; -int main(void){ +int main(void) +{ // Kernel - return kernels[DPU_INPUT_ARGUMENTS.kernel](); + return kernels[DPU_INPUT_ARGUMENTS.kernel] (); } // main_kernel1 -int main_kernel1() { +int main_kernel1() +{ unsigned int tasklet_id = me(); #if PRINT printf("tasklet_id = %u\n", tasklet_id); #endif - if(tasklet_id == 0){ - mem_reset(); // Reset the heap + if (tasklet_id == 0) { + mem_reset(); // Reset the heap } // Barrier barrier_wait(&my_barrier); // Input arguments - uint32_t query_length = DPU_INPUT_ARGUMENTS.query_length; - DTYPE query_mean = DPU_INPUT_ARGUMENTS.query_mean; - DTYPE query_std = DPU_INPUT_ARGUMENTS.query_std; + uint32_t query_length = DPU_INPUT_ARGUMENTS.query_length; + DTYPE query_mean = DPU_INPUT_ARGUMENTS.query_mean; + DTYPE query_std = DPU_INPUT_ARGUMENTS.query_std; uint32_t slice_per_dpu = DPU_INPUT_ARGUMENTS.slice_per_dpu; // Boundaries for current tasklet - uint32_t myStartElem = tasklet_id * (slice_per_dpu / (NR_TASKLETS)); - uint32_t myEndElem = myStartElem + (slice_per_dpu / (NR_TASKLETS)) - 1; + uint32_t myStartElem = tasklet_id * (slice_per_dpu / (NR_TASKLETS)); + 
uint32_t myEndElem = myStartElem + (slice_per_dpu / (NR_TASKLETS)) - 1; // Check time series limit - if(myEndElem > slice_per_dpu - query_length) myEndElem = slice_per_dpu - query_length; + if (myEndElem > slice_per_dpu - query_length) + myEndElem = slice_per_dpu - query_length; // Starting address of the current processing block in MRAM uint32_t mem_offset = (uint32_t) DPU_MRAM_HEAP_POINTER; // Starting address of the query subsequence - uint32_t current_mram_block_addr_query = (uint32_t)(mem_offset); + uint32_t current_mram_block_addr_query = (uint32_t) (mem_offset); mem_offset += query_length * sizeof(DTYPE); // Starting address of the time series slice @@ -86,18 +89,18 @@ int main_kernel1() { // Starting address of the time series means mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE); - uint32_t current_mram_block_addr_TSMean = (uint32_t)(mem_offset); + uint32_t current_mram_block_addr_TSMean = (uint32_t) (mem_offset); // Starting address of the time series standard deviations mem_offset += (slice_per_dpu + query_length) * sizeof(DTYPE); - uint32_t current_mram_block_addr_TSSigma = (uint32_t)(mem_offset); + uint32_t current_mram_block_addr_TSSigma = (uint32_t) (mem_offset); // Initialize local caches to store the MRAM blocks - DTYPE *cache_TS = (DTYPE *) mem_alloc(BLOCK_SIZE); - DTYPE *cache_TS_aux = (DTYPE *) mem_alloc(BLOCK_SIZE); - DTYPE *cache_query = (DTYPE *) mem_alloc(BLOCK_SIZE); - DTYPE *cache_TSMean = (DTYPE *) mem_alloc(BLOCK_SIZE); - DTYPE *cache_TSSigma = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_TS = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_TS_aux = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_query = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_TSMean = (DTYPE *) mem_alloc(BLOCK_SIZE); + DTYPE *cache_TSSigma = (DTYPE *) mem_alloc(BLOCK_SIZE); DTYPE *cache_dotprods = (DTYPE *) mem_alloc(BLOCK_SIZE); // Create result structure pointer @@ -108,41 +111,56 @@ int main_kernel1() { DTYPE min_distance = DTYPE_MAX; uint32_t 
min_index = 0; - - for(uint32_t i = myStartElem; i < myEndElem; i+= (BLOCK_SIZE / sizeof(DTYPE))) - { - for(uint32_t d = 0; d < DOTPIP; d++) + for (uint32_t i = myStartElem; i < myEndElem; + i += (BLOCK_SIZE / sizeof(DTYPE))) { + for (uint32_t d = 0; d < DOTPIP; d++) cache_dotprods[d] = 0; - current_mram_block_addr_TS = (uint32_t) starting_offset_ts + (i - myStartElem) * sizeof(DTYPE); - current_mram_block_addr_query = (uint32_t) DPU_MRAM_HEAP_POINTER; - - for(uint32_t j = 0; j < (query_length) / (BLOCK_SIZE / sizeof(DTYPE)); j++) - { - mram_read((__mram_ptr void const *) current_mram_block_addr_TS, cache_TS, BLOCK_SIZE); - mram_read((__mram_ptr void const *) current_mram_block_addr_TS + BLOCK_SIZE, cache_TS_aux, BLOCK_SIZE); - mram_read((__mram_ptr void const *) current_mram_block_addr_query, cache_query, BLOCK_SIZE); - - current_mram_block_addr_TS += BLOCK_SIZE; + current_mram_block_addr_TS = + (uint32_t) starting_offset_ts + (i - + myStartElem) * + sizeof(DTYPE); + current_mram_block_addr_query = + (uint32_t) DPU_MRAM_HEAP_POINTER; + + for (uint32_t j = 0; + j < (query_length) / (BLOCK_SIZE / sizeof(DTYPE)); j++) { + mram_read((__mram_ptr void const *) + current_mram_block_addr_TS, cache_TS, + BLOCK_SIZE); + mram_read((__mram_ptr void const *) + current_mram_block_addr_TS + BLOCK_SIZE, + cache_TS_aux, BLOCK_SIZE); + mram_read((__mram_ptr void const *) + current_mram_block_addr_query, cache_query, + BLOCK_SIZE); + + current_mram_block_addr_TS += BLOCK_SIZE; current_mram_block_addr_query += BLOCK_SIZE; - dot_product(cache_TS, cache_TS_aux, cache_query, cache_dotprods); + dot_product(cache_TS, cache_TS_aux, cache_query, + cache_dotprods); } - - mram_read((__mram_ptr void const *) current_mram_block_addr_TSMean, cache_TSMean, BLOCK_SIZE); - mram_read((__mram_ptr void const *) current_mram_block_addr_TSSigma, cache_TSSigma, BLOCK_SIZE); - current_mram_block_addr_TSMean += BLOCK_SIZE; + mram_read((__mram_ptr void const *) + current_mram_block_addr_TSMean, 
cache_TSMean, + BLOCK_SIZE); + mram_read((__mram_ptr void const *) + current_mram_block_addr_TSSigma, cache_TSSigma, + BLOCK_SIZE); + current_mram_block_addr_TSMean += BLOCK_SIZE; current_mram_block_addr_TSSigma += BLOCK_SIZE; - for (uint32_t k = 0; k < (BLOCK_SIZE / sizeof(DTYPE)); k++) - { - distance = 2 * ((DTYPE) query_length - (cache_dotprods[k] - (DTYPE) query_length * cache_TSMean[k] - * query_mean) / (cache_TSSigma[k] * query_std)); - - if(distance < min_distance) - { - min_distance = distance; - min_index = i + k; + for (uint32_t k = 0; k < (BLOCK_SIZE / sizeof(DTYPE)); k++) { + distance = + 2 * ((DTYPE) query_length - + (cache_dotprods[k] - + (DTYPE) query_length * cache_TSMean[k] + * query_mean) / (cache_TSSigma[k] * + query_std)); + + if (distance < min_distance) { + min_distance = distance; + min_index = i + k; } } } diff --git a/TS/host/app.c b/TS/host/app.c index b9faa9c..a19232b 100644 --- a/TS/host/app.c +++ b/TS/host/app.c @@ -31,23 +31,23 @@ #define MAX_DATA_VAL 127 static DTYPE tSeries[1 << 26]; -static DTYPE query [1 << 15]; -static DTYPE AMean [1 << 26]; -static DTYPE ASigma [1 << 26]; +static DTYPE query[1 << 15]; +static DTYPE AMean[1 << 26]; +static DTYPE ASigma[1 << 26]; static DTYPE minHost; static DTYPE minHostIdx; // Create input arrays -static DTYPE *create_test_file(unsigned int ts_elements, unsigned int query_elements) { +static DTYPE *create_test_file(unsigned int ts_elements, + unsigned int query_elements) +{ srand(0); - for (uint64_t i = 0; i < ts_elements; i++) - { + for (uint64_t i = 0; i < ts_elements; i++) { tSeries[i] = i % MAX_DATA_VAL; } - for (uint64_t i = 0; i < query_elements; i++) - { + for (uint64_t i = 0; i < query_elements; i++) { query[i] = i % MAX_DATA_VAL; } @@ -55,61 +55,62 @@ static DTYPE *create_test_file(unsigned int ts_elements, unsigned int query_elem } // Compute output in the host -static void streamp(DTYPE* tSeries, DTYPE* AMean, DTYPE* ASigma, int ProfileLength, - DTYPE* query, int queryLength, DTYPE 
queryMean, DTYPE queryStdDeviation) +static void streamp(DTYPE *tSeries, DTYPE *AMean, DTYPE *ASigma, + int ProfileLength, DTYPE *query, int queryLength, + DTYPE queryMean, DTYPE queryStdDeviation) { DTYPE distance; DTYPE dotprod; - minHost = INT32_MAX; + minHost = INT32_MAX; minHostIdx = 0; - for (int subseq = 0; subseq < ProfileLength; subseq++) - { + for (int subseq = 0; subseq < ProfileLength; subseq++) { dotprod = 0; - for(int j = 0; j < queryLength; j++) - { + for (int j = 0; j < queryLength; j++) { dotprod += tSeries[j + subseq] * query[j]; } - distance = 2 * (queryLength - (dotprod - queryLength * AMean[subseq] - * queryMean) / (ASigma[subseq] * queryStdDeviation)); + distance = + 2 * (queryLength - (dotprod - queryLength * AMean[subseq] + * queryMean) / (ASigma[subseq] * + queryStdDeviation)); - if(distance < minHost) - { + if (distance < minHost) { minHost = distance; minHostIdx = subseq; } } } -static void compute_ts_statistics(unsigned int timeSeriesLength, unsigned int ProfileLength, unsigned int queryLength) +static void compute_ts_statistics(unsigned int timeSeriesLength, + unsigned int ProfileLength, + unsigned int queryLength) { - double* ACumSum = malloc(sizeof(double) * timeSeriesLength); + double *ACumSum = malloc(sizeof(double) * timeSeriesLength); ACumSum[0] = tSeries[0]; for (uint64_t i = 1; i < timeSeriesLength; i++) ACumSum[i] = tSeries[i] + ACumSum[i - 1]; - double* ASqCumSum = malloc(sizeof(double) * timeSeriesLength); + double *ASqCumSum = malloc(sizeof(double) * timeSeriesLength); ASqCumSum[0] = tSeries[0] * tSeries[0]; for (uint64_t i = 1; i < timeSeriesLength; i++) ASqCumSum[i] = tSeries[i] * tSeries[i] + ASqCumSum[i - 1]; - double* ASum = malloc(sizeof(double) * ProfileLength); + double *ASum = malloc(sizeof(double) * ProfileLength); ASum[0] = ACumSum[queryLength - 1]; for (uint64_t i = 0; i < timeSeriesLength - queryLength; i++) ASum[i + 1] = ACumSum[queryLength + i] - ACumSum[i]; - double* ASumSq = malloc(sizeof(double) * 
ProfileLength); + double *ASumSq = malloc(sizeof(double) * ProfileLength); ASumSq[0] = ASqCumSum[queryLength - 1]; for (uint64_t i = 0; i < timeSeriesLength - queryLength; i++) ASumSq[i + 1] = ASqCumSum[queryLength + i] - ASqCumSum[i]; - double * AMean_tmp = malloc(sizeof(double) * ProfileLength); + double *AMean_tmp = malloc(sizeof(double) * ProfileLength); for (uint64_t i = 0; i < ProfileLength; i++) AMean_tmp[i] = ASum[i] / queryLength; - double* ASigmaSq = malloc(sizeof(double) * ProfileLength); + double *ASigmaSq = malloc(sizeof(double) * ProfileLength); for (uint64_t i = 0; i < ProfileLength; i++) ASigmaSq[i] = ASumSq[i] / queryLength - AMean[i] * AMean[i]; - for (uint64_t i = 0; i < ProfileLength; i++) - { + for (uint64_t i = 0; i < ProfileLength; i++) { ASigma[i] = sqrt(ASigmaSq[i]); - AMean[i] = (DTYPE) AMean_tmp[i]; + AMean[i] = (DTYPE) AMean_tmp[i]; } free(ACumSum); @@ -121,7 +122,8 @@ static void compute_ts_statistics(unsigned int timeSeriesLength, unsigned int Pr } // Main of the Host Application -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ // Timer declaration Timer timer; @@ -129,22 +131,22 @@ int main(int argc, char **argv) { struct Params p = input_params(argc, argv); struct dpu_set_t dpu_set, dpu; uint32_t nr_of_dpus; - uint32_t nr_of_ranks; + uint32_t nr_of_ranks; - // Allocate DPUs and load binary + // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD - DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - timer.time[0] = 0; // alloc + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + timer.time[0] = 0; // alloc #endif #if !WITH_LOAD_OVERHEAD - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); - assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); + 
assert(nr_of_dpus == NR_DPUS); + timer.time[1] = 0; // load #endif #if !WITH_FREE_OVERHEAD - timer.time[6] = 0; // free + timer.time[6] = 0; // free #endif #if ENERGY @@ -152,12 +154,15 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); #endif - unsigned long int ts_size = p.input_size_n; + unsigned long int ts_size = p.input_size_n; const unsigned int query_length = p.input_size_m; // Size adjustment - if(ts_size % (NR_DPUS * NR_TASKLETS*query_length)) - ts_size = ts_size + (NR_DPUS * NR_TASKLETS * query_length - ts_size % (NR_DPUS * NR_TASKLETS*query_length)); + if (ts_size % (NR_DPUS * NR_TASKLETS * query_length)) + ts_size = + ts_size + (NR_DPUS * NR_TASKLETS * query_length - + ts_size % (NR_DPUS * NR_TASKLETS * + query_length)); // Create an input file with arbitrary data create_test_file(ts_size, query_length); @@ -165,30 +170,34 @@ int main(int argc, char **argv) { DTYPE query_mean; double queryMean = 0; - for(unsigned i = 0; i < query_length; i++) queryMean += query[i]; - queryMean /= (double) query_length; + for (unsigned i = 0; i < query_length; i++) + queryMean += query[i]; + queryMean /= (double)query_length; query_mean = (DTYPE) queryMean; DTYPE query_std; double queryStdDeviation; double queryVariance = 0; - for(unsigned i = 0; i < query_length; i++) - { - queryVariance += (query[i] - queryMean) * (query[i] - queryMean); + for (unsigned i = 0; i < query_length; i++) { + queryVariance += + (query[i] - queryMean) * (query[i] - queryMean); } - queryVariance /= (double) query_length; + queryVariance /= (double)query_length; queryStdDeviation = sqrt(queryVariance); query_std = (DTYPE) queryStdDeviation; - DTYPE *bufferTS = tSeries; - DTYPE *bufferQ = query; - DTYPE *bufferAMean = AMean; + DTYPE *bufferTS = tSeries; + DTYPE *bufferQ = query; + DTYPE *bufferAMean = AMean; DTYPE *bufferASigma = ASigma; uint32_t slice_per_dpu = ts_size / NR_DPUS; unsigned int kernel = 0; - dpu_arguments_t input_arguments = {ts_size, 
query_length, query_mean, query_std, slice_per_dpu, 0, kernel}; + dpu_arguments_t input_arguments = + { ts_size, query_length, query_mean, query_std, slice_per_dpu, 0, + kernel + }; uint32_t mem_offset; dpu_result_t result; @@ -200,20 +209,20 @@ int main(int argc, char **argv) { for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { #if WITH_ALLOC_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 0, 0); } DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 0); } #endif #if WITH_LOAD_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 1, 0); } DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 1); } DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); @@ -229,54 +238,72 @@ int main(int argc, char **argv) { DPU_FOREACH(dpu_set, dpu) { input_arguments.exclusion_zone = 0; - DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGUMENTS", 0, (const void *) &input_arguments, sizeof(input_arguments))); + DPU_ASSERT(dpu_copy_to + (dpu, "DPU_INPUT_ARGUMENTS", 0, + (const void *)&input_arguments, + sizeof(input_arguments))); i++; } i = 0; mem_offset = 0; - DPU_FOREACH(dpu_set, dpu, i) - { + DPU_FOREACH(dpu_set, dpu, i) { DPU_ASSERT(dpu_prepare_xfer(dpu, bufferQ)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, query_length * sizeof(DTYPE), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, 0, + query_length * sizeof(DTYPE), DPU_XFER_DEFAULT)); i = 0; mem_offset += query_length * sizeof(DTYPE); DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, bufferTS + slice_per_dpu * i)); + DPU_ASSERT(dpu_prepare_xfer + (dpu, bufferTS + slice_per_dpu * i)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mem_offset,(slice_per_dpu + query_length)*sizeof(DTYPE), DPU_XFER_DEFAULT)); + 
DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, mem_offset, + (slice_per_dpu + query_length) * sizeof(DTYPE), + DPU_XFER_DEFAULT)); mem_offset += ((slice_per_dpu + query_length) * sizeof(DTYPE)); i = 0; DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, bufferAMean + slice_per_dpu * i)); + DPU_ASSERT(dpu_prepare_xfer + (dpu, bufferAMean + slice_per_dpu * i)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mem_offset, (slice_per_dpu + query_length)*sizeof(DTYPE), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, mem_offset, + (slice_per_dpu + query_length) * sizeof(DTYPE), + DPU_XFER_DEFAULT)); i = 0; mem_offset += ((slice_per_dpu + query_length) * sizeof(DTYPE)); DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, bufferASigma + slice_per_dpu * i)); + DPU_ASSERT(dpu_prepare_xfer + (dpu, bufferASigma + slice_per_dpu * i)); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, mem_offset, (slice_per_dpu + query_length)*sizeof(DTYPE), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, mem_offset, + (slice_per_dpu + query_length) * sizeof(DTYPE), + DPU_XFER_DEFAULT)); if (rep >= p.n_warmup) { stop(&timer, 2); } - // Run kernel on DPUs - if (rep >= p.n_warmup) - { + if (rep >= p.n_warmup) { start(&timer, 3, 0); #if ENERGY DPU_ASSERT(dpu_probe_start(&probe)); @@ -285,37 +312,49 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - if (rep >= p.n_warmup) - { + if (rep >= p.n_warmup) { stop(&timer, 3); #if ENERGY DPU_ASSERT(dpu_probe_stop(&probe)); #endif } - dpu_result_t* results_retrieve[NR_DPUS]; + dpu_result_t *results_retrieve[NR_DPUS]; if (rep >= p.n_warmup) { start(&timer, 4, 0); } DPU_FOREACH(dpu_set, dpu, i) { - results_retrieve[i] = (dpu_result_t*)malloc(NR_TASKLETS * sizeof(dpu_result_t)); + 
results_retrieve[i] = + (dpu_result_t *) malloc(NR_TASKLETS * + sizeof(dpu_result_t)); } - DPU_FOREACH(dpu_set, dpu, i) { DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i])); } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_result_t), DPU_XFER_DEFAULT)); + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, + NR_TASKLETS * sizeof(dpu_result_t), + DPU_XFER_DEFAULT)); i = 0; DPU_FOREACH(dpu_set, dpu, i) { - for (unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++) { - if(results_retrieve[i][each_tasklet].minValue < result.minValue && results_retrieve[i][each_tasklet].minValue > 0) - { - result.minValue = results_retrieve[i][each_tasklet].minValue; - result.minIndex = (DTYPE)results_retrieve[i][each_tasklet].minIndex + (i * slice_per_dpu); + for (unsigned int each_tasklet = 0; + each_tasklet < NR_TASKLETS; each_tasklet++) { + if (results_retrieve[i][each_tasklet].minValue < + result.minValue + && + results_retrieve[i][each_tasklet].minValue > + 0) { + result.minValue = + results_retrieve[i] + [each_tasklet].minValue; + result.minIndex = (DTYPE) + results_retrieve[i] + [each_tasklet].minIndex + + (i * slice_per_dpu); } } @@ -323,11 +362,9 @@ int main(int argc, char **argv) { i++; } - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 4); } - - #if PRINT printf("LOGS\n"); DPU_FOREACH(dpu_set, dpu) { @@ -337,13 +374,13 @@ int main(int argc, char **argv) { #if WITH_ALLOC_OVERHEAD #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { start(&timer, 5, 0); } #endif DPU_ASSERT(dpu_free(dpu_set)); #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { + if (rep >= p.n_warmup) { stop(&timer, 5); } #endif @@ -352,52 +389,83 @@ int main(int argc, char **argv) { if (rep >= p.n_warmup) { start(&timer, 6, 0); } - streamp(tSeries, AMean, ASigma, ts_size - query_length - 1, query, query_length, query_mean, query_std); - if(rep >= p.n_warmup) { + 
streamp(tSeries, AMean, ASigma, ts_size - query_length - 1, + query, query_length, query_mean, query_std); + if (rep >= p.n_warmup) { stop(&timer, 6); } int status = (minHost == result.minValue); if (status) { - printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n"); + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET + "] results are equal\n"); if (rep >= p.n_warmup) { - printf("[::] TS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu", - NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(DTYPE), BLOCK_SIZE, ts_size); - printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d ", - WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD); - printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f latency_cpu_us=%f ", - timer.time[0], // alloc - timer.time[1], // load - timer.time[2], // write - timer.time[3], // kernel - timer.time[4], // read - timer.time[5], // free - timer.time[6]); // CPU - printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", - ts_size * sizeof(DTYPE) / timer.time[6], - ts_size * sizeof(DTYPE) / (timer.time[3]), - ts_size * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5])); - printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", - ts_size * sizeof(DTYPE) / (timer.time[2] + timer.time[3] + timer.time[4]), - ts_size * sizeof(DTYPE) / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]), - ts_size * sizeof(DTYPE) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4])); - printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", - ts_size / timer.time[6], - ts_size / (timer.time[3]), - ts_size / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + 
timer.time[5])); - printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", - ts_size / (timer.time[2] + timer.time[3] + timer.time[4]), - ts_size / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]), - ts_size / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4])); + printf + ("[::] TS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu", + NR_DPUS, nr_of_ranks, NR_TASKLETS, + XSTR(DTYPE), BLOCK_SIZE, ts_size); + printf + (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d ", + WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, + WITH_FREE_OVERHEAD); + printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f latency_cpu_us=%f ", timer.time[0], // alloc + timer.time[1], // load + timer.time[2], // write + timer.time[3], // kernel + timer.time[4], // read + timer.time[5], // free + timer.time[6]); // CPU + printf + (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", + ts_size * sizeof(DTYPE) / timer.time[6], + ts_size * sizeof(DTYPE) / (timer.time[3]), + ts_size * sizeof(DTYPE) / (timer.time[0] + + timer.time[1] + + timer.time[2] + + timer.time[3] + + timer.time[4] + + timer.time[5])); + printf + (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", + ts_size * sizeof(DTYPE) / (timer.time[2] + + timer.time[3] + + timer.time[4]), + ts_size * sizeof(DTYPE) / (timer.time[1] + + timer.time[2] + + timer.time[3] + + timer.time[4]), + ts_size * sizeof(DTYPE) / (timer.time[0] + + timer.time[1] + + timer.time[2] + + timer.time[3] + + timer.time[4])); + printf + (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", + ts_size / timer.time[6], + ts_size / (timer.time[3]), + ts_size / (timer.time[0] + timer.time[1] + + timer.time[2] + timer.time[3] + + 
timer.time[4] + timer.time[5])); + printf + (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", + ts_size / (timer.time[2] + timer.time[3] + + timer.time[4]), + ts_size / (timer.time[1] + timer.time[2] + + timer.time[3] + timer.time[4]), + ts_size / (timer.time[0] + timer.time[1] + + timer.time[2] + timer.time[3] + + timer.time[4])); } } else { - printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] results differ!\n"); + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET + "] results differ!\n"); } } #if ENERGY double acc_energy, avg_energy, acc_time, avg_time; - DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy)); + DPU_ASSERT(dpu_probe_get + (&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy)); DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy)); DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time)); DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time)); @@ -407,7 +475,6 @@ int main(int argc, char **argv) { printf("Energy (J): %f J\t", avg_energy); #endif - #if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_free(dpu_set)); #endif diff --git a/TS/support/common.h b/TS/support/common.h index b120bb1..7585b90 100755 --- a/TS/support/common.h +++ b/TS/support/common.h @@ -14,30 +14,30 @@ #define DTYPE int32_t #define DTYPE_MAX INT32_MAX -typedef struct { +typedef struct { uint32_t ts_length; - uint32_t query_length; - DTYPE query_mean; - DTYPE query_std; - uint32_t slice_per_dpu; - int32_t exclusion_zone; - enum kernels { + uint32_t query_length; + DTYPE query_mean; + DTYPE query_std; + uint32_t slice_per_dpu; + int32_t exclusion_zone; + enum kernels { kernel1 = 0, nr_kernels = 1, } kernel; -}dpu_arguments_t; +} dpu_arguments_t; -typedef struct { - DTYPE minValue; - uint32_t minIndex; - DTYPE maxValue; - uint32_t maxIndex; -}dpu_result_t; +typedef struct { + DTYPE minValue; + uint32_t minIndex; + DTYPE maxValue; + uint32_t maxIndex; +} dpu_result_t; #ifndef 
ENERGY #define ENERGY 0 #endif -#define PRINT 0 +#define PRINT 0 #define ANSI_COLOR_RED "\x1b[31m" #define ANSI_COLOR_GREEN "\x1b[32m" diff --git a/TS/support/params.h b/TS/support/params.h index 4668604..b7d9763 100644 --- a/TS/support/params.h +++ b/TS/support/params.h @@ -5,54 +5,63 @@ // Params --------------------------------------------------------------------- typedef struct Params { - unsigned long input_size_n; - unsigned long input_size_m; - int n_warmup; - int n_reps; -}Params; + unsigned long input_size_n; + unsigned long input_size_m; + int n_warmup; + int n_reps; +} Params; -void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -w <W> # of untimed warmup iterations (default=1)" - "\n -e <E> # of timed repetition iterations (default=3)" - "\n" - "\nBenchmark-specific options:" - "\n -n <n> n (TS length. Default=64K elements)" - "\n -m <m> m (Query length. Default=256 elements)" - "\n"); - } +void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n" + "\nBenchmark-specific options:" + "\n -n <n> n (TS length. Default=64K elements)" + "\n -m <m> m (Query length. 
Default=256 elements)" "\n"); +} - struct Params input_params(int argc, char **argv) { - struct Params p; - p.input_size_n = 1 << 16; - p.input_size_m = 1 << 8; +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.input_size_n = 1 << 16; + p.input_size_m = 1 << 8; - p.n_warmup = 1; - p.n_reps = 3; + p.n_warmup = 1; + p.n_reps = 3; - int opt; - while((opt = getopt(argc, argv, "hw:e:n:m:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - case 'n': p.input_size_n = atol(optarg); break; - case 'm': p.input_size_m = atol(optarg); break; - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(NR_DPUS > 0 && "Invalid # of dpus!"); + int opt; + while ((opt = getopt(argc, argv, "hw:e:n:m:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + case 'n': + p.input_size_n = atol(optarg); + break; + case 'm': + p.input_size_m = atol(optarg); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(NR_DPUS > 0 && "Invalid # of dpus!"); - return p; - } + return p; +} #endif diff --git a/TS/support/timer.h b/TS/support/timer.h index ff1ae1b..c569de7 100755 --- a/TS/support/timer.h +++ b/TS/support/timer.h @@ -1,66 +1,74 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + */ + +#include <sys/time.h> + +typedef struct Timer { + + struct timeval startTime[7]; + struct timeval stopTime[7]; + double time[7]; + +} Timer; + +void start(Timer *timer, int i, int rep) +{ + if (rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) +{ + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += + (timer->stopTime[i].tv_sec - + timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); +} + +void print(Timer *timer, int i, int REP) +{ + printf("%f\t", timer->time[i] / (1000 * REP)); +} + +void printall(Timer *timer, int maxt) +{ + for (int i = 0; i <= maxt; i++) { + printf(" timer%d_us=%f", i, timer->time[i]); + } + printf("\n"); +} diff --git a/UNI/baselines/cpu/Makefile b/UNI/baselines/cpu/Makefile index ec3f403..bbf9db0 100644 --- a/UNI/baselines/cpu/Makefile +++ b/UNI/baselines/cpu/Makefile @@ -4,7 +4,7 @@ all: uni TYPE ?= int64_t uni: app_baseline.c - gcc -O2 -o uni -fopenmp -DT=${TYPE} app_baseline.c + gcc -Wall -Wextra -pedantic -march=native -O2 -o uni -fopenmp -DT=${TYPE} app_baseline.c uni_O0: app_baseline.c gcc -o uni_O0 -fopenmp app_baseline.c diff --git a/VA/baselines/cpu/Makefile b/VA/baselines/cpu/Makefile index 76a82e1..04aacb6 100644 --- a/VA/baselines/cpu/Makefile +++ b/VA/baselines/cpu/Makefile @@ -1,9 +1,23 @@ -NUMA ?= 0 -NUMA_MEMCPY ?= 0 -FLAGS = +benchmark ?= 1 +debug ?= 0 +native ?= 1 +nop_sync ?= 0 +numa ?= 0 +numa_memcpy ?= 0 + +CFLAGS = +LDFLAGS = + +ifeq (${debug}, 1) + CFLAGS += -g +endif + +ifeq (${native}, 1) + CFLAGS += -march=native +endif -ifeq (${NUMA}, 1) - FLAGS += -lnuma +ifeq (${numa}, 1) + LDFLAGS += -lnuma endif .PHONY: all @@ -12,7 +26,7 @@ all: va TYPE ?= int32_t va: app_baseline.c - gcc -Wall -Wextra -pedantic -march=native -O2 -o va -fopenmp -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} -DT=${TYPE} app_baseline.c ${FLAGS} + gcc -Wall -Wextra -pedantic ${CFLAGS} -O3 -o va -fopenmp 
-DNUMA=${numa} -DNUMA_MEMCPY=${numa_memcpy} -DNOP_SYNC=${nop_sync} -DWITH_BENCHMARK=${benchmark} -DT=${TYPE} app_baseline.c ${LDFLAGS} va_O0: app_baseline.c gcc -o va_O0 -fopenmp app_baseline.c diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c index 4c8610a..7975200 100644 --- a/VA/baselines/cpu/app_baseline.c +++ b/VA/baselines/cpu/app_baseline.c @@ -13,13 +13,19 @@ #include <stdint.h> #include <omp.h> + +#if WITH_BENCHMARK #include "../../support/timer.h" +#else +#define start(...) +#define stop(...) +#endif #if NUMA #include <numaif.h> #include <numa.h> -void* mp_pages[1]; +void *mp_pages[1]; int mp_status[1]; int mp_nodes[1]; int numa_node_in = -1; @@ -49,301 +55,345 @@ static T *B_local; /** * @brief compute output in the host */ -static void vector_addition_host(unsigned int nr_elements, int t) { - omp_set_num_threads(t); - #pragma omp parallel for - for (int i = 0; i < nr_elements; i++) { +static void vector_addition_host(unsigned long nr_elements, int t) +{ + omp_set_num_threads(t); +#pragma omp parallel for + for (long i = 0; i < nr_elements; i++) { #if NUMA_MEMCPY - C[i] = A_local[i] + B_local[i]; + C[i] = A_local[i] + B_local[i]; #else - C[i] = A[i] + B[i]; + C[i] = A[i] + B[i]; #endif - } + } } // Params --------------------------------------------------------------------- typedef struct Params { - int input_size; - int n_warmup; - int n_reps; - int exp; - int n_threads; + long input_size; + int n_warmup; + int n_reps; + int exp; + int n_threads; #if NUMA - struct bitmask* bitmask_in; - struct bitmask* bitmask_out; - int numa_node_cpu; + struct bitmask *bitmask_in; + struct bitmask *bitmask_out; + int numa_node_cpu; #endif #if NUMA_MEMCPY - int numa_node_cpu_memcpy; - struct bitmask* bitmask_cpu; + int numa_node_cpu_memcpy; + struct bitmask *bitmask_cpu; #endif -}Params; - -void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -t <T> # of threads (default=8)" - 
"\n -w <W> # of untimed warmup iterations (default=1)" - "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1) scaling (default=0)" - "\n" - "\nBenchmark-specific options:" - "\n -i <I> input size (default=8M elements)" - "\n"); +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -t <T> # of threads (default=8)" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n -x <X> Weak (0) or strong (1) scaling (default=0)" + "\n" + "\nBenchmark-specific options:" + "\n -i <I> input size (default=8M elements)" "\n"); } -struct Params input_params(int argc, char **argv) { - struct Params p; - p.input_size = 16777216; - p.n_warmup = 1; - p.n_reps = 3; - p.exp = 1; - p.n_threads = 5; +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.input_size = 16777216; + p.n_warmup = 1; + p.n_reps = 3; + p.exp = 1; + p.n_threads = 5; #if NUMA - p.bitmask_in = NULL; - p.bitmask_out = NULL; - p.numa_node_cpu = -1; + p.bitmask_in = NULL; + p.bitmask_out = NULL; + p.numa_node_cpu = -1; #endif #if NUMA_MEMCPY - p.numa_node_cpu_memcpy = -1; - p.bitmask_cpu = NULL; + p.numa_node_cpu_memcpy = -1; + p.bitmask_cpu = NULL; #endif - int opt; - while((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'i': p.input_size = atoi(optarg); break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - case 'x': p.exp = atoi(optarg); break; - case 't': p.n_threads = atoi(optarg); break; + int opt; + while ((opt = getopt(argc, argv, "hi:w:e:x:t:a:b:c:C:M:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'i': + p.input_size = atol(optarg); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + case 'x': + p.exp 
= atoi(optarg); + break; + case 't': + p.n_threads = atoi(optarg); + break; #if NUMA - case 'a': p.bitmask_in = numa_parse_nodestring(optarg); break; - case 'b': p.bitmask_out = numa_parse_nodestring(optarg); break; - case 'c': p.numa_node_cpu = atoi(optarg); break; + case 'a': + p.bitmask_in = numa_parse_nodestring(optarg); + break; + case 'b': + p.bitmask_out = numa_parse_nodestring(optarg); + break; + case 'c': + p.numa_node_cpu = atoi(optarg); + break; #if NUMA_MEMCPY - case 'C': p.bitmask_cpu = numa_parse_nodestring(optarg); break; - case 'M': p.numa_node_cpu_memcpy = atoi(optarg); break; -#endif // NUMA_MEMCPY -#endif // NUMA - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(p.n_threads > 0 && "Invalid # of ranks!"); - - return p; + case 'C': + p.bitmask_cpu = numa_parse_nodestring(optarg); + break; + case 'M': + p.numa_node_cpu_memcpy = atoi(optarg); + break; +#endif // NUMA_MEMCPY +#endif // NUMA + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(p.n_threads > 0 && "Invalid # of ranks!"); + + return p; } /** * @brief Main of the Host Application. */ -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ - struct Params p = input_params(argc, argv); + struct Params p = input_params(argc, argv); - const unsigned int input_size = p.exp == 0 ? p.input_size * p.n_threads : p.input_size; + const unsigned long input_size = + p.exp == 0 ? p.input_size * p.n_threads : p.input_size; - // Create an input file with arbitrary data. + // Create an input file with arbitrary data. 
/** * @brief creates a "test file" by filling a buffer of 64MB with pseudo-random values * @param nr_elements how many 32-bit elements we want the file to be * @return the buffer address */ - srand(0); + srand(0); #if NUMA - if (p.bitmask_in) { - numa_set_membind(p.bitmask_in); - numa_free_nodemask(p.bitmask_in); - } - A = (T*) numa_alloc(input_size * sizeof(T)); - B = (T*) numa_alloc(input_size * sizeof(T)); + if (p.bitmask_in) { + numa_set_membind(p.bitmask_in); + numa_free_nodemask(p.bitmask_in); + } + A = (T *) numa_alloc(input_size * sizeof(T)); + B = (T *) numa_alloc(input_size * sizeof(T)); #else - A = (T*) malloc(input_size * sizeof(T)); - B = (T*) malloc(input_size * sizeof(T)); + A = (T *) malloc(input_size * sizeof(T)); + B = (T *) malloc(input_size * sizeof(T)); #endif #if NUMA - if (p.bitmask_out) { - numa_set_membind(p.bitmask_out); - numa_free_nodemask(p.bitmask_out); - } - C = (T*) numa_alloc(input_size * sizeof(T)); + if (p.bitmask_out) { + numa_set_membind(p.bitmask_out); + numa_free_nodemask(p.bitmask_out); + } + C = (T *) numa_alloc(input_size * sizeof(T)); #else - C = (T*) malloc(input_size * sizeof(T)); + C = (T *) malloc(input_size * sizeof(T)); #endif - for (unsigned int i = 0; i < input_size; i++) { - A[i] = (T) (rand()); - B[i] = (T) (rand()); - } + for (unsigned long i = 0; i < input_size; i++) { + A[i] = (T) (rand()); + B[i] = (T) (rand()); + } #if NUMA #if NUMA_MEMCPY - if (p.bitmask_cpu) { - numa_set_membind(p.bitmask_cpu); - numa_free_nodemask(p.bitmask_cpu); - } + if (p.bitmask_cpu) { + numa_set_membind(p.bitmask_cpu); + numa_free_nodemask(p.bitmask_cpu); + } #else - struct bitmask *bitmask_all = numa_allocate_nodemask(); - numa_bitmask_setall(bitmask_all); - numa_set_membind(bitmask_all); - numa_free_nodemask(bitmask_all); -#endif // NUMA_MEMCPY -#endif // NUMA + struct bitmask *bitmask_all = numa_allocate_nodemask(); + numa_bitmask_setall(bitmask_all); + numa_set_membind(bitmask_all); + numa_free_nodemask(bitmask_all); +#endif // 
NUMA_MEMCPY +#endif // NUMA #if NUMA - mp_pages[0] = A; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(A)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_in = mp_status[0]; - } - - mp_pages[0] = C; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(C)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_out = mp_status[0]; - } - - numa_node_cpu = p.numa_node_cpu; - if (p.numa_node_cpu != -1) { - if (numa_run_on_node(p.numa_node_cpu) == -1) { - perror("numa_run_on_node"); - numa_node_cpu = -1; - } - } + mp_pages[0] = A; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_in = mp_status[0]; + } + + mp_pages[0] = C; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(C)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_out = mp_status[0]; + } + + numa_node_cpu = p.numa_node_cpu; + if (p.numa_node_cpu != -1) { + if (numa_run_on_node(p.numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } #endif #if NUMA_MEMCPY - numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1; + numa_node_in_is_local = ((numa_node_cpu == numa_node_in) + || (numa_node_cpu + 8 == numa_node_in)) * 1; +#endif + +#if WITH_BENCHMARK + Timer timer; #endif - Timer timer; +#if NOP_SYNC + for (int rep = 0; rep < 200000; rep++) { + asm volatile ("nop"::); + } +#endif - for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { #if NUMA_MEMCPY - numa_node_cpu_memcpy = p.numa_node_cpu_memcpy; - start(&timer, 1, 0); - if (!numa_node_in_is_local) { - A_local = (T*) 
numa_alloc(input_size * sizeof(T)); - B_local = (T*) numa_alloc(input_size * sizeof(T)); - } - stop(&timer, 1); - if (!numa_node_in_is_local) { - if (p.numa_node_cpu_memcpy != -1) { - if (numa_run_on_node(p.numa_node_cpu_memcpy) == -1) { - perror("numa_run_on_node"); - numa_node_cpu_memcpy = -1; - } - } - } - start(&timer, 2, 0); - if (!numa_node_in_is_local) { - memcpy(A_local, A, input_size * sizeof(T)); - memcpy(B_local, B, input_size * sizeof(T)); - } else { - A_local = A; - B_local = B; - } - stop(&timer, 2); - if (p.numa_node_cpu != -1) { - if (numa_run_on_node(p.numa_node_cpu) == -1) { - perror("numa_run_on_node"); - numa_node_cpu = -1; - } - } - mp_pages[0] = A_local; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(A_local)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_local = mp_status[0]; - } + numa_node_cpu_memcpy = p.numa_node_cpu_memcpy; + start(&timer, 1, 0); + if (!numa_node_in_is_local) { + A_local = (T *) numa_alloc(input_size * sizeof(T)); + B_local = (T *) numa_alloc(input_size * sizeof(T)); + } + stop(&timer, 1); + if (!numa_node_in_is_local) { + if (p.numa_node_cpu_memcpy != -1) { + if (numa_run_on_node(p.numa_node_cpu_memcpy) == + -1) { + perror("numa_run_on_node"); + numa_node_cpu_memcpy = -1; + } + } + } + start(&timer, 2, 0); + if (!numa_node_in_is_local) { + memcpy(A_local, A, input_size * sizeof(T)); + memcpy(B_local, B, input_size * sizeof(T)); + } else { + A_local = A; + B_local = B; + } + stop(&timer, 2); + if (p.numa_node_cpu != -1) { + if (numa_run_on_node(p.numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } + mp_pages[0] = A_local; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A_local)"); + } else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } else { + numa_node_local = mp_status[0]; + } #endif - start(&timer, 0, 0); - 
vector_addition_host(input_size, p.n_threads); - stop(&timer, 0); + start(&timer, 0, 0); + vector_addition_host(input_size, p.n_threads); + stop(&timer, 0); #if NUMA_MEMCPY - start(&timer, 3, 0); - if (!numa_node_in_is_local) { - numa_free(A_local, input_size * sizeof(T)); - numa_free(B_local, input_size * sizeof(T)); - } - stop(&timer, 3); + start(&timer, 3, 0); + if (!numa_node_in_is_local) { + numa_free(A_local, input_size * sizeof(T)); + numa_free(B_local, input_size * sizeof(T)); + } + stop(&timer, 3); #endif - unsigned int nr_threads = 0; +#if WITH_BENCHMARK + unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic - nr_threads++; + nr_threads++; - if (rep >= p.n_warmup) { + if (rep >= p.n_warmup) { #if NUMA_MEMCPY - printf("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%d" - " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" - " | throughput_MBps=%f", - nr_threads, XSTR(T), input_size, - numa_node_in, numa_node_local, numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), - input_size * 3 * sizeof(T) / timer.time[0]); - printf(" throughput_MOpps=%f", - input_size / timer.time[0]); - printf(" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", - timer.time[0], timer.time[1], timer.time[2], timer.time[3], - timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]); + printf + ("[::] VA-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%ld" + " numa_node_in=%d numa_node_local=%d numa_node_out=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" + " | throughput_MBps=%f", nr_threads, XSTR(T), + input_size, numa_node_in, numa_node_local, + numa_node_out, numa_node_cpu, numa_node_cpu_memcpy, + numa_distance(numa_node_in, numa_node_cpu), + numa_distance(numa_node_cpu, 
numa_node_out), + input_size * 3 * sizeof(T) / timer.time[0]); + printf(" throughput_MOpps=%f", + input_size / timer.time[0]); + printf + (" latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", + timer.time[0], timer.time[1], timer.time[2], + timer.time[3], + timer.time[0] + timer.time[1] + timer.time[2] + + timer.time[3]); #else - printf("[::] VA-CPU | n_threads=%d e_type=%s n_elements=%d" + printf + ("[::] VA-CPU | n_threads=%d e_type=%s n_elements=%ld" #if NUMA - " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" + " numa_node_in=%d numa_node_out=%d numa_node_cpu=%d numa_distance_in_cpu=%d numa_distance_cpu_out=%d" #endif - " | throughput_MBps=%f", - nr_threads, XSTR(T), input_size, + " | throughput_MBps=%f", + nr_threads, XSTR(T), input_size, #if NUMA - numa_node_in, numa_node_out, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), numa_distance(numa_node_cpu, numa_node_out), + numa_node_in, numa_node_out, numa_node_cpu, + numa_distance(numa_node_in, numa_node_cpu), + numa_distance(numa_node_cpu, numa_node_out), +#endif + input_size * 3 * sizeof(T) / timer.time[0]); + printf(" throughput_MOpps=%f", + input_size / timer.time[0]); + printf(" latency_us=%f\n", timer.time[0]); +#endif // NUMA_MEMCPY + } +#endif // WITH_BENCHMARK + } + +#if NOP_SYNC + for (int rep = 0; rep < 200000; rep++) { + asm volatile ("nop"::); + } #endif - input_size * 3 * sizeof(T) / timer.time[0]); - printf(" throughput_MOpps=%f", - input_size / timer.time[0]); - printf(" latency_us=%f\n", - timer.time[0]); -#endif // NUMA_MEMCPY - } - } #if NUMA - numa_free(A, input_size * sizeof(T)); - numa_free(B, input_size * sizeof(T)); - numa_free(C, input_size * sizeof(T)); + numa_free(A, input_size * sizeof(T)); + numa_free(B, input_size * sizeof(T)); + numa_free(C, input_size * sizeof(T)); #else - free(A); - free(B); - free(C); + free(A); + free(B); + free(C); #endif - return 0; - } + 
return 0; +} diff --git a/VA/baselines/cpu/run-perf.sh b/VA/baselines/cpu/run-perf.sh new file mode 100755 index 0000000..8075256 --- /dev/null +++ b/VA/baselines/cpu/run-perf.sh @@ -0,0 +1,6 @@ +#!/bin/zsh + +make -B numa=1 + +perf stat record -o t1.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./va -a 4 -b 4 -c 4 -t 1 -e 20 -w 0 -i 167772160 +perf stat record -o t4.perf -e ${(j:,:):-$(grep -v '^#' ../../../perf-events.txt | cut -d ' ' -f 1)} ./va -a 4 -b 4 -c 4 -t 4 -e 20 -w 0 -i 167772160 diff --git a/VA/baselines/cpu/run-ws.sh b/VA/baselines/cpu/run-ws.sh new file mode 100755 index 0000000..ccc4993 --- /dev/null +++ b/VA/baselines/cpu/run-ws.sh @@ -0,0 +1,6 @@ +#!/bin/zsh + +make -B benchmark=0 debug=1 native=0 nop_sync=1 numa=1 + +~/var/source/valgrind/vg-in-place --tool=ws --ws-file=t1.ws --ws-peak-detect=yes --ws-every=50000 --ws-track-locality=yes ./va -a 4 -b 4 -c 4 -t 1 -e 20 -w 0 -i 16777216 +~/var/source/valgrind/vg-in-place --tool=ws --ws-file=t4.ws --ws-peak-detect=yes --ws-every=50000 --ws-track-locality=yes ./va -a 4 -b 4 -c 4 -t 4 -e 20 -w 0 -i 16777216 diff --git a/VA/dpu/task.c b/VA/dpu/task.c index bb41303..9622911 100644 --- a/VA/dpu/task.c +++ b/VA/dpu/task.c @@ -15,10 +15,11 @@ __host dpu_arguments_t DPU_INPUT_ARGUMENTS; // vector_addition: Computes the vector addition of a cached block -static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size) { - for (unsigned int i = 0; i < l_size; i++){ - bufferB[i] += bufferA[i]; - } +static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size) +{ + for (unsigned int i = 0; i < l_size; i++) { + bufferB[i] += bufferA[i]; + } } // Barrier @@ -26,53 +27,67 @@ BARRIER_INIT(my_barrier, NR_TASKLETS); extern int main_kernel1(void); -int (*kernels[nr_kernels])(void) = {main_kernel1}; +int (*kernels[nr_kernels])(void) = { main_kernel1 }; -int main(void) { - // Kernel - return kernels[DPU_INPUT_ARGUMENTS.kernel](); +int main(void) +{ + // Kernel + return 
kernels[DPU_INPUT_ARGUMENTS.kernel] (); } // main_kernel1 -int main_kernel1() { - unsigned int tasklet_id = me(); +int main_kernel1() +{ + unsigned int tasklet_id = me(); #if PRINT - printf("tasklet_id = %u\n", tasklet_id); + printf("tasklet_id = %u\n", tasklet_id); #endif - if (tasklet_id == 0){ // Initialize once the cycle counter - mem_reset(); // Reset the heap - } - // Barrier - barrier_wait(&my_barrier); - - uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; // Input size per DPU in bytes - uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes - - // Address of the current processing block in MRAM - uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; - uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER; - uint32_t mram_base_addr_B = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer); - - // Initialize a local cache to store the MRAM block - T *cache_A = (T *) mem_alloc(BLOCK_SIZE); - T *cache_B = (T *) mem_alloc(BLOCK_SIZE); - - for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){ - - // Bound checking - uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? 
(input_size_dpu_bytes - byte_index) : BLOCK_SIZE; - - // Load cache with current MRAM block - mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes); - mram_read((__mram_ptr void const*)(mram_base_addr_B + byte_index), cache_B, l_size_bytes); - - // Computer vector addition - vector_addition(cache_B, cache_A, l_size_bytes >> DIV); - - // Write cache to current MRAM block - mram_write(cache_B, (__mram_ptr void*)(mram_base_addr_B + byte_index), l_size_bytes); - - } - - return 0; + if (tasklet_id == 0) { // Initialize once the cycle counter + mem_reset(); // Reset the heap + } + // Barrier + barrier_wait(&my_barrier); + + uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; // Input size per DPU in bytes + uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes + + // Address of the current processing block in MRAM + uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; + uint32_t mram_base_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER; + uint32_t mram_base_addr_B = + (uint32_t) (DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer); + + // Initialize a local cache to store the MRAM block + T *cache_A = (T *) mem_alloc(BLOCK_SIZE); + T *cache_B = (T *) mem_alloc(BLOCK_SIZE); + + for (unsigned int byte_index = base_tasklet; + byte_index < input_size_dpu_bytes; + byte_index += BLOCK_SIZE * NR_TASKLETS) { + + // Bound checking + uint32_t l_size_bytes = + (byte_index + BLOCK_SIZE >= + input_size_dpu_bytes) ? 
(input_size_dpu_bytes - + byte_index) : BLOCK_SIZE; + + // Load cache with current MRAM block + mram_read((__mram_ptr void const *)(mram_base_addr_A + + byte_index), cache_A, + l_size_bytes); + mram_read((__mram_ptr void const *)(mram_base_addr_B + + byte_index), cache_B, + l_size_bytes); + + // Computer vector addition + vector_addition(cache_B, cache_A, l_size_bytes >> DIV); + + // Write cache to current MRAM block + mram_write(cache_B, + (__mram_ptr void *)(mram_base_addr_B + byte_index), + l_size_bytes); + + } + + return 0; } diff --git a/VA/host/app.c b/VA/host/app.c index 5fe3f61..1a2cdfd 100644 --- a/VA/host/app.c +++ b/VA/host/app.c @@ -33,296 +33,361 @@ #include <dpu_target_macros.h> // Pointer declaration -static T* A; -static T* B; -static T* C; -static T* C2; +static T *A; +static T *B; +static T *C; +static T *C2; // Create input arrays -static void read_input(T* A, T* B, unsigned int nr_elements) { - srand(0); - for (unsigned int i = 0; i < nr_elements; i++) { - A[i] = (T) (rand()); - B[i] = (T) (rand()); - } +static void read_input(T *A, T *B, unsigned int nr_elements) +{ + srand(0); + for (unsigned int i = 0; i < nr_elements; i++) { + A[i] = (T) (rand()); + B[i] = (T) (rand()); + } } // Compute output in the host -static void vector_addition_host(T* C, T* A, T* B, unsigned int nr_elements) { - for (unsigned int i = 0; i < nr_elements; i++) { - C[i] = A[i] + B[i]; - } +static void vector_addition_host(T *C, T *A, T *B, unsigned int nr_elements) +{ + for (unsigned int i = 0; i < nr_elements; i++) { + C[i] = A[i] + B[i]; + } } // Main of the Host Application -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ - struct Params p = input_params(argc, argv); + struct Params p = input_params(argc, argv); - struct dpu_set_t dpu_set, dpu; - uint32_t nr_of_dpus; - uint32_t nr_of_ranks; + struct dpu_set_t dpu_set, dpu; + uint32_t nr_of_dpus; + uint32_t nr_of_ranks; #if ENERGY - struct dpu_probe_t probe; - 
DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); + struct dpu_probe_t probe; + DPU_ASSERT(dpu_probe_init("energy_probe", &probe)); #endif - // Timer declaration - Timer timer; + // Timer declaration + Timer timer; - int numa_node_rank = -2; + int numa_node_rank = -2; - // Allocate DPUs and load binary + // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD - DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - timer.time[0] = 0; // alloc + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + timer.time[0] = 0; // alloc #endif #if !WITH_LOAD_OVERHEAD - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); - assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); + assert(nr_of_dpus == NR_DPUS); + timer.time[1] = 0; // load #endif #if !WITH_FREE_OVERHEAD - timer.time[6] = 0; // free + timer.time[6] = 0; // free #endif - unsigned int i = 0; - const unsigned int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size; - const unsigned int input_size_8bytes = - ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned - const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.) - const unsigned int input_size_dpu_8bytes = - ((input_size_dpu * sizeof(T)) % 8) != 0 ? 
roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned - - // Input/output allocation - A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); - B = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); - C = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); - C2 = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); - T *bufferA = A; - T *bufferB = B; - T *bufferC = C2; - - // Create an input file with arbitrary data - read_input(A, B, input_size); - - // Loop over main kernel - for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + unsigned int i = 0; + const unsigned int input_size = + p.exp == 0 ? p.input_size * NR_DPUS : p.input_size; + const unsigned int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned + const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.) + const unsigned int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ? 
roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned + + // Input/output allocation + A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + B = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + C = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + C2 = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + T *bufferA = A; + T *bufferB = B; + T *bufferC = C2; + + // Create an input file with arbitrary data + read_input(A, B, input_size); + + // Loop over main kernel + for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { #if WITH_ALLOC_OVERHEAD - if(rep >= p.n_warmup) { - start(&timer, 0, 0); - } - DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - if(rep >= p.n_warmup) { - stop(&timer, 0); - } + if (rep >= p.n_warmup) { + start(&timer, 0, 0); + } + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + if (rep >= p.n_warmup) { + stop(&timer, 0); + } #endif #if WITH_DPUINFO - printf("DPUs:"); - DPU_FOREACH (dpu_set, dpu) { - int rank = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK; - int slice = dpu_get_slice_id(dpu_from_set(dpu)); - int member = dpu_get_member_id(dpu_from_set(dpu)); - printf(" %d(%d.%d)", rank, slice, member); - } - printf("\n"); + printf("DPUs:"); + DPU_FOREACH(dpu_set, dpu) { + int rank = + dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & + DPU_TARGET_MASK; + int slice = dpu_get_slice_id(dpu_from_set(dpu)); + int member = dpu_get_member_id(dpu_from_set(dpu)); + printf(" %d(%d.%d)", rank, slice, member); + } + printf("\n"); #endif #if WITH_LOAD_OVERHEAD - if(rep >= p.n_warmup) { - start(&timer, 1, 0); - } - DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); - if(rep >= p.n_warmup) { - stop(&timer, 1); - } - DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); - assert(nr_of_dpus == NR_DPUS); + if (rep >= p.n_warmup) { + start(&timer, 1, 0); + } + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + if (rep >= p.n_warmup) { + 
stop(&timer, 1); + } + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); + assert(nr_of_dpus == NR_DPUS); #endif - // int prev_rank_id = -1; - int rank_id = -1; - DPU_FOREACH (dpu_set, dpu) { - rank_id = dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & DPU_TARGET_MASK; - if ((numa_node_rank != -2) && numa_node_rank != dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu)))) { - numa_node_rank = -1; - } else { - numa_node_rank = dpu_get_rank_numa_node(dpu_get_rank(dpu_from_set(dpu))); - } - /* - if (rank_id != prev_rank_id) { - printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); - prev_rank_id = rank_id; - } - */ - } - - - // Compute output on CPU (performance comparison and verification purposes) - if(rep >= p.n_warmup) { - start(&timer, 2, 0); - } - vector_addition_host(C, A, B, input_size); - if(rep >= p.n_warmup) { - stop(&timer, 2); - } - - if(rep >= p.n_warmup) { - start(&timer, 3, 0); - } - // Input arguments - unsigned int kernel = 0; - dpu_arguments_t input_arguments[NR_DPUS]; - for(i=0; i<nr_of_dpus-1; i++) { - input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); - input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T); - input_arguments[i].kernel=kernel; - } - input_arguments[nr_of_dpus-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); - input_arguments[nr_of_dpus-1].transfer_size=input_size_dpu_8bytes * sizeof(T); - input_arguments[nr_of_dpus-1].kernel=kernel; - - // Copy input arrays - i = 0; - DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i])); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT)); - - DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i)); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * 
sizeof(T), DPU_XFER_DEFAULT)); - - DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu_8bytes * i)); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) { - stop(&timer, 3); - } - - // Run DPU kernel - if(rep >= p.n_warmup) { - start(&timer, 4, 0); - #if ENERGY - DPU_ASSERT(dpu_probe_start(&probe)); - #endif - } - DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); - if(rep >= p.n_warmup) { - stop(&timer, 4); - #if ENERGY - DPU_ASSERT(dpu_probe_stop(&probe)); - #endif - } - + // int prev_rank_id = -1; + int rank_id = -1; + DPU_FOREACH(dpu_set, dpu) { + rank_id = + dpu_get_rank_id(dpu_get_rank(dpu_from_set(dpu))) & + DPU_TARGET_MASK; + if ((numa_node_rank != -2) + && numa_node_rank != + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu)))) { + numa_node_rank = -1; + } else { + numa_node_rank = + dpu_get_rank_numa_node(dpu_get_rank + (dpu_from_set(dpu))); + } + /* + if (rank_id != prev_rank_id) { + printf("/dev/dpu_rank%d @ NUMA node %d\n", rank_id, numa_node_rank); + prev_rank_id = rank_id; + } + */ + } + + // Compute output on CPU (performance comparison and verification purposes) + if (rep >= p.n_warmup) { + start(&timer, 2, 0); + } + vector_addition_host(C, A, B, input_size); + if (rep >= p.n_warmup) { + stop(&timer, 2); + } + + if (rep >= p.n_warmup) { + start(&timer, 3, 0); + } + // Input arguments + unsigned int kernel = 0; + dpu_arguments_t input_arguments[NR_DPUS]; + for (i = 0; i < nr_of_dpus - 1; i++) { + input_arguments[i].size = + input_size_dpu_8bytes * sizeof(T); + input_arguments[i].transfer_size = + input_size_dpu_8bytes * sizeof(T); + input_arguments[i].kernel = kernel; + } + input_arguments[nr_of_dpus - 1].size = + (input_size_8bytes - + input_size_dpu_8bytes * (NR_DPUS - 1)) * sizeof(T); + input_arguments[nr_of_dpus - 1].transfer_size = + input_size_dpu_8bytes 
* sizeof(T); + input_arguments[nr_of_dpus - 1].kernel = kernel; + + // Copy input arrays + i = 0; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i])); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, + sizeof(input_arguments[0]), DPU_XFER_DEFAULT)); + + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer + (dpu, bufferA + input_size_dpu_8bytes * i)); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, 0, + input_size_dpu_8bytes * sizeof(T), + DPU_XFER_DEFAULT)); + + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer + (dpu, bufferB + input_size_dpu_8bytes * i)); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_TO_DPU, + DPU_MRAM_HEAP_POINTER_NAME, + input_size_dpu_8bytes * sizeof(T), + input_size_dpu_8bytes * sizeof(T), + DPU_XFER_DEFAULT)); + if (rep >= p.n_warmup) { + stop(&timer, 3); + } + // Run DPU kernel + if (rep >= p.n_warmup) { + start(&timer, 4, 0); +#if ENERGY + DPU_ASSERT(dpu_probe_start(&probe)); +#endif + } + DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); + if (rep >= p.n_warmup) { + stop(&timer, 4); +#if ENERGY + DPU_ASSERT(dpu_probe_stop(&probe)); +#endif + } #if PRINT - { - unsigned int each_dpu = 0; - printf("Display DPU Logs\n"); - DPU_FOREACH (dpu_set, dpu) { - printf("DPU#%d:\n", each_dpu); - DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout)); - each_dpu++; - } - } + { + unsigned int each_dpu = 0; + printf("Display DPU Logs\n"); + DPU_FOREACH(dpu_set, dpu) { + printf("DPU#%d:\n", each_dpu); + DPU_ASSERT(dpulog_read_for_dpu + (dpu.dpu, stdout)); + each_dpu++; + } + } #endif - if(rep >= p.n_warmup) { - start(&timer, 5, 0); - } - i = 0; - // PARALLEL RETRIEVE TRANSFER - DPU_FOREACH(dpu_set, dpu, i) { - DPU_ASSERT(dpu_prepare_xfer(dpu, bufferC + input_size_dpu_8bytes * i)); - } - DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes 
* sizeof(T), DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) { - stop(&timer, 5); - } - + if (rep >= p.n_warmup) { + start(&timer, 5, 0); + } + i = 0; + // PARALLEL RETRIEVE TRANSFER + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer + (dpu, bufferC + input_size_dpu_8bytes * i)); + } + DPU_ASSERT(dpu_push_xfer + (dpu_set, DPU_XFER_FROM_DPU, + DPU_MRAM_HEAP_POINTER_NAME, + input_size_dpu_8bytes * sizeof(T), + input_size_dpu_8bytes * sizeof(T), + DPU_XFER_DEFAULT)); + if (rep >= p.n_warmup) { + stop(&timer, 5); + } #if WITH_ALLOC_OVERHEAD #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { - start(&timer, 6, 0); - } + if (rep >= p.n_warmup) { + start(&timer, 6, 0); + } #endif - DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_free(dpu_set)); #if WITH_FREE_OVERHEAD - if(rep >= p.n_warmup) { - stop(&timer, 6); - } + if (rep >= p.n_warmup) { + stop(&timer, 6); + } #endif #endif - // Check output - bool status = true; - for (i = 0; i < input_size; i++) { - if(C[i] != bufferC[i]){ - status = false; + // Check output + bool status = true; + for (i = 0; i < input_size; i++) { + if (C[i] != bufferC[i]) { + status = false; #if PRINT - printf("%d: %u -- %u\n", i, C[i], bufferC[i]); + printf("%d: %u -- %u\n", i, C[i], bufferC[i]); #endif - } - } - if (status) { - printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); - if (rep >= p.n_warmup) { - printf("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d", - nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size, input_size / NR_DPUS); - printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", - WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", - timer.time[0], - timer.time[1], - timer.time[2], - 
timer.time[3], - timer.time[4], - timer.time[5], - timer.time[6]); - printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", - input_size * 3 * sizeof(T) / timer.time[2], - input_size * 3 * sizeof(T) / (timer.time[4]), - input_size * 3 * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); - printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", - input_size * 3 * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]), - input_size * 3 * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), - input_size * 3 * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); - printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", - input_size / timer.time[2], - input_size / (timer.time[4]), - input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); - printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", - input_size / (timer.time[3] + timer.time[4] + timer.time[5]), - input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), - input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); - } - } else { - printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); - } - } + } + } + if (status) { + printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET + "] Outputs are equal\n"); + if (rep >= p.n_warmup) { + printf + ("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d", + nr_of_dpus, nr_of_ranks, NR_TASKLETS, + XSTR(T), BLOCK_SIZE, input_size, + input_size / NR_DPUS); + printf + (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", + WITH_ALLOC_OVERHEAD, 
WITH_LOAD_OVERHEAD, + WITH_FREE_OVERHEAD, numa_node_rank); + printf + ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", + timer.time[0], timer.time[1], + timer.time[2], timer.time[3], + timer.time[4], timer.time[5], + timer.time[6]); + printf + (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", + input_size * 3 * sizeof(T) / timer.time[2], + input_size * 3 * sizeof(T) / + (timer.time[4]), + input_size * 3 * sizeof(T) / + (timer.time[0] + timer.time[1] + + timer.time[3] + timer.time[4] + + timer.time[5] + timer.time[6])); + printf + (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", + input_size * 3 * sizeof(T) / + (timer.time[3] + timer.time[4] + + timer.time[5]), + input_size * 3 * sizeof(T) / + (timer.time[1] + timer.time[3] + + timer.time[4] + timer.time[5]), + input_size * 3 * sizeof(T) / + (timer.time[0] + timer.time[1] + + timer.time[3] + timer.time[4] + + timer.time[5])); + printf + (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", + input_size / timer.time[2], + input_size / (timer.time[4]), + input_size / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5] + + timer.time[6])); + printf + (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", + input_size / (timer.time[3] + + timer.time[4] + + timer.time[5]), + input_size / (timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5]), + input_size / (timer.time[0] + + timer.time[1] + + timer.time[3] + + timer.time[4] + + timer.time[5])); + } + } else { + printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET + "] Outputs differ!\n"); + } + } #if ENERGY - double energy; - DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); - printf("DPU Energy (J): %f\t", energy); -#endif - + double energy; + 
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); + printf("DPU Energy (J): %f\t", energy); +#endif - // Deallocation - free(A); - free(B); - free(C); - free(C2); + // Deallocation + free(A); + free(B); + free(C); + free(C2); #if !WITH_ALLOC_OVERHEAD - DPU_ASSERT(dpu_free(dpu_set)); + DPU_ASSERT(dpu_free(dpu_set)); #endif - - return 0; + + return 0; } diff --git a/VA/support/common.h b/VA/support/common.h index c1043fd..cee09e2 100755 --- a/VA/support/common.h +++ b/VA/support/common.h @@ -3,11 +3,11 @@ // Structures used by both the host and the dpu to communicate information typedef struct { - uint32_t size; - uint32_t transfer_size; + uint32_t size; + uint32_t transfer_size; enum kernels { - kernel1 = 0, - nr_kernels = 1, + kernel1 = 0, + nr_kernels = 1, } kernel; } dpu_arguments_t; @@ -24,34 +24,34 @@ typedef struct { // Data type #ifdef UINT32 #define T uint32_t -#define DIV 2 // Shift right to divide by sizeof(T) +#define DIV 2 // Shift right to divide by sizeof(T) #elif UINT64 #define T uint64_t -#define DIV 3 // Shift right to divide by sizeof(T) +#define DIV 3 // Shift right to divide by sizeof(T) #elif INT32 #define T int32_t -#define DIV 2 // Shift right to divide by sizeof(T) +#define DIV 2 // Shift right to divide by sizeof(T) #elif INT64 #define T int64_t -#define DIV 3 // Shift right to divide by sizeof(T) +#define DIV 3 // Shift right to divide by sizeof(T) #elif FLOAT #define T float -#define DIV 2 // Shift right to divide by sizeof(T) +#define DIV 2 // Shift right to divide by sizeof(T) #elif DOUBLE #define T double -#define DIV 3 // Shift right to divide by sizeof(T) +#define DIV 3 // Shift right to divide by sizeof(T) #elif CHAR #define T char -#define DIV 0 // Shift right to divide by sizeof(T) +#define DIV 0 // Shift right to divide by sizeof(T) #elif SHORT #define T short -#define DIV 1 // Shift right to divide by sizeof(T) +#define DIV 1 // Shift right to divide by sizeof(T) #endif #ifndef ENERGY #define ENERGY 0 #endif 
-#define PRINT 0 +#define PRINT 0 #define ANSI_COLOR_RED "\x1b[31m" #define ANSI_COLOR_GREEN "\x1b[32m" diff --git a/VA/support/params.h b/VA/support/params.h index 8bd71a6..47c10ef 100644 --- a/VA/support/params.h +++ b/VA/support/params.h @@ -4,53 +4,62 @@ #include "common.h" typedef struct Params { - unsigned int input_size; - int n_warmup; - int n_reps; - int exp; -}Params; + unsigned int input_size; + int n_warmup; + int n_reps; + int exp; +} Params; -static void usage() { - fprintf(stderr, - "\nUsage: ./program [options]" - "\n" - "\nGeneral options:" - "\n -h help" - "\n -w <W> # of untimed warmup iterations (default=1)" - "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1) scaling (default=0)" - "\n" - "\nBenchmark-specific options:" - "\n -i <I> input size (default=2621440 elements)" - "\n"); +static void usage() +{ + fprintf(stderr, + "\nUsage: ./program [options]" + "\n" + "\nGeneral options:" + "\n -h help" + "\n -w <W> # of untimed warmup iterations (default=1)" + "\n -e <E> # of timed repetition iterations (default=3)" + "\n -x <X> Weak (0) or strong (1) scaling (default=0)" + "\n" + "\nBenchmark-specific options:" + "\n -i <I> input size (default=2621440 elements)" "\n"); } -struct Params input_params(int argc, char **argv) { - struct Params p; - p.input_size = 2621440; - p.n_warmup = 1; - p.n_reps = 3; - p.exp = 0; +struct Params input_params(int argc, char **argv) +{ + struct Params p; + p.input_size = 2621440; + p.n_warmup = 1; + p.n_reps = 3; + p.exp = 0; - int opt; - while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { - switch(opt) { - case 'h': - usage(); - exit(0); - break; - case 'i': p.input_size = atoi(optarg); break; - case 'w': p.n_warmup = atoi(optarg); break; - case 'e': p.n_reps = atoi(optarg); break; - case 'x': p.exp = atoi(optarg); break; - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - assert(NR_DPUS > 0 && "Invalid # of dpus!"); + int opt; + while 
((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { + switch (opt) { + case 'h': + usage(); + exit(0); + break; + case 'i': + p.input_size = atoi(optarg); + break; + case 'w': + p.n_warmup = atoi(optarg); + break; + case 'e': + p.n_reps = atoi(optarg); + break; + case 'x': + p.exp = atoi(optarg); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + assert(NR_DPUS > 0 && "Invalid # of dpus!"); - return p; + return p; } #endif diff --git a/VA/support/timer.h b/VA/support/timer.h index 4d597b9..df68334 100755 --- a/VA/support/timer.h +++ b/VA/support/timer.h @@ -1,66 +1,74 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
+/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + */ + +#include <sys/time.h> + +typedef struct Timer { + + struct timeval startTime[7]; + struct timeval stopTime[7]; + double time[7]; + +} Timer; + +void start(Timer *timer, int i, int rep) +{ + if (rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) +{ + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += + (timer->stopTime[i].tv_sec - + timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); +} + +void print(Timer *timer, int i, int REP) +{ + printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); +} + +void printall(Timer *timer, int maxt) +{ + for (int i = 0; i <= maxt; i++) { + printf(" timer%d_us=%f", i, timer->time[i]); + } + printf("\n"); +} diff --git a/perf-events.txt b/perf-events.txt new file mode 100644 index 0000000..ab57ab2 --- /dev/null +++ b/perf-events.txt @@ -0,0 +1,44 @@ +cache-misses # NMPO +cache-references + +cpu-cycles # NMPO +instructions # NMPO + +page-faults + +mem-loads +mem-loads-aux +mem-stores + +branch-misses # NMPO +branch-instructions # NMPO +branch-load-misses # NMPO +branch-loads # NMPO + +l1d_pend_miss.pending # mccalpin2023hpc <https://link.springer.com/chapter/10.1007/978-3-031-40843-4_30> +l1d_pend_miss.pending_cycles + +offcore_requests.all_requests +offcore_requests.data_rd +offcore_requests.demand_data_rd + +offcore_requests_outstanding.data_rd # mccalpin2023hpc +offcore_requests_outstanding.cycles_with_data_rd +offcore_requests_outstanding.cycles_with_demand_data_rd +offcore_requests_outstanding.demand_data_rd # mccalpin2023hpc + +L1-dcache-loads # NMPO +L1-dcache-load-misses # NMPO +L1-dcache-stores # NMPO +L1-icache-load-misses # NMPO + +LLC-loads # NMPO +LLC-load-misses +LLC-stores # NMPO +LLC-store-misses # NMPO + +l2_lines_out.useless_hwpf +l2_lines_out.non_silent +l2_lines_out.silent +l2_request.all +l2_request.miss |