summaryrefslogtreecommitdiff
path: root/BFS/host
diff options
context:
space:
mode:
Diffstat (limited to 'BFS/host')
-rw-r--r--BFS/host/app.c724
-rw-r--r--BFS/host/mram-management.h46
2 files changed, 453 insertions, 317 deletions
diff --git a/BFS/host/app.c b/BFS/host/app.c
index 54b9cdc..9ba7ffb 100644
--- a/BFS/host/app.c
+++ b/BFS/host/app.c
@@ -30,305 +30,429 @@
#define DPU_BINARY "./bin/dpu_code"
// Main of the Host Application
-int main(int argc, char** argv) {
-
- // Process parameters
- struct Params p = input_params(argc, argv);
-
- // Timer and profiling
- Timer timer;
- #if ENERGY
- struct dpu_probe_t probe;
- DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
- double tenergy=0;
- #endif
-
- // Allocate DPUs and load binary
- struct dpu_set_t dpu_set, dpu;
- uint32_t numDPUs;
- DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
- DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
- DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
- PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
-
- // Initialize BFS data structures
- PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
- struct COOGraph cooGraph = readCOOGraph(p.fileName);
- PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
- struct CSRGraph csrGraph = coo2csr(cooGraph);
- uint32_t numNodes = csrGraph.numNodes;
- uint32_t* nodePtrs = csrGraph.nodePtrs;
- uint32_t* neighborIdxs = csrGraph.neighborIdxs;
- uint32_t* nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
- uint64_t* visited = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
- uint64_t* currentFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
- uint64_t* nextFrontier = calloc(numNodes/64, sizeof(uint64_t)); // Bit vector with one bit per node
- setBit(nextFrontier[0], 0); // Initialize frontier to first node
- uint32_t level = 1;
-
- // Partition data structure across DPUs
- uint32_t numNodesPerDPU = ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1)/numDPUs + 1);
- PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU", numNodesPerDPU);
- struct DPUParams dpuParams[numDPUs];
- uint32_t dpuParams_m[numDPUs];
- unsigned int dpuIdx = 0;
- unsigned int t0ini = 0;
- unsigned int t1ini = 0;
- unsigned int t2ini = 0;
- unsigned int t3ini = 0;
- DPU_FOREACH (dpu_set, dpu) {
-
- // Allocate parameters
- struct mram_heap_allocator_t allocator;
- init_allocator(&allocator);
- dpuParams_m[dpuIdx] = mram_heap_alloc(&allocator, sizeof(struct DPUParams));
-
- // Find DPU's nodes
- uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
- uint32_t dpuNumNodes;
- if(dpuStartNodeIdx > numNodes) {
- dpuNumNodes = 0;
- } else if(dpuStartNodeIdx + numNodesPerDPU > numNodes) {
- dpuNumNodes = numNodes - dpuStartNodeIdx;
- } else {
- dpuNumNodes = numNodesPerDPU;
- }
- dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
- PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
- PRINT_INFO(p.verbosity >= 2, " Receives %u nodes", dpuNumNodes);
-
- // Partition edges and copy data
- if(dpuNumNodes > 0) {
-
- // Find DPU's CSR graph partition
- uint32_t* dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
- uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
- uint32_t* dpuNeighborIdxs_h = neighborIdxs + dpuNodePtrsOffset;
- uint32_t dpuNumNeighbors = dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
- uint32_t* dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
-
- // Allocate MRAM
- uint32_t dpuNodePtrs_m = mram_heap_alloc(&allocator, (dpuNumNodes + 1)*sizeof(uint32_t));
- uint32_t dpuNeighborIdxs_m = mram_heap_alloc(&allocator, dpuNumNeighbors*sizeof(uint32_t));
- uint32_t dpuNodeLevel_m = mram_heap_alloc(&allocator, dpuNumNodes*sizeof(uint32_t));
- uint32_t dpuVisited_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
- uint32_t dpuCurrentFrontier_m = mram_heap_alloc(&allocator, dpuNumNodes/64*sizeof(uint64_t));
- uint32_t dpuNextFrontier_m = mram_heap_alloc(&allocator, numNodes/64*sizeof(uint64_t));
- PRINT_INFO(p.verbosity >= 2, " Total memory allocated is %d bytes", allocator.totalAllocated);
-
- // Set up DPU parameters
- dpuParams[dpuIdx].numNodes = numNodes;
- dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
- dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
- dpuParams[dpuIdx].level = level;
- dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
- dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
- dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
- dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
- dpuParams[dpuIdx].dpuCurrentFrontier_m = dpuCurrentFrontier_m;
- dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
-
- // Send data to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying data to DPU");
- startTimer(&timer, 0, t0ini++);
- copyToDPU(dpu, (uint8_t*)dpuNodePtrs_h, dpuNodePtrs_m, (dpuNumNodes + 1)*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)dpuNeighborIdxs_h, dpuNeighborIdxs_m, dpuNumNeighbors*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)dpuNodeLevel_h, dpuNodeLevel_m, dpuNumNodes*sizeof(uint32_t));
- copyToDPU(dpu, (uint8_t*)visited, dpuVisited_m, numNodes/64*sizeof(uint64_t));
- copyToDPU(dpu, (uint8_t*)nextFrontier, dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
- // NOTE: No need to copy current frontier because it is written before being read
- stopTimer(&timer, 0);
- //loadTime += getElapsedTime(timer);
-
- }
-
- // Send parameters to DPU
- PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU");
- startTimer(&timer, 1, t1ini++);
- copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
- stopTimer(&timer, 1);
- //loadTime += getElapsedTime(timer);
-
- ++dpuIdx;
-
- }
-
- // Iterate until next frontier is empty
- uint32_t nextFrontierEmpty = 0;
- while(!nextFrontierEmpty) {
-
- PRINT_INFO(p.verbosity >= 1, "Processing current frontier for level %u", level);
-
- #if ENERGY
- DPU_ASSERT(dpu_probe_start(&probe));
- #endif
- // Run all DPUs
- PRINT_INFO(p.verbosity >= 1, " Booting DPUs");
- startTimer(&timer, 2, t2ini++);
- DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
- stopTimer(&timer, 2);
- //dpuTime += getElapsedTime(timer);
- #if ENERGY
- DPU_ASSERT(dpu_probe_stop(&probe));
- double energy;
- DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
- tenergy += energy;
- #endif
-
-
-
- // Copy back next frontier from all DPUs and compute their union as the current frontier
- startTimer(&timer, 3, t3ini++);
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
- if(dpuNumNodes > 0) {
- if(dpuIdx == 0) {
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)currentFrontier, numNodes/64*sizeof(uint64_t));
- } else {
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuNextFrontier_m, (uint8_t*)nextFrontier, numNodes/64*sizeof(uint64_t));
- for(uint32_t i = 0; i < numNodes/64; ++i) {
- currentFrontier[i] |= nextFrontier[i];
- }
- }
- ++dpuIdx;
- }
- }
-
- // Check if the next frontier is empty, and copy data to DPU if not empty
- nextFrontierEmpty = 1;
- for(uint32_t i = 0; i < numNodes/64; ++i) {
- if(currentFrontier[i]) {
- nextFrontierEmpty = 0;
- break;
- }
- }
- if(!nextFrontierEmpty) {
- ++level;
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
- if(dpuNumNodes > 0) {
- // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
- copyToDPU(dpu, (uint8_t*)currentFrontier, dpuParams[dpuIdx].dpuNextFrontier_m, numNodes/64*sizeof(uint64_t));
- // Copy new level to DPU
- dpuParams[dpuIdx].level = level;
- copyToDPU(dpu, (uint8_t*)&dpuParams[dpuIdx], dpuParams_m[dpuIdx], sizeof(struct DPUParams));
- ++dpuIdx;
- }
- }
- }
- stopTimer(&timer, 3);
- //hostTime += getElapsedTime(timer);
-
- }
-
- // Copy back node levels
- PRINT_INFO(p.verbosity >= 1, "Copying back the result");
- startTimer(&timer, 4, 0);
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
- if(dpuNumNodes > 0) {
- uint32_t dpuStartNodeIdx = dpuIdx*numNodesPerDPU;
- copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m, (uint8_t*)(nodeLevel + dpuStartNodeIdx), dpuNumNodes*sizeof(float));
- }
- ++dpuIdx;
- }
- stopTimer(&timer, 4);
- //retrieveTime += getElapsedTime(timer);
- //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
-
- // Calculating result on CPU
- PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
- uint32_t* nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
- memset(nextFrontier, 0, numNodes/64*sizeof(uint64_t));
- setBit(nextFrontier[0], 0); // Initialize frontier to first node
- nextFrontierEmpty = 0;
- level = 1;
- while(!nextFrontierEmpty) {
- // Update current frontier and visited list based on the next frontier from the previous iteration
- for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
- uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
- currentFrontier[nodeTileIdx] = nextFrontierTile;
- if(nextFrontierTile) {
- visited[nodeTileIdx] |= nextFrontierTile;
- nextFrontier[nodeTileIdx] = 0;
- for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
- if(isSet(nextFrontierTile, node%64)) {
- nodeLevelReference[node] = level;
- }
- }
- }
- }
- // Visit neighbors of the current frontier
- nextFrontierEmpty = 1;
- for(uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes/64; ++nodeTileIdx) {
- uint64_t currentFrontierTile = currentFrontier[nodeTileIdx];
- if(currentFrontierTile) {
- for(uint32_t node = nodeTileIdx*64; node < (nodeTileIdx + 1)*64; ++node) {
- if(isSet(currentFrontierTile, node%64)) { // If the node is in the current frontier
- // Visit its neighbors
- uint32_t nodePtr = nodePtrs[node];
- uint32_t nextNodePtr = nodePtrs[node + 1];
- for(uint32_t i = nodePtr; i < nextNodePtr; ++i) {
- uint32_t neighbor = neighborIdxs[i];
- if(!isSet(visited[neighbor/64], neighbor%64)) { // Neighbor not previously visited
- // Add neighbor to next frontier
- setBit(nextFrontier[neighbor/64], neighbor%64);
- nextFrontierEmpty = 0;
- }
- }
- }
- }
- }
- }
- ++level;
- }
-
- // Verify the result
- PRINT_INFO(p.verbosity >= 1, "Verifying the result");
- int status = 1;
- for(uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
- if(nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
- PRINT_ERROR("Mismatch at node %u (CPU result = level %u, DPU result = level %u)", nodeIdx, nodeLevelReference[nodeIdx], nodeLevel[nodeIdx]);
- status = 0;
- }
- }
-
- if (status) {
- printf("[::] BFS NMC | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d "
- "| throughput_pim_MBps=%f throughput_MBps=%f",
- numDPUs, NR_TASKLETS, "uint32_t", numNodes,
- numNodes * sizeof(uint32_t) / (timer.time[2] + timer.time[3]),
- numNodes * sizeof(uint32_t) / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
- printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
- numNodes / (timer.time[2] + timer.time[3]),
- numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4]));
- printAll(&timer, 4);
- }
-
- // Display DPU Logs
- if(p.verbosity >= 2) {
- PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
- dpuIdx = 0;
- DPU_FOREACH (dpu_set, dpu) {
- PRINT("DPU %u:", dpuIdx);
- DPU_ASSERT(dpu_log_read(dpu, stdout));
- ++dpuIdx;
- }
- }
-
- // Deallocate data structures
- freeCOOGraph(cooGraph);
- freeCSRGraph(csrGraph);
- free(nodeLevel);
- free(visited);
- free(currentFrontier);
- free(nextFrontier);
- free(nodeLevelReference);
-
- return 0;
+int main(int argc, char **argv)
+{
-}
+ // Process parameters
+ struct Params p = input_params(argc, argv);
+
+ // Timer and profiling
+ Timer timer;
+#if ENERGY
+ struct dpu_probe_t probe;
+ DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
+ double tenergy = 0;
+#endif
+
+ printf
+ ("WITH_ALLOC_OVERHEAD=%d WITH_LOAD_OVERHEAD=%d WITH_FREE_OVERHEAD=%d\n",
+ WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD);
+
+ // Allocate DPUs and load binary
+ struct dpu_set_t dpu_set, dpu;
+ uint32_t numDPUs, numRanks;
+
+#if WITH_ALLOC_OVERHEAD
+ startTimer(&timer, 0, 0);
+#endif
+ DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
+#if WITH_ALLOC_OVERHEAD
+ stopTimer(&timer, 0);
+#else
+ timer.time[0] = 0;
+#endif
+
+#if WITH_LOAD_OVERHEAD
+ startTimer(&timer, 1, 0);
+#endif
+ DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
+#if WITH_LOAD_OVERHEAD
+ stopTimer(&timer, 0);
+#else
+ timer.time[1] = 0;
+#endif
+ DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs));
+ DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &numRanks));
+ assert(NR_DPUS == numDPUs);
+ PRINT_INFO(p.verbosity >= 1, "Allocated %d DPU(s)", numDPUs);
+
+ // Initialize BFS data structures
+ PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
+ struct COOGraph cooGraph = readCOOGraph(p.fileName);
+ PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges",
+ cooGraph.numNodes, cooGraph.numEdges);
+ struct CSRGraph csrGraph = coo2csr(cooGraph);
+ uint32_t numNodes = csrGraph.numNodes;
+ uint32_t *nodePtrs = csrGraph.nodePtrs;
+ uint32_t *neighborIdxs = csrGraph.neighborIdxs;
+ uint32_t *nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
+ uint64_t *visited = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node
+ uint64_t *currentFrontier = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node
+ uint64_t *nextFrontier = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node
+ setBit(nextFrontier[0], 0); // Initialize frontier to first node
+ uint32_t level = 1;
+
+ // Partition data structure across DPUs
+ uint32_t numNodesPerDPU =
+ ROUND_UP_TO_MULTIPLE_OF_64((numNodes - 1) / numDPUs + 1);
+ PRINT_INFO(p.verbosity >= 1, "Assigning %u nodes per DPU",
+ numNodesPerDPU);
+ struct DPUParams dpuParams[numDPUs];
+ uint32_t dpuParams_m[numDPUs];
+ unsigned int dpuIdx = 0;
+ unsigned int t0ini = 0;
+ unsigned int t1ini = 0;
+ unsigned int t2ini = 0;
+ unsigned int t3ini = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+
+ // Allocate parameters
+ struct mram_heap_allocator_t allocator;
+ init_allocator(&allocator);
+ dpuParams_m[dpuIdx] =
+ mram_heap_alloc(&allocator, sizeof(struct DPUParams));
+
+ // Find DPU's nodes
+ uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+ uint32_t dpuNumNodes;
+ if (dpuStartNodeIdx > numNodes) {
+ dpuNumNodes = 0;
+ } else if (dpuStartNodeIdx + numNodesPerDPU > numNodes) {
+ dpuNumNodes = numNodes - dpuStartNodeIdx;
+ } else {
+ dpuNumNodes = numNodesPerDPU;
+ }
+ dpuParams[dpuIdx].dpuNumNodes = dpuNumNodes;
+ PRINT_INFO(p.verbosity >= 2, " DPU %u:", dpuIdx);
+ PRINT_INFO(p.verbosity >= 2, " Receives %u nodes",
+ dpuNumNodes);
+
+ // Partition edges and copy data
+ if (dpuNumNodes > 0) {
+
+ // Find DPU's CSR graph partition
+ uint32_t *dpuNodePtrs_h = &nodePtrs[dpuStartNodeIdx];
+ uint32_t dpuNodePtrsOffset = dpuNodePtrs_h[0];
+ uint32_t *dpuNeighborIdxs_h =
+ neighborIdxs + dpuNodePtrsOffset;
+ uint32_t dpuNumNeighbors =
+ dpuNodePtrs_h[dpuNumNodes] - dpuNodePtrsOffset;
+ uint32_t *dpuNodeLevel_h = &nodeLevel[dpuStartNodeIdx];
+
+ // Allocate MRAM
+ uint32_t dpuNodePtrs_m =
+ mram_heap_alloc(&allocator,
+ (dpuNumNodes +
+ 1) * sizeof(uint32_t));
+ uint32_t dpuNeighborIdxs_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNeighbors * sizeof(uint32_t));
+ uint32_t dpuNodeLevel_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNodes * sizeof(uint32_t));
+ uint32_t dpuVisited_m =
+ mram_heap_alloc(&allocator,
+ numNodes / 64 * sizeof(uint64_t));
+ uint32_t dpuCurrentFrontier_m =
+ mram_heap_alloc(&allocator,
+ dpuNumNodes / 64 *
+ sizeof(uint64_t));
+ uint32_t dpuNextFrontier_m =
+ mram_heap_alloc(&allocator,
+ numNodes / 64 * sizeof(uint64_t));
+ PRINT_INFO(p.verbosity >= 2,
+ " Total memory allocated is %d bytes",
+ allocator.totalAllocated);
+
+ // Set up DPU parameters
+ dpuParams[dpuIdx].numNodes = numNodes;
+ dpuParams[dpuIdx].dpuStartNodeIdx = dpuStartNodeIdx;
+ dpuParams[dpuIdx].dpuNodePtrsOffset = dpuNodePtrsOffset;
+ dpuParams[dpuIdx].level = level;
+ dpuParams[dpuIdx].dpuNodePtrs_m = dpuNodePtrs_m;
+ dpuParams[dpuIdx].dpuNeighborIdxs_m = dpuNeighborIdxs_m;
+ dpuParams[dpuIdx].dpuNodeLevel_m = dpuNodeLevel_m;
+ dpuParams[dpuIdx].dpuVisited_m = dpuVisited_m;
+ dpuParams[dpuIdx].dpuCurrentFrontier_m =
+ dpuCurrentFrontier_m;
+ dpuParams[dpuIdx].dpuNextFrontier_m = dpuNextFrontier_m;
+
+ // Send data to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying data to DPU");
+ startTimer(&timer, 2, t0ini++);
+ copyToDPU(dpu, (uint8_t *) dpuNodePtrs_h, dpuNodePtrs_m,
+ (dpuNumNodes + 1) * sizeof(uint32_t));
+ copyToDPU(dpu, (uint8_t *) dpuNeighborIdxs_h,
+ dpuNeighborIdxs_m,
+ dpuNumNeighbors * sizeof(uint32_t));
+ copyToDPU(dpu, (uint8_t *) dpuNodeLevel_h,
+ dpuNodeLevel_m,
+ dpuNumNodes * sizeof(uint32_t));
+ copyToDPU(dpu, (uint8_t *) visited, dpuVisited_m,
+ numNodes / 64 * sizeof(uint64_t));
+ copyToDPU(dpu, (uint8_t *) nextFrontier,
+ dpuNextFrontier_m,
+ numNodes / 64 * sizeof(uint64_t));
+ // NOTE: No need to copy current frontier because it is written before being read
+ stopTimer(&timer, 2);
+ //loadTime += getElapsedTime(timer);
+
+ }
+ // Send parameters to DPU
+ PRINT_INFO(p.verbosity >= 2,
+ " Copying parameters to DPU");
+ startTimer(&timer, 2, t1ini++);
+ copyToDPU(dpu, (uint8_t *) & dpuParams[dpuIdx],
+ dpuParams_m[dpuIdx], sizeof(struct DPUParams));
+ stopTimer(&timer, 2);
+ //loadTime += getElapsedTime(timer);
+
+ ++dpuIdx;
+
+ }
+
+ // Iterate until next frontier is empty
+ uint32_t nextFrontierEmpty = 0;
+ while (!nextFrontierEmpty) {
+
+ PRINT_INFO(p.verbosity >= 1,
+ "Processing current frontier for level %u", level);
+
+#if ENERGY
+ DPU_ASSERT(dpu_probe_start(&probe));
+#endif
+ // Run all DPUs
+ PRINT_INFO(p.verbosity >= 1, " Booting DPUs");
+ startTimer(&timer, 3, t2ini++);
+ DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
+ stopTimer(&timer, 3);
+ //dpuTime += getElapsedTime(timer);
+#if ENERGY
+ DPU_ASSERT(dpu_probe_stop(&probe));
+ double energy;
+ DPU_ASSERT(dpu_probe_get
+ (&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
+ tenergy += energy;
+#endif
+
+ // Copy back next frontier from all DPUs and compute their union as the current frontier
+ startTimer(&timer, 4, t3ini++);
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+ if (dpuNumNodes > 0) {
+ if (dpuIdx == 0) {
+ copyFromDPU(dpu,
+ dpuParams[dpuIdx].
+ dpuNextFrontier_m,
+ (uint8_t *) currentFrontier,
+ numNodes / 64 *
+ sizeof(uint64_t));
+ } else {
+ copyFromDPU(dpu,
+ dpuParams[dpuIdx].
+ dpuNextFrontier_m,
+ (uint8_t *) nextFrontier,
+ numNodes / 64 *
+ sizeof(uint64_t));
+ for (uint32_t i = 0; i < numNodes / 64;
+ ++i) {
+ currentFrontier[i] |=
+ nextFrontier[i];
+ }
+ }
+ ++dpuIdx;
+ }
+ }
+
+ // Check if the next frontier is empty, and copy data to DPU if not empty
+ nextFrontierEmpty = 1;
+ for (uint32_t i = 0; i < numNodes / 64; ++i) {
+ if (currentFrontier[i]) {
+ nextFrontierEmpty = 0;
+ break;
+ }
+ }
+ if (!nextFrontierEmpty) {
+ ++level;
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ uint32_t dpuNumNodes =
+ dpuParams[dpuIdx].dpuNumNodes;
+ if (dpuNumNodes > 0) {
+ // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier)
+ copyToDPU(dpu,
+ (uint8_t *) currentFrontier,
+ dpuParams[dpuIdx].
+ dpuNextFrontier_m,
+ numNodes / 64 *
+ sizeof(uint64_t));
+ // Copy new level to DPU
+ dpuParams[dpuIdx].level = level;
+ copyToDPU(dpu,
+ (uint8_t *) &
+ dpuParams[dpuIdx],
+ dpuParams_m[dpuIdx],
+ sizeof(struct DPUParams));
+ ++dpuIdx;
+ }
+ }
+ }
+ stopTimer(&timer, 4);
+ //hostTime += getElapsedTime(timer);
+
+ }
+
+ // Copy back node levels
+ PRINT_INFO(p.verbosity >= 1, "Copying back the result");
+ startTimer(&timer, 5, 0);
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes;
+ if (dpuNumNodes > 0) {
+ uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU;
+ copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m,
+ (uint8_t *) (nodeLevel + dpuStartNodeIdx),
+ dpuNumNodes * sizeof(float));
+ }
+ ++dpuIdx;
+ }
+ stopTimer(&timer, 5);
+ //retrieveTime += getElapsedTime(timer);
+ //if(p.verbosity == 0) PRINT("CPU-DPU Time(ms): %f DPU Kernel Time (ms): %f Inter-DPU Time (ms): %f DPU-CPU Time (ms): %f", loadTime*1e3, dpuTime*1e3, hostTime*1e3, retrieveTime*1e3);
+
+ // Calculating result on CPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+ uint32_t *nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable)
+ memset(nextFrontier, 0, numNodes / 64 * sizeof(uint64_t));
+ setBit(nextFrontier[0], 0); // Initialize frontier to first node
+ nextFrontierEmpty = 0;
+ level = 1;
+ startTimer(&timer, 6, 0);
+ while (!nextFrontierEmpty) {
+ // Update current frontier and visited list based on the next frontier from the previous iteration
+ for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+ ++nodeTileIdx) {
+ uint64_t nextFrontierTile = nextFrontier[nodeTileIdx];
+ currentFrontier[nodeTileIdx] = nextFrontierTile;
+ if (nextFrontierTile) {
+ visited[nodeTileIdx] |= nextFrontierTile;
+ nextFrontier[nodeTileIdx] = 0;
+ for (uint32_t node = nodeTileIdx * 64;
+ node < (nodeTileIdx + 1) * 64; ++node) {
+ if (isSet(nextFrontierTile, node % 64)) {
+ nodeLevelReference[node] =
+ level;
+ }
+ }
+ }
+ }
+ // Visit neighbors of the current frontier
+ nextFrontierEmpty = 1;
+ for (uint32_t nodeTileIdx = 0; nodeTileIdx < numNodes / 64;
+ ++nodeTileIdx) {
+ uint64_t currentFrontierTile =
+ currentFrontier[nodeTileIdx];
+ if (currentFrontierTile) {
+ for (uint32_t node = nodeTileIdx * 64;
+ node < (nodeTileIdx + 1) * 64; ++node) {
+ if (isSet(currentFrontierTile, node % 64)) { // If the node is in the current frontier
+ // Visit its neighbors
+ uint32_t nodePtr =
+ nodePtrs[node];
+ uint32_t nextNodePtr =
+ nodePtrs[node + 1];
+ for (uint32_t i = nodePtr;
+ i < nextNodePtr; ++i) {
+ uint32_t neighbor =
+ neighborIdxs[i];
+ if (!isSet(visited[neighbor / 64], neighbor % 64)) { // Neighbor not previously visited
+ // Add neighbor to next frontier
+ setBit
+ (nextFrontier
+ [neighbor /
+ 64],
+ neighbor %
+ 64);
+ nextFrontierEmpty
+ = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ ++level;
+ }
+ stopTimer(&timer, 6);
+
+#if WITH_FREE_OVERHEAD
+ startTimer(&timer, 7);
+#endif
+ DPU_ASSERT(dpu_free(dpu_set));
+#if WITH_FREE_OVERHEAD
+ stopTimer(&timer, 7);
+#else
+ timer.time[7] = 0;
+#endif
+
+ // Verify the result
+ PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+ int status = 1;
+ for (uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx) {
+ if (nodeLevel[nodeIdx] != nodeLevelReference[nodeIdx]) {
+ PRINT_ERROR
+ ("Mismatch at node %u (CPU result = level %u, DPU result = level %u)",
+ nodeIdx, nodeLevelReference[nodeIdx],
+ nodeLevel[nodeIdx]);
+ status = 0;
+ }
+ }
+
+ if (status) {
+ printf
+ ("[::] BFS-UMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d "
+ "| throughput_pim_MBps=%f throughput_MBps=%f", numDPUs,
+ NR_TASKLETS, "uint32_t", numNodes,
+ numNodes * sizeof(uint32_t) / (timer.time[2] +
+ timer.time[3]),
+ numNodes * sizeof(uint32_t) / (timer.time[0] +
+ timer.time[1] +
+ timer.time[2] +
+ timer.time[3] +
+ timer.time[4]));
+ printf(" throughput_pim_MOpps=%f throughput_MOpps=%f",
+ numNodes / (timer.time[2] + timer.time[3]),
+ numNodes / (timer.time[0] + timer.time[1] +
+ timer.time[2] + timer.time[3] +
+ timer.time[4]));
+ printf
+ (" latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_cpu_us=%f latency_free_us=%f\n",
+ timer.time[0], timer.time[1], timer.time[2], timer.time[3],
+ timer.time[4], timer.time[5], timer.time[6],
+ timer.time[7]);
+ }
+ // Display DPU Logs
+ if (p.verbosity >= 2) {
+ PRINT_INFO(p.verbosity >= 2, "Displaying DPU Logs:");
+ dpuIdx = 0;
+ DPU_FOREACH(dpu_set, dpu) {
+ PRINT("DPU %u:", dpuIdx);
+ DPU_ASSERT(dpu_log_read(dpu, stdout));
+ ++dpuIdx;
+ }
+ }
+ // Deallocate data structures
+ freeCOOGraph(cooGraph);
+ freeCSRGraph(csrGraph);
+ free(nodeLevel);
+ free(visited);
+ free(currentFrontier);
+ free(nextFrontier);
+ free(nodeLevelReference);
+
+ return 0;
+
+}
diff --git a/BFS/host/mram-management.h b/BFS/host/mram-management.h
index 627dfde..f2ee031 100644
--- a/BFS/host/mram-management.h
+++ b/BFS/host/mram-management.h
@@ -5,33 +5,45 @@
#include "../support/common.h"
#include "../support/utils.h"
-#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
+#define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB
struct mram_heap_allocator_t {
- uint32_t totalAllocated;
+ uint32_t totalAllocated;
};
-static void init_allocator(struct mram_heap_allocator_t* allocator) {
- allocator->totalAllocated = 0;
+static void init_allocator(struct mram_heap_allocator_t *allocator)
+{
+ allocator->totalAllocated = 0;
}
-static uint32_t mram_heap_alloc(struct mram_heap_allocator_t* allocator, uint32_t size) {
- uint32_t ret = allocator->totalAllocated;
- allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
- if(allocator->totalAllocated > DPU_CAPACITY) {
- PRINT_ERROR(" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!", allocator->totalAllocated, DPU_CAPACITY);
- exit(0);
- }
- return ret;
+static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator,
+ uint32_t size)
+{
+ uint32_t ret = allocator->totalAllocated;
+ allocator->totalAllocated += ROUND_UP_TO_MULTIPLE_OF_8(size);
+ if (allocator->totalAllocated > DPU_CAPACITY) {
+ PRINT_ERROR
+ (" Total memory allocated is %d bytes which exceeds the DPU capacity (%d bytes)!",
+ allocator->totalAllocated, DPU_CAPACITY);
+ exit(0);
+ }
+ return ret;
}
-static void copyToDPU(struct dpu_set_t dpu, uint8_t* hostPtr, uint32_t mramIdx, uint32_t size) {
- DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyToDPU(struct dpu_set_t dpu, uint8_t *hostPtr, uint32_t mramIdx,
+ uint32_t size)
+{
+ DPU_ASSERT(dpu_copy_to
+ (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+ ROUND_UP_TO_MULTIPLE_OF_8(size)));
}
-static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, uint8_t* hostPtr, uint32_t size) {
- DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, ROUND_UP_TO_MULTIPLE_OF_8(size)));
+static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx,
+ uint8_t *hostPtr, uint32_t size)
+{
+ DPU_ASSERT(dpu_copy_from
+ (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr,
+ ROUND_UP_TO_MULTIPLE_OF_8(size)));
}
#endif
-