diff options
author | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
---|---|---|
committer | Juan Gomez Luna <juan.gomez@safari.ethz.ch> | 2021-06-16 19:46:05 +0200 |
commit | 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch) | |
tree | fc6776a94549d2d4039898f183dbbeb2ce013ba9 /BFS/baselines | |
parent | ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff) |
PrIM -- first commit
Diffstat (limited to 'BFS/baselines')
-rw-r--r-- | BFS/baselines/cpu/Makefile | 7 | ||||
-rw-r--r-- | BFS/baselines/cpu/README | 9 | ||||
-rw-r--r-- | BFS/baselines/cpu/app.c | 147 | ||||
-rw-r--r-- | BFS/baselines/gpu/Makefile | 5 | ||||
-rw-r--r-- | BFS/baselines/gpu/README | 9 | ||||
-rw-r--r-- | BFS/baselines/gpu/app.cu | 157 |
6 files changed, 334 insertions, 0 deletions
diff --git a/BFS/baselines/cpu/Makefile b/BFS/baselines/cpu/Makefile new file mode 100644 index 0000000..895d38b --- /dev/null +++ b/BFS/baselines/cpu/Makefile @@ -0,0 +1,7 @@ +all: + gcc -o bfs -fopenmp app.c + +clean: + rm bfs + + diff --git a/BFS/baselines/cpu/README b/BFS/baselines/cpu/README new file mode 100644 index 0000000..f2dfefa --- /dev/null +++ b/BFS/baselines/cpu/README @@ -0,0 +1,9 @@ +Breadth-First Search (BFS) + +Compilation instructions: + + make + +Execution instructions + + ./bfs -f ../../data/loc-gowalla_edges.txt diff --git a/BFS/baselines/cpu/app.c b/BFS/baselines/cpu/app.c new file mode 100644 index 0000000..f75a877 --- /dev/null +++ b/BFS/baselines/cpu/app.c @@ -0,0 +1,147 @@ + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdint.h> + +#include <omp.h> + +#include "../../support/common.h" +#include "../../support/graph.h" +#include "../../support/params.h" +#include "../../support/timer.h" +#include "../../support/utils.h" + +int main(int argc, char** argv) { + + // Process parameters + struct Params p = input_params(argc, argv); + + // Initialize BFS data structures + PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName); + struct COOGraph cooGraph = readCOOGraph(p.fileName); + PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges); + struct CSRGraph csrGraph = coo2csr(cooGraph); + uint32_t* nodeLevel = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + uint32_t* nodeLevelRef = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + for(uint32_t i = 0; i < csrGraph.numNodes; ++i) { + nodeLevel[i] = UINT32_MAX; // Unreachable + nodeLevelRef[i] = UINT32_MAX; // Unreachable + } + uint32_t srcNode = 0; + + // Initialize frontier double buffers + uint32_t* buffer1 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + uint32_t* buffer2 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + uint32_t* prevFrontier = buffer1; + uint32_t* currFrontier = buffer2; + + // Calculating result on CPU + PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU (OpenMP)"); + omp_set_num_threads(4); + Timer timer; + startTimer(&timer); + nodeLevel[srcNode] = 0; + prevFrontier[0] = srcNode; + uint32_t numPrevFrontier = 1; + for(uint32_t level = 1; numPrevFrontier > 0; ++level) { + + uint32_t numCurrFrontier = 0; + + // Visit nodes in the previous frontier + #pragma omp parallel for + for(uint32_t i = 0; i < numPrevFrontier; ++i) { + uint32_t node = prevFrontier[i]; + for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) { + uint32_t neighbor = csrGraph.neighborIdxs[edge]; + uint32_t justVisited = 0; + #pragma omp critical + { + if(nodeLevel[neighbor] == UINT32_MAX) { // Node not previously visited + nodeLevel[neighbor] = level; + justVisited = 1; + } + } + if(justVisited) { + uint32_t currFrontierIdx; + #pragma omp critical + { + currFrontierIdx = numCurrFrontier++; + } + currFrontier[currFrontierIdx] = neighbor; + } + } + } + + // Swap buffers + uint32_t* tmp = prevFrontier; + prevFrontier = currFrontier; + currFrontier = tmp; + numPrevFrontier = numCurrFrontier; + + } + stopTimer(&timer); + if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3); + PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3); + + // Calculating result on CPU sequentially + PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU (sequential)"); + startTimer(&timer); + nodeLevelRef[srcNode] = 0; + prevFrontier[0] = srcNode; + numPrevFrontier = 1; + for(uint32_t level = 1; numPrevFrontier > 0; ++level) { + + uint32_t numCurrFrontier = 0; + + // Visit nodes in the previous frontier + for(uint32_t i = 0; i < numPrevFrontier; ++i) { + uint32_t node = prevFrontier[i]; + for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) { + uint32_t neighbor = csrGraph.neighborIdxs[edge]; + uint32_t justVisited = 0; + if(nodeLevelRef[neighbor] == UINT32_MAX) { // Node not previously visited + nodeLevelRef[neighbor] = level; + justVisited = 1; + } + if(justVisited) { + uint32_t currFrontierIdx; + currFrontierIdx = numCurrFrontier++; + currFrontier[currFrontierIdx] = neighbor; + } + } + } + + // Swap buffers + uint32_t* tmp = prevFrontier; + prevFrontier = currFrontier; + currFrontier = tmp; + numPrevFrontier = numCurrFrontier; + + } + stopTimer(&timer); + if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3); + PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3); + + // Verifying result + PRINT_INFO(p.verbosity >= 1, "Verifying the result"); + for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) { + if(nodeLevel[nodeIdx] != nodeLevelRef[nodeIdx]) { + PRINT_ERROR("Mismatch at node %u (CPU sequential result = level %u, CPU parallel result = level %u)", nodeIdx, nodeLevelRef[nodeIdx], nodeLevel[nodeIdx]); + } + } + + + // Deallocate data structures + freeCOOGraph(cooGraph); + freeCSRGraph(csrGraph); + free(nodeLevel); + free(buffer1); + free(buffer2); + + return 0; + +} + diff --git a/BFS/baselines/gpu/Makefile b/BFS/baselines/gpu/Makefile new file mode 100644 index 0000000..7d9d953 --- /dev/null +++ b/BFS/baselines/gpu/Makefile @@ -0,0 +1,5 @@ +all: + /usr/local/cuda/bin/nvcc app.cu -I/usr/local/cuda/include -lm -o bfs + +clean: + rm bfs diff --git a/BFS/baselines/gpu/README b/BFS/baselines/gpu/README new file mode 100644 index 0000000..f2dfefa --- /dev/null +++ b/BFS/baselines/gpu/README @@ -0,0 +1,9 @@ +Breadth-First Search (BFS) + +Compilation instructions: + + make + +Execution instructions + + ./bfs -f ../../data/loc-gowalla_edges.txt diff --git a/BFS/baselines/gpu/app.cu b/BFS/baselines/gpu/app.cu new file mode 100644 index 0000000..e378b4c --- /dev/null +++ b/BFS/baselines/gpu/app.cu @@ -0,0 +1,157 @@ + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdint.h> + +#include "../../support/common.h" +#include "../../support/graph.h" +#include "../../support/params.h" +#include "../../support/timer.h" +#include "../../support/utils.h" + +__global__ void bfs_kernel(CSRGraph csrGraph, uint32_t* nodeLevel, uint32_t* prevFrontier, uint32_t* currFrontier, uint32_t numPrevFrontier, uint32_t* numCurrFrontier, uint32_t level) { + uint32_t i = blockIdx.x*blockDim.x + threadIdx.x; + if(i < numPrevFrontier) { + uint32_t node = prevFrontier[i]; + for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) { + uint32_t neighbor = csrGraph.neighborIdxs[edge]; + if(atomicCAS(&nodeLevel[neighbor], UINT32_MAX, level) == UINT32_MAX) { // Node not previously visited + uint32_t currFrontierIdx = atomicAdd(numCurrFrontier, 1); + currFrontier[currFrontierIdx] = neighbor; + } + } + } +} + +int main(int argc, char** argv) { + + // Process parameters + struct Params p = input_params(argc, argv); + + // Initialize BFS data structures + PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName); + struct COOGraph cooGraph = readCOOGraph(p.fileName); + PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges); + struct CSRGraph csrGraph = coo2csr(cooGraph); + uint32_t* nodeLevel_cpu = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + uint32_t* nodeLevel_gpu = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + for(uint32_t i = 0; i < csrGraph.numNodes; ++i) { + nodeLevel_cpu[i] = UINT32_MAX; // Unreachable + nodeLevel_gpu[i] = UINT32_MAX; // Unreachable + } + uint32_t srcNode = 0; + + // Allocate GPU memory + CSRGraph csrGraph_d; + csrGraph_d.numNodes = csrGraph.numNodes; + csrGraph_d.numEdges = csrGraph.numEdges; + cudaMalloc((void**) &csrGraph_d.nodePtrs, (csrGraph_d.numNodes + 1)*sizeof(uint32_t)); + cudaMalloc((void**) &csrGraph_d.neighborIdxs, csrGraph_d.numEdges*sizeof(uint32_t)); + uint32_t* nodeLevel_d; + cudaMalloc((void**) &nodeLevel_d, csrGraph_d.numNodes*sizeof(uint32_t)); + uint32_t* buffer1_d; + cudaMalloc((void**) &buffer1_d, csrGraph_d.numNodes*sizeof(uint32_t)); + uint32_t* buffer2_d; + cudaMalloc((void**) &buffer2_d, csrGraph_d.numNodes*sizeof(uint32_t)); + uint32_t* numCurrFrontier_d; + cudaMalloc((void**) &numCurrFrontier_d, sizeof(uint32_t)); + uint32_t* prevFrontier_d = buffer1_d; + uint32_t* currFrontier_d = buffer2_d; + + // Copy data to GPU + cudaMemcpy(csrGraph_d.nodePtrs, csrGraph.nodePtrs, (csrGraph_d.numNodes + 1)*sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(csrGraph_d.neighborIdxs, csrGraph.neighborIdxs, csrGraph_d.numEdges*sizeof(uint32_t), cudaMemcpyHostToDevice); + nodeLevel_gpu[srcNode] = 0; + cudaMemcpy(nodeLevel_d, nodeLevel_gpu, csrGraph_d.numNodes*sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(prevFrontier_d, &srcNode, sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + // Calculating result on GPU + PRINT_INFO(p.verbosity >= 1, "Calculating result on GPU"); + Timer timer; + startTimer(&timer); + uint32_t numPrevFrontier = 1; + uint32_t numThreadsPerBlock = 256; + for(uint32_t level = 1; numPrevFrontier > 0; ++level) { + + // Visit nodes in previous frontier + cudaMemset(numCurrFrontier_d, 0, sizeof(uint32_t)); + uint32_t numBlocks = (numPrevFrontier + numThreadsPerBlock - 1)/numThreadsPerBlock; + bfs_kernel <<< numBlocks, numThreadsPerBlock >>> (csrGraph_d, nodeLevel_d, prevFrontier_d, currFrontier_d, numPrevFrontier, numCurrFrontier_d, level); + + // Swap buffers + uint32_t* tmp = prevFrontier_d; + prevFrontier_d = currFrontier_d; + currFrontier_d = tmp; + cudaMemcpy(&numPrevFrontier, numCurrFrontier_d, sizeof(uint32_t), cudaMemcpyDeviceToHost); + + } + cudaDeviceSynchronize(); + stopTimer(&timer); + if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3); + PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3); + + // Copy data from GPU + cudaMemcpy(nodeLevel_gpu, nodeLevel_d, csrGraph_d.numNodes*sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + // Initialize frontier double buffers for CPU + uint32_t* buffer1 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + uint32_t* buffer2 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + uint32_t* prevFrontier = buffer1; + uint32_t* currFrontier = buffer2; + + // Calculating result on CPU + PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); + nodeLevel_cpu[srcNode] = 0; + prevFrontier[0] = srcNode; + numPrevFrontier = 1; + for(uint32_t level = 1; numPrevFrontier > 0; ++level) { + + uint32_t numCurrFrontier = 0; + + // Visit nodes in the previous frontier + for(uint32_t i = 0; i < numPrevFrontier; ++i) { + uint32_t node = prevFrontier[i]; + for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) { + uint32_t neighbor = csrGraph.neighborIdxs[edge]; + if(nodeLevel_cpu[neighbor] == UINT32_MAX) { // Node not previously visited + nodeLevel_cpu[neighbor] = level; + currFrontier[numCurrFrontier] = neighbor; + ++numCurrFrontier; + } + } + } + + // Swap buffers + uint32_t* tmp = prevFrontier; + prevFrontier = currFrontier; + currFrontier = tmp; + numPrevFrontier = numCurrFrontier; + + } + + // Verify result + PRINT_INFO(p.verbosity >= 1, "Verifying the result"); + for(uint32_t i = 0; i < csrGraph.numNodes; ++i) { + if(nodeLevel_cpu[i] != nodeLevel_gpu[i]) { + printf("Mismatch detected at node %u (CPU result = %u, GPU result = %u)\n", i, nodeLevel_cpu[i], nodeLevel_gpu[i]); + exit(0); + } + } + + // Deallocate data structures + freeCOOGraph(cooGraph); + freeCSRGraph(csrGraph); + free(nodeLevel_cpu); + free(nodeLevel_gpu); + free(buffer1); + free(buffer2); + + return 0; + +} + |