summaryrefslogtreecommitdiff
path: root/BFS/baselines
diff options
context:
space:
mode:
authorJuan Gomez Luna <juan.gomez@safari.ethz.ch>2021-06-16 19:46:05 +0200
committerJuan Gomez Luna <juan.gomez@safari.ethz.ch>2021-06-16 19:46:05 +0200
commit3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
treefc6776a94549d2d4039898f183dbbeb2ce013ba9 /BFS/baselines
parentef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)
PrIM -- first commit
Diffstat (limited to 'BFS/baselines')
-rw-r--r--BFS/baselines/cpu/Makefile7
-rw-r--r--BFS/baselines/cpu/README9
-rw-r--r--BFS/baselines/cpu/app.c147
-rw-r--r--BFS/baselines/gpu/Makefile5
-rw-r--r--BFS/baselines/gpu/README9
-rw-r--r--BFS/baselines/gpu/app.cu157
6 files changed, 334 insertions, 0 deletions
diff --git a/BFS/baselines/cpu/Makefile b/BFS/baselines/cpu/Makefile
new file mode 100644
index 0000000..895d38b
--- /dev/null
+++ b/BFS/baselines/cpu/Makefile
@@ -0,0 +1,7 @@
+all:
+ gcc -o bfs -fopenmp app.c
+
+clean:
+ rm bfs
+
+
diff --git a/BFS/baselines/cpu/README b/BFS/baselines/cpu/README
new file mode 100644
index 0000000..f2dfefa
--- /dev/null
+++ b/BFS/baselines/cpu/README
@@ -0,0 +1,9 @@
+Breadth-First Search (BFS)
+
+Compilation instructions:
+
+ make
+
+Execution instructions
+
+ ./bfs -f ../../data/loc-gowalla_edges.txt
diff --git a/BFS/baselines/cpu/app.c b/BFS/baselines/cpu/app.c
new file mode 100644
index 0000000..f75a877
--- /dev/null
+++ b/BFS/baselines/cpu/app.c
@@ -0,0 +1,147 @@
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+
+#include <omp.h>
+
+#include "../../support/common.h"
+#include "../../support/graph.h"
+#include "../../support/params.h"
+#include "../../support/timer.h"
+#include "../../support/utils.h"
+
+int main(int argc, char** argv) {
+
+ // Process parameters
+ struct Params p = input_params(argc, argv);
+
+ // Initialize BFS data structures
+ PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
+ struct COOGraph cooGraph = readCOOGraph(p.fileName);
+ PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
+ struct CSRGraph csrGraph = coo2csr(cooGraph);
+ uint32_t* nodeLevel = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+ uint32_t* nodeLevelRef = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+ for(uint32_t i = 0; i < csrGraph.numNodes; ++i) {
+ nodeLevel[i] = UINT32_MAX; // Unreachable
+ nodeLevelRef[i] = UINT32_MAX; // Unreachable
+ }
+ uint32_t srcNode = 0;
+
+ // Initialize frontier double buffers
+ uint32_t* buffer1 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+ uint32_t* buffer2 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+ uint32_t* prevFrontier = buffer1;
+ uint32_t* currFrontier = buffer2;
+
+ // Calculating result on CPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU (OpenMP)");
+ omp_set_num_threads(4);
+ Timer timer;
+ startTimer(&timer);
+ nodeLevel[srcNode] = 0;
+ prevFrontier[0] = srcNode;
+ uint32_t numPrevFrontier = 1;
+ for(uint32_t level = 1; numPrevFrontier > 0; ++level) {
+
+ uint32_t numCurrFrontier = 0;
+
+ // Visit nodes in the previous frontier
+ #pragma omp parallel for
+ for(uint32_t i = 0; i < numPrevFrontier; ++i) {
+ uint32_t node = prevFrontier[i];
+ for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) {
+ uint32_t neighbor = csrGraph.neighborIdxs[edge];
+ uint32_t justVisited = 0;
+ #pragma omp critical
+ {
+ if(nodeLevel[neighbor] == UINT32_MAX) { // Node not previously visited
+ nodeLevel[neighbor] = level;
+ justVisited = 1;
+ }
+ }
+ if(justVisited) {
+ uint32_t currFrontierIdx;
+ #pragma omp critical
+ {
+ currFrontierIdx = numCurrFrontier++;
+ }
+ currFrontier[currFrontierIdx] = neighbor;
+ }
+ }
+ }
+
+ // Swap buffers
+ uint32_t* tmp = prevFrontier;
+ prevFrontier = currFrontier;
+ currFrontier = tmp;
+ numPrevFrontier = numCurrFrontier;
+
+ }
+ stopTimer(&timer);
+ if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+ PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+
+ // Calculating result on CPU sequentially
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU (sequential)");
+ startTimer(&timer);
+ nodeLevelRef[srcNode] = 0;
+ prevFrontier[0] = srcNode;
+ numPrevFrontier = 1;
+ for(uint32_t level = 1; numPrevFrontier > 0; ++level) {
+
+ uint32_t numCurrFrontier = 0;
+
+ // Visit nodes in the previous frontier
+ for(uint32_t i = 0; i < numPrevFrontier; ++i) {
+ uint32_t node = prevFrontier[i];
+ for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) {
+ uint32_t neighbor = csrGraph.neighborIdxs[edge];
+ uint32_t justVisited = 0;
+ if(nodeLevelRef[neighbor] == UINT32_MAX) { // Node not previously visited
+ nodeLevelRef[neighbor] = level;
+ justVisited = 1;
+ }
+ if(justVisited) {
+ uint32_t currFrontierIdx;
+ currFrontierIdx = numCurrFrontier++;
+ currFrontier[currFrontierIdx] = neighbor;
+ }
+ }
+ }
+
+ // Swap buffers
+ uint32_t* tmp = prevFrontier;
+ prevFrontier = currFrontier;
+ currFrontier = tmp;
+ numPrevFrontier = numCurrFrontier;
+
+ }
+ stopTimer(&timer);
+ if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+ PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+
+ // Verifying result
+ PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+ for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) {
+ if(nodeLevel[nodeIdx] != nodeLevelRef[nodeIdx]) {
+ PRINT_ERROR("Mismatch at node %u (CPU sequential result = level %u, CPU parallel result = level %u)", nodeIdx, nodeLevelRef[nodeIdx], nodeLevel[nodeIdx]);
+ }
+ }
+
+
+ // Deallocate data structures
+ freeCOOGraph(cooGraph);
+ freeCSRGraph(csrGraph);
+ free(nodeLevel);
+ free(buffer1);
+ free(buffer2);
+
+ return 0;
+
+}
+
diff --git a/BFS/baselines/gpu/Makefile b/BFS/baselines/gpu/Makefile
new file mode 100644
index 0000000..7d9d953
--- /dev/null
+++ b/BFS/baselines/gpu/Makefile
@@ -0,0 +1,5 @@
+all:
+ /usr/local/cuda/bin/nvcc app.cu -I/usr/local/cuda/include -lm -o bfs
+
+clean:
+ rm bfs
diff --git a/BFS/baselines/gpu/README b/BFS/baselines/gpu/README
new file mode 100644
index 0000000..f2dfefa
--- /dev/null
+++ b/BFS/baselines/gpu/README
@@ -0,0 +1,9 @@
+Breadth-First Search (BFS)
+
+Compilation instructions:
+
+ make
+
+Execution instructions
+
+ ./bfs -f ../../data/loc-gowalla_edges.txt
diff --git a/BFS/baselines/gpu/app.cu b/BFS/baselines/gpu/app.cu
new file mode 100644
index 0000000..e378b4c
--- /dev/null
+++ b/BFS/baselines/gpu/app.cu
@@ -0,0 +1,157 @@
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+
+#include "../../support/common.h"
+#include "../../support/graph.h"
+#include "../../support/params.h"
+#include "../../support/timer.h"
+#include "../../support/utils.h"
+
+__global__ void bfs_kernel(CSRGraph csrGraph, uint32_t* nodeLevel, uint32_t* prevFrontier, uint32_t* currFrontier, uint32_t numPrevFrontier, uint32_t* numCurrFrontier, uint32_t level) {
+ uint32_t i = blockIdx.x*blockDim.x + threadIdx.x;
+ if(i < numPrevFrontier) {
+ uint32_t node = prevFrontier[i];
+ for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) {
+ uint32_t neighbor = csrGraph.neighborIdxs[edge];
+ if(atomicCAS(&nodeLevel[neighbor], UINT32_MAX, level) == UINT32_MAX) { // Node not previously visited
+ uint32_t currFrontierIdx = atomicAdd(numCurrFrontier, 1);
+ currFrontier[currFrontierIdx] = neighbor;
+ }
+ }
+ }
+}
+
+int main(int argc, char** argv) {
+
+ // Process parameters
+ struct Params p = input_params(argc, argv);
+
+ // Initialize BFS data structures
+ PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
+ struct COOGraph cooGraph = readCOOGraph(p.fileName);
+ PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
+ struct CSRGraph csrGraph = coo2csr(cooGraph);
+ uint32_t* nodeLevel_cpu = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+ uint32_t* nodeLevel_gpu = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+ for(uint32_t i = 0; i < csrGraph.numNodes; ++i) {
+ nodeLevel_cpu[i] = UINT32_MAX; // Unreachable
+ nodeLevel_gpu[i] = UINT32_MAX; // Unreachable
+ }
+ uint32_t srcNode = 0;
+
+ // Allocate GPU memory
+ CSRGraph csrGraph_d;
+ csrGraph_d.numNodes = csrGraph.numNodes;
+ csrGraph_d.numEdges = csrGraph.numEdges;
+ cudaMalloc((void**) &csrGraph_d.nodePtrs, (csrGraph_d.numNodes + 1)*sizeof(uint32_t));
+ cudaMalloc((void**) &csrGraph_d.neighborIdxs, csrGraph_d.numEdges*sizeof(uint32_t));
+ uint32_t* nodeLevel_d;
+ cudaMalloc((void**) &nodeLevel_d, csrGraph_d.numNodes*sizeof(uint32_t));
+ uint32_t* buffer1_d;
+ cudaMalloc((void**) &buffer1_d, csrGraph_d.numNodes*sizeof(uint32_t));
+ uint32_t* buffer2_d;
+ cudaMalloc((void**) &buffer2_d, csrGraph_d.numNodes*sizeof(uint32_t));
+ uint32_t* numCurrFrontier_d;
+ cudaMalloc((void**) &numCurrFrontier_d, sizeof(uint32_t));
+ uint32_t* prevFrontier_d = buffer1_d;
+ uint32_t* currFrontier_d = buffer2_d;
+
+ // Copy data to GPU
+ cudaMemcpy(csrGraph_d.nodePtrs, csrGraph.nodePtrs, (csrGraph_d.numNodes + 1)*sizeof(uint32_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(csrGraph_d.neighborIdxs, csrGraph.neighborIdxs, csrGraph_d.numEdges*sizeof(uint32_t), cudaMemcpyHostToDevice);
+ nodeLevel_gpu[srcNode] = 0;
+ cudaMemcpy(nodeLevel_d, nodeLevel_gpu, csrGraph_d.numNodes*sizeof(uint32_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(prevFrontier_d, &srcNode, sizeof(uint32_t), cudaMemcpyHostToDevice);
+ cudaDeviceSynchronize();
+
+ // Calculating result on GPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on GPU");
+ Timer timer;
+ startTimer(&timer);
+ uint32_t numPrevFrontier = 1;
+ uint32_t numThreadsPerBlock = 256;
+ for(uint32_t level = 1; numPrevFrontier > 0; ++level) {
+
+ // Visit nodes in previous frontier
+ cudaMemset(numCurrFrontier_d, 0, sizeof(uint32_t));
+ uint32_t numBlocks = (numPrevFrontier + numThreadsPerBlock - 1)/numThreadsPerBlock;
+ bfs_kernel <<< numBlocks, numThreadsPerBlock >>> (csrGraph_d, nodeLevel_d, prevFrontier_d, currFrontier_d, numPrevFrontier, numCurrFrontier_d, level);
+
+ // Swap buffers
+ uint32_t* tmp = prevFrontier_d;
+ prevFrontier_d = currFrontier_d;
+ currFrontier_d = tmp;
+ cudaMemcpy(&numPrevFrontier, numCurrFrontier_d, sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+ }
+ cudaDeviceSynchronize();
+ stopTimer(&timer);
+ if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+ PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+
+ // Copy data from GPU
+ cudaMemcpy(nodeLevel_gpu, nodeLevel_d, csrGraph_d.numNodes*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+ cudaDeviceSynchronize();
+
+ // Initialize frontier double buffers for CPU
+ uint32_t* buffer1 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+ uint32_t* buffer2 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+ uint32_t* prevFrontier = buffer1;
+ uint32_t* currFrontier = buffer2;
+
+ // Calculating result on CPU
+ PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+ nodeLevel_cpu[srcNode] = 0;
+ prevFrontier[0] = srcNode;
+ numPrevFrontier = 1;
+ for(uint32_t level = 1; numPrevFrontier > 0; ++level) {
+
+ uint32_t numCurrFrontier = 0;
+
+ // Visit nodes in the previous frontier
+ for(uint32_t i = 0; i < numPrevFrontier; ++i) {
+ uint32_t node = prevFrontier[i];
+ for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) {
+ uint32_t neighbor = csrGraph.neighborIdxs[edge];
+ if(nodeLevel_cpu[neighbor] == UINT32_MAX) { // Node not previously visited
+ nodeLevel_cpu[neighbor] = level;
+ currFrontier[numCurrFrontier] = neighbor;
+ ++numCurrFrontier;
+ }
+ }
+ }
+
+ // Swap buffers
+ uint32_t* tmp = prevFrontier;
+ prevFrontier = currFrontier;
+ currFrontier = tmp;
+ numPrevFrontier = numCurrFrontier;
+
+ }
+
+ // Verify result
+ PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+ for(uint32_t i = 0; i < csrGraph.numNodes; ++i) {
+ if(nodeLevel_cpu[i] != nodeLevel_gpu[i]) {
+ printf("Mismatch detected at node %u (CPU result = %u, GPU result = %u)\n", i, nodeLevel_cpu[i], nodeLevel_gpu[i]);
+ exit(0);
+ }
+ }
+
+ // Deallocate data structures
+ freeCOOGraph(cooGraph);
+ freeCSRGraph(csrGraph);
+ free(nodeLevel_cpu);
+ free(nodeLevel_gpu);
+ free(buffer1);
+ free(buffer2);
+
+ return 0;
+
+}
+