PrIM -- first commit

author: Juan Gomez Luna <juan.gomez@safari.ethz.ch> 2021-06-16 19:46:05 +0200
committer: Juan Gomez Luna <juan.gomez@safari.ethz.ch> 2021-06-16 19:46:05 +0200
commit: 3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
tree: fc6776a94549d2d4039898f183dbbeb2ce013ba9 /BFS/baselines
parent: ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)
6 files changed, 334 insertions, 0 deletions
diff --git a/BFS/baselines/cpu/Makefile b/BFS/baselines/cpu/Makefile
new file mode 100644
index 0000000..895d38b
--- /dev/null
+++ b/BFS/baselines/cpu/Makefile
@@ -0,0 +1,7 @@
+all:
+		gcc -o bfs -fopenmp app.c 
+
+clean:
+		rm bfs
+
+
diff --git a/BFS/baselines/cpu/README b/BFS/baselines/cpu/README
new file mode 100644
index 0000000..f2dfefa
--- /dev/null
+++ b/BFS/baselines/cpu/README
@@ -0,0 +1,9 @@
+Breadth-First Search (BFS)
+
+Compilation instructions:
+
+    make
+
+Execution instructions
+
+    ./bfs -f ../../data/loc-gowalla_edges.txt 
diff --git a/BFS/baselines/cpu/app.c b/BFS/baselines/cpu/app.c
new file mode 100644
index 0000000..f75a877
--- /dev/null
+++ b/BFS/baselines/cpu/app.c
@@ -0,0 +1,147 @@
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+
+#include <omp.h>
+
+#include "../../support/common.h"
+#include "../../support/graph.h"
+#include "../../support/params.h"
+#include "../../support/timer.h"
+#include "../../support/utils.h"
+
+int main(int argc, char** argv) {
+
+    // Process parameters
+    struct Params p = input_params(argc, argv);
+
+    // Initialize BFS data structures
+    PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
+    struct COOGraph cooGraph = readCOOGraph(p.fileName);
+    PRINT_INFO(p.verbosity >= 1, "    Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
+    struct CSRGraph csrGraph = coo2csr(cooGraph);
+    uint32_t* nodeLevel = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+    uint32_t* nodeLevelRef = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+    for(uint32_t i = 0; i < csrGraph.numNodes; ++i) {
+        nodeLevel[i] = UINT32_MAX; // Unreachable
+        nodeLevelRef[i] = UINT32_MAX; // Unreachable
+    }
+    uint32_t srcNode = 0;
+
+    // Initialize frontier double buffers
+    uint32_t* buffer1 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+    uint32_t* buffer2 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+    uint32_t* prevFrontier = buffer1;
+    uint32_t* currFrontier = buffer2;
+
+    // Calculating result on CPU
+    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU (OpenMP)");
+    omp_set_num_threads(4);
+    Timer timer;
+    startTimer(&timer);
+    nodeLevel[srcNode] = 0;
+    prevFrontier[0] = srcNode;
+    uint32_t numPrevFrontier = 1;
+    for(uint32_t level = 1; numPrevFrontier > 0; ++level) {
+
+        uint32_t numCurrFrontier = 0;
+
+        // Visit nodes in the previous frontier
+        #pragma omp parallel for
+        for(uint32_t i = 0; i < numPrevFrontier; ++i) {
+            uint32_t node = prevFrontier[i];
+            for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) {
+                uint32_t neighbor = csrGraph.neighborIdxs[edge];
+                uint32_t justVisited = 0;
+                #pragma omp critical
+                {
+                    if(nodeLevel[neighbor] == UINT32_MAX) { // Node not previously visited
+                        nodeLevel[neighbor] = level;
+                        justVisited = 1;
+                    }
+                }
+                if(justVisited) {
+                    uint32_t currFrontierIdx;
+                    #pragma omp critical
+                    {
+                        currFrontierIdx = numCurrFrontier++;
+                    }
+                    currFrontier[currFrontierIdx] = neighbor;
+                }
+            }
+        }
+
+        // Swap buffers
+        uint32_t* tmp = prevFrontier;
+        prevFrontier = currFrontier;
+        currFrontier = tmp;
+        numPrevFrontier = numCurrFrontier;
+
+    }
+    stopTimer(&timer);
+    if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+    PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+
+    // Calculating result on CPU sequentially
+    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU (sequential)");
+    startTimer(&timer);
+    nodeLevelRef[srcNode] = 0;
+    prevFrontier[0] = srcNode;
+    numPrevFrontier = 1;
+    for(uint32_t level = 1; numPrevFrontier > 0; ++level) {
+
+        uint32_t numCurrFrontier = 0;
+
+        // Visit nodes in the previous frontier
+        for(uint32_t i = 0; i < numPrevFrontier; ++i) {
+            uint32_t node = prevFrontier[i];
+            for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) {
+                uint32_t neighbor = csrGraph.neighborIdxs[edge];
+                uint32_t justVisited = 0;
+                if(nodeLevelRef[neighbor] == UINT32_MAX) { // Node not previously visited
+                    nodeLevelRef[neighbor] = level;
+                    justVisited = 1;
+                }
+                if(justVisited) {
+                    uint32_t currFrontierIdx;
+                    currFrontierIdx = numCurrFrontier++;
+                    currFrontier[currFrontierIdx] = neighbor;
+                }
+            }
+        }
+
+        // Swap buffers
+        uint32_t* tmp = prevFrontier;
+        prevFrontier = currFrontier;
+        currFrontier = tmp;
+        numPrevFrontier = numCurrFrontier;
+
+    }
+    stopTimer(&timer);
+    if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+    PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+
+    // Verifying result
+    PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+    for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) {
+        if(nodeLevel[nodeIdx] != nodeLevelRef[nodeIdx]) {
+            PRINT_ERROR("Mismatch at node %u (CPU sequential result = level %u, CPU parallel result = level %u)", nodeIdx, nodeLevelRef[nodeIdx], nodeLevel[nodeIdx]);
+        }
+    }
+
+
+    // Deallocate data structures
+    freeCOOGraph(cooGraph);
+    freeCSRGraph(csrGraph);
+    free(nodeLevel);
+    free(buffer1);
+    free(buffer2);
+
+    return 0;
+
+}
+
diff --git a/BFS/baselines/gpu/Makefile b/BFS/baselines/gpu/Makefile
new file mode 100644
index 0000000..7d9d953
--- /dev/null
+++ b/BFS/baselines/gpu/Makefile
@@ -0,0 +1,5 @@
+all:
+	/usr/local/cuda/bin/nvcc app.cu -I/usr/local/cuda/include -lm -o bfs
+
+clean:
+	rm bfs
diff --git a/BFS/baselines/gpu/README b/BFS/baselines/gpu/README
new file mode 100644
index 0000000..f2dfefa
--- /dev/null
+++ b/BFS/baselines/gpu/README
@@ -0,0 +1,9 @@
+Breadth-First Search (BFS)
+
+Compilation instructions:
+
+    make
+
+Execution instructions
+
+    ./bfs -f ../../data/loc-gowalla_edges.txt 
diff --git a/BFS/baselines/gpu/app.cu b/BFS/baselines/gpu/app.cu
new file mode 100644
index 0000000..e378b4c
--- /dev/null
+++ b/BFS/baselines/gpu/app.cu
@@ -0,0 +1,157 @@
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+
+#include "../../support/common.h"
+#include "../../support/graph.h"
+#include "../../support/params.h"
+#include "../../support/timer.h"
+#include "../../support/utils.h"
+
+__global__ void bfs_kernel(CSRGraph csrGraph, uint32_t* nodeLevel, uint32_t* prevFrontier, uint32_t* currFrontier, uint32_t numPrevFrontier, uint32_t* numCurrFrontier,  uint32_t level) {
+    uint32_t i = blockIdx.x*blockDim.x + threadIdx.x;
+    if(i < numPrevFrontier) {
+        uint32_t node = prevFrontier[i];
+        for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) {
+            uint32_t neighbor = csrGraph.neighborIdxs[edge];
+            if(atomicCAS(&nodeLevel[neighbor], UINT32_MAX, level) == UINT32_MAX) { // Node not previously visited
+                uint32_t currFrontierIdx = atomicAdd(numCurrFrontier, 1);
+                currFrontier[currFrontierIdx] = neighbor;
+            }
+        }
+    }
+}
+
+int main(int argc, char** argv) {
+
+    // Process parameters
+    struct Params p = input_params(argc, argv);
+
+    // Initialize BFS data structures
+    PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName);
+    struct COOGraph cooGraph = readCOOGraph(p.fileName);
+    PRINT_INFO(p.verbosity >= 1, "    Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges);
+    struct CSRGraph csrGraph = coo2csr(cooGraph);
+    uint32_t* nodeLevel_cpu = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+    uint32_t* nodeLevel_gpu = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+    for(uint32_t i = 0; i < csrGraph.numNodes; ++i) {
+        nodeLevel_cpu[i] = UINT32_MAX; // Unreachable
+        nodeLevel_gpu[i] = UINT32_MAX; // Unreachable
+    }
+    uint32_t srcNode = 0;
+
+    // Allocate GPU memory
+    CSRGraph csrGraph_d;
+    csrGraph_d.numNodes = csrGraph.numNodes;
+    csrGraph_d.numEdges = csrGraph.numEdges;
+    cudaMalloc((void**) &csrGraph_d.nodePtrs, (csrGraph_d.numNodes + 1)*sizeof(uint32_t));
+    cudaMalloc((void**) &csrGraph_d.neighborIdxs, csrGraph_d.numEdges*sizeof(uint32_t));
+    uint32_t* nodeLevel_d;
+    cudaMalloc((void**) &nodeLevel_d, csrGraph_d.numNodes*sizeof(uint32_t));
+    uint32_t* buffer1_d;
+    cudaMalloc((void**) &buffer1_d, csrGraph_d.numNodes*sizeof(uint32_t));
+    uint32_t* buffer2_d;
+    cudaMalloc((void**) &buffer2_d, csrGraph_d.numNodes*sizeof(uint32_t));
+    uint32_t* numCurrFrontier_d;
+    cudaMalloc((void**) &numCurrFrontier_d, sizeof(uint32_t));
+    uint32_t* prevFrontier_d = buffer1_d;
+    uint32_t* currFrontier_d = buffer2_d;
+
+    // Copy data to GPU
+    cudaMemcpy(csrGraph_d.nodePtrs, csrGraph.nodePtrs, (csrGraph_d.numNodes + 1)*sizeof(uint32_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(csrGraph_d.neighborIdxs, csrGraph.neighborIdxs, csrGraph_d.numEdges*sizeof(uint32_t), cudaMemcpyHostToDevice);
+    nodeLevel_gpu[srcNode] = 0;
+    cudaMemcpy(nodeLevel_d, nodeLevel_gpu, csrGraph_d.numNodes*sizeof(uint32_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(prevFrontier_d, &srcNode, sizeof(uint32_t), cudaMemcpyHostToDevice);
+    cudaDeviceSynchronize();
+
+    // Calculating result on GPU
+    PRINT_INFO(p.verbosity >= 1, "Calculating result on GPU");
+    Timer timer;
+    startTimer(&timer);
+    uint32_t numPrevFrontier = 1;
+    uint32_t numThreadsPerBlock = 256;
+    for(uint32_t level = 1; numPrevFrontier > 0; ++level) {
+
+        // Visit nodes in previous frontier
+        cudaMemset(numCurrFrontier_d, 0, sizeof(uint32_t));
+        uint32_t numBlocks = (numPrevFrontier + numThreadsPerBlock - 1)/numThreadsPerBlock;
+        bfs_kernel <<< numBlocks, numThreadsPerBlock >>> (csrGraph_d, nodeLevel_d, prevFrontier_d, currFrontier_d, numPrevFrontier, numCurrFrontier_d, level);
+
+        // Swap buffers
+        uint32_t* tmp = prevFrontier_d;
+        prevFrontier_d = currFrontier_d;
+        currFrontier_d = tmp;
+        cudaMemcpy(&numPrevFrontier, numCurrFrontier_d, sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+    }
+    cudaDeviceSynchronize();
+    stopTimer(&timer);
+    if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3);
+    PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3);
+
+    // Copy data from GPU
+    cudaMemcpy(nodeLevel_gpu, nodeLevel_d, csrGraph_d.numNodes*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+    cudaDeviceSynchronize();
+
+    // Initialize frontier double buffers for CPU
+    uint32_t* buffer1 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+    uint32_t* buffer2 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t));
+    uint32_t* prevFrontier = buffer1;
+    uint32_t* currFrontier = buffer2;
+
+    // Calculating result on CPU
+    PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU");
+    nodeLevel_cpu[srcNode] = 0;
+    prevFrontier[0] = srcNode;
+    numPrevFrontier = 1;
+    for(uint32_t level = 1; numPrevFrontier > 0; ++level) { 
+
+        uint32_t numCurrFrontier = 0;
+
+        // Visit nodes in the previous frontier
+        for(uint32_t i = 0; i < numPrevFrontier; ++i) {
+            uint32_t node = prevFrontier[i];
+            for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) {
+                uint32_t neighbor = csrGraph.neighborIdxs[edge];
+                if(nodeLevel_cpu[neighbor] == UINT32_MAX) { // Node not previously visited
+                    nodeLevel_cpu[neighbor] = level;
+                    currFrontier[numCurrFrontier] = neighbor;
+                    ++numCurrFrontier;
+                }
+            }
+        }
+
+        // Swap buffers
+        uint32_t* tmp = prevFrontier;
+        prevFrontier = currFrontier;
+        currFrontier = tmp;
+        numPrevFrontier = numCurrFrontier;
+
+    }
+
+    // Verify result
+    PRINT_INFO(p.verbosity >= 1, "Verifying the result");
+    for(uint32_t i = 0; i < csrGraph.numNodes; ++i) {
+        if(nodeLevel_cpu[i] != nodeLevel_gpu[i]) {
+            printf("Mismatch detected at node %u (CPU result = %u, GPU result = %u)\n", i, nodeLevel_cpu[i], nodeLevel_gpu[i]);
+            exit(0);
+        }
+    }
+
+    // Deallocate data structures
+    freeCOOGraph(cooGraph);
+    freeCSRGraph(csrGraph);
+    free(nodeLevel_cpu);
+    free(nodeLevel_gpu);
+    free(buffer1);
+    free(buffer2);
+
+    return 0;
+
+}
+
author	Juan Gomez Luna <juan.gomez@safari.ethz.ch>	2021-06-16 19:46:05 +0200
committer	Juan Gomez Luna <juan.gomez@safari.ethz.ch>	2021-06-16 19:46:05 +0200
commit	3de4b495fb176eba9a0eb517a4ce05903cb67acb (patch)
tree	fc6776a94549d2d4039898f183dbbeb2ce013ba9 /BFS/baselines
parent	ef5c3688c486b80a56d3c1cded25f2b2387f2668 (diff)