diff options
Diffstat (limited to 'BFS/baselines/cpu')
-rw-r--r-- | BFS/baselines/cpu/Makefile | 28 | ||||
-rw-r--r-- | BFS/baselines/cpu/app.c | 231 | ||||
-rwxr-xr-x | BFS/baselines/cpu/run.sh | 14 |
3 files changed, 171 insertions, 102 deletions
diff --git a/BFS/baselines/cpu/Makefile b/BFS/baselines/cpu/Makefile index 895d38b..1b1d57c 100644 --- a/BFS/baselines/cpu/Makefile +++ b/BFS/baselines/cpu/Makefile @@ -1,7 +1,27 @@ -all: - gcc -o bfs -fopenmp app.c +.PHONY: all +all: bfs -clean: - rm bfs +bfs: app.c + gcc -O2 -o bfs -fopenmp app.c + +bfs_O0: app.c + gcc -o bfs_O0 -fopenmp app.c + +bfs_O2: app.c + gcc -O2 -o bfs_O2 -fopenmp app.c +.PHONY: run +run: bfs + ./bfs -f ../../data/loc-gowalla_edges.txt +.PHONY: run_O0 +run_O0: bfs_O0 + OMP_NUM_THREADS=4 ./bfs_O0 -f ../../data/loc-gowalla_edges.txt + +.PHONY: run_O2 +run_O2: bfs_O2 + OMP_NUM_THREADS=4 ./bfs_O2 -f ../../data/loc-gowalla_edges.txt + +.PHONY: clean +clean: + rm -f bfs bfs_O0 bfs_O2 diff --git a/BFS/baselines/cpu/app.c b/BFS/baselines/cpu/app.c index f75a877..65cd420 100644 --- a/BFS/baselines/cpu/app.c +++ b/BFS/baselines/cpu/app.c @@ -23,123 +23,158 @@ int main(int argc, char** argv) { PRINT_INFO(p.verbosity >= 1, "Reading graph %s", p.fileName); struct COOGraph cooGraph = readCOOGraph(p.fileName); PRINT_INFO(p.verbosity >= 1, " Graph has %d nodes and %d edges", cooGraph.numNodes, cooGraph.numEdges); - struct CSRGraph csrGraph = coo2csr(cooGraph); - uint32_t* nodeLevel = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); - uint32_t* nodeLevelRef = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); - for(uint32_t i = 0; i < csrGraph.numNodes; ++i) { - nodeLevel[i] = UINT32_MAX; // Unreachable - nodeLevelRef[i] = UINT32_MAX; // Unreachable - } - uint32_t srcNode = 0; - // Initialize frontier double buffers - uint32_t* buffer1 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); - uint32_t* buffer2 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); - uint32_t* prevFrontier = buffer1; - uint32_t* currFrontier = buffer2; - // Calculating result on CPU - PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU (OpenMP)"); - omp_set_num_threads(4); Timer timer; - startTimer(&timer); - nodeLevel[srcNode] = 0; - prevFrontier[0] = srcNode; - uint32_t numPrevFrontier = 1; - for(uint32_t level = 1; numPrevFrontier > 0; ++level) { - - uint32_t numCurrFrontier = 0; - - // Visit nodes in the previous frontier - #pragma omp parallel for - for(uint32_t i = 0; i < numPrevFrontier; ++i) { - uint32_t node = prevFrontier[i]; - for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) { - uint32_t neighbor = csrGraph.neighborIdxs[edge]; - uint32_t justVisited = 0; - #pragma omp critical - { - if(nodeLevel[neighbor] == UINT32_MAX) { // Node not previously visited - nodeLevel[neighbor] = level; - justVisited = 1; - } - } - if(justVisited) { - uint32_t currFrontierIdx; + for(int rep = 0; rep < 100; rep++) { + + struct CSRGraph csrGraph = coo2csr(cooGraph); + uint32_t* nodeLevel = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + uint32_t* nodeLevelRef = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + for(uint32_t i = 0; i < csrGraph.numNodes; ++i) { + nodeLevel[i] = UINT32_MAX; // Unreachable + nodeLevelRef[i] = UINT32_MAX; // Unreachable + } + uint32_t srcNode = 0; + + // Initialize frontier double buffers + uint32_t* buffer1 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + uint32_t* buffer2 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + uint32_t* prevFrontier = buffer1; + uint32_t* currFrontier = buffer2; + + // Calculating result on CPU + startTimer(&timer, 0, 0); + nodeLevel[srcNode] = 0; + prevFrontier[0] = srcNode; + uint32_t numPrevFrontier = 1; + for(uint32_t level = 1; numPrevFrontier > 0; ++level) { + + uint32_t numCurrFrontier = 0; + + // Visit nodes in the previous frontier + #pragma omp parallel for + for(uint32_t i = 0; i < numPrevFrontier; ++i) { + uint32_t node = prevFrontier[i]; + for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) { + uint32_t neighbor = csrGraph.neighborIdxs[edge]; + uint32_t justVisited = 0; #pragma omp critical { - currFrontierIdx = numCurrFrontier++; + if(nodeLevel[neighbor] == UINT32_MAX) { // Node not previously visited + nodeLevel[neighbor] = level; + justVisited = 1; + } + } + if(justVisited) { + uint32_t currFrontierIdx; + #pragma omp critical + { + currFrontierIdx = numCurrFrontier++; + } + currFrontier[currFrontierIdx] = neighbor; } - currFrontier[currFrontierIdx] = neighbor; } } + + // Swap buffers + uint32_t* tmp = prevFrontier; + prevFrontier = currFrontier; + currFrontier = tmp; + numPrevFrontier = numCurrFrontier; + } - - // Swap buffers - uint32_t* tmp = prevFrontier; - prevFrontier = currFrontier; - currFrontier = tmp; - numPrevFrontier = numCurrFrontier; - - } - stopTimer(&timer); - if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3); - PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3); - - // Calculating result on CPU sequentially - PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU (sequential)"); - startTimer(&timer); - nodeLevelRef[srcNode] = 0; - prevFrontier[0] = srcNode; - numPrevFrontier = 1; - for(uint32_t level = 1; numPrevFrontier > 0; ++level) { - - uint32_t numCurrFrontier = 0; - - // Visit nodes in the previous frontier - for(uint32_t i = 0; i < numPrevFrontier; ++i) { - uint32_t node = prevFrontier[i]; - for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) { - uint32_t neighbor = csrGraph.neighborIdxs[edge]; - uint32_t justVisited = 0; - if(nodeLevelRef[neighbor] == UINT32_MAX) { // Node not previously visited - nodeLevelRef[neighbor] = level; - justVisited = 1; - } - if(justVisited) { - uint32_t currFrontierIdx; - currFrontierIdx = numCurrFrontier++; - currFrontier[currFrontierIdx] = neighbor; + stopTimer(&timer, 0); + + freeCSRGraph(csrGraph); + free(buffer1); + free(buffer2); + + csrGraph = coo2csr(cooGraph); + srcNode = 0; + + // Initialize frontier double buffers + buffer1 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + buffer2 = (uint32_t*) malloc(csrGraph.numNodes*sizeof(uint32_t)); + prevFrontier = buffer1; + currFrontier = buffer2; + + // Calculating result on CPU sequentially + startTimer(&timer, 1, 0); + nodeLevelRef[srcNode] = 0; + prevFrontier[0] = srcNode; + numPrevFrontier = 1; + for(uint32_t level = 1; numPrevFrontier > 0; ++level) { + + uint32_t numCurrFrontier = 0; + + // Visit nodes in the previous frontier + for(uint32_t i = 0; i < numPrevFrontier; ++i) { + uint32_t node = prevFrontier[i]; + for(uint32_t edge = csrGraph.nodePtrs[node]; edge < csrGraph.nodePtrs[node + 1]; ++edge) { + uint32_t neighbor = csrGraph.neighborIdxs[edge]; + uint32_t justVisited = 0; + if(nodeLevelRef[neighbor] == UINT32_MAX) { // Node not previously visited + nodeLevelRef[neighbor] = level; + justVisited = 1; + } + if(justVisited) { + uint32_t currFrontierIdx; + currFrontierIdx = numCurrFrontier++; + currFrontier[currFrontierIdx] = neighbor; + } } } + + // Swap buffers + uint32_t* tmp = prevFrontier; + prevFrontier = currFrontier; + currFrontier = tmp; + numPrevFrontier = numCurrFrontier; + + } + stopTimer(&timer, 1); + + unsigned int nr_threads = 0; +#pragma omp parallel +#pragma omp atomic + nr_threads++; + + // Verifying result + int isOK = 1; + for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) { + if(nodeLevel[nodeIdx] != nodeLevelRef[nodeIdx]) { + PRINT_ERROR("Mismatch at node %u (CPU sequential result = level %u, CPU parallel result = level %u)", nodeIdx, nodeLevelRef[nodeIdx], nodeLevel[nodeIdx]); + isOK = 0; + } } - // Swap buffers - uint32_t* tmp = prevFrontier; - prevFrontier = currFrontier; - currFrontier = tmp; - numPrevFrontier = numCurrFrontier; - - } - stopTimer(&timer); - if(p.verbosity == 0) PRINT("%f", getElapsedTime(timer)*1e3); - PRINT_INFO(p.verbosity >= 1, "Elapsed time: %f ms", getElapsedTime(timer)*1e3); - - // Verifying result - PRINT_INFO(p.verbosity >= 1, "Verifying the result"); - for(uint32_t nodeIdx = 0; nodeIdx < csrGraph.numNodes; ++nodeIdx) { - if(nodeLevel[nodeIdx] != nodeLevelRef[nodeIdx]) { - PRINT_ERROR("Mismatch at node %u (CPU sequential result = level %u, CPU parallel result = level %u)", nodeIdx, nodeLevelRef[nodeIdx], nodeLevel[nodeIdx]); + if (isOK) { + printf("[::] n_threads=%d e_type=%s n_elements=%d " + "| throughput_cpu_ref_MBps=%f throughput_cpu_omp_MBps=%f\n", + nr_threads, "uint32_t", csrGraph.numNodes, + csrGraph.numNodes * sizeof(uint32_t) / timer.time[1], + csrGraph.numNodes * sizeof(uint32_t) / timer.time[0]); + printf("[::] n_threads=%d e_type=%s n_elements=%d " + "| throughput_cpu_ref_MOpps=%f throughput_cpu_omp_MOpps=%f\n", + nr_threads, "uint32_t", csrGraph.numNodes, + csrGraph.numNodes / timer.time[1], + csrGraph.numNodes / timer.time[0]); + printf("[::] n_threads=%d e_type=%s n_elements=%d |", + nr_threads, "uint32_t", csrGraph.numNodes); + printAll(&timer, 1); } + + freeCSRGraph(csrGraph); + free(nodeLevel); + free(nodeLevelRef); + free(buffer1); + free(buffer2); } // Deallocate data structures freeCOOGraph(cooGraph); - freeCSRGraph(csrGraph); - free(nodeLevel); - free(buffer1); - free(buffer2); return 0; diff --git a/BFS/baselines/cpu/run.sh b/BFS/baselines/cpu/run.sh new file mode 100755 index 0000000..cbed050 --- /dev/null +++ b/BFS/baselines/cpu/run.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +set -e + +echo "prim-benchmarks BFS CPU (dfatool edition)" +echo "Started at $(date)" +echo "Revision $(git describe --always)" + +# default threads: 4 + +make +for nr_threads in 1 2 4 6 8 12 16 20 24 32; do + OMP_NUM_THREADS=${nr_threads} timeout -k 1m 30m ./bfs -f ../../data/loc-gowalla_edges.txt || true +done |