diff options
187 files changed, 2663 insertions, 2880 deletions
@@ -3,10 +3,13 @@ bin log-paper-strong-full.txt log-paper-strong-rank.txt log-paper-weak.txt +*~ *-O0.txt *-O2.txt *-explore.txt +*.perf */bin +*/repo.acp log-*.txt log diff --git a/BFS/Makefile b/BFS/Makefile index d43202f..a773b38 100644 --- a/BFS/Makefile +++ b/BFS/Makefile @@ -4,17 +4,34 @@ WITH_ALLOC_OVERHEAD ?= 0 WITH_LOAD_OVERHEAD ?= 0 WITH_FREE_OVERHEAD ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS := -Wall -Wextra -g -Iinclude +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . 
--Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ -ifdef verbose +ifeq (${verbose}, 1) QUIET = endif @@ -23,11 +40,13 @@ all: bin/host_code bin/dpu_code bin: ${QUIET}mkdir -p bin -bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin +bin/dpu_code: ${DPU_SOURCES} include bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} -bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +bin/host_code: ${HOST_SOURCES} include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah clean: ${QUIET}rm -rf bin diff --git a/BFS/benchmark-scripts/ccmcc25-sim.sh b/BFS/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..bcbe284 --- /dev/null +++ b/BFS/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -f ${data} 2>&1 +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks BFS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +# BFS does not support repeated kernel invocations → repeat it here +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} \ + ::: i $(seq 0 4) \ + ::: data data/roadNet-CA.txt data/loc-gowalla_edges.txt \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ +>> ${fn}.txt diff --git a/BFS/benchmark-scripts/ccmcc25.sh b/BFS/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..0dcf4bb --- /dev/null +++ b/BFS/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo 
limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -f ${data} 2>&1 +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks BFS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + # BFS does not support repeated kernel invocations → repeat it here + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any data={data} \ + ::: i $(seq 0 10) \ + ::: data data/roadNet-CA.txt data/loc-gowalla_edges.txt \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + >> ${fn}.txt + +done diff --git a/BFS/dpu/task.c b/BFS/dpu/task.c index 44ec214..5275047 100644 --- a/BFS/dpu/task.c +++ b/BFS/dpu/task.c @@ -12,7 +12,7 @@ #include <perfcounter.h> #include "dpu-utils.h" -#include "../support/common.h" +#include "common.h" BARRIER_INIT(my_barrier, NR_TASKLETS); diff --git a/BFS/host/app.c b/BFS/host/app.c index 9ba7ffb..4431193 100644 --- a/BFS/host/app.c +++ b/BFS/host/app.c @@ -3,9 +3,24 @@ * BFS Host Application Source File * */ +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> +#ifndef ENERGY +#define ENERGY 0 +#endif +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #include <assert.h> #include <getopt.h> #include <stdio.h> @@ -14,18 +29,11 @@ #include <unistd.h> #include "mram-management.h" -#include "../support/common.h" -#include "../support/graph.h" -#include "../support/params.h" -#include "../support/timer.h" -#include "../support/utils.h" - -#ifndef ENERGY -#define ENERGY 0 -#endif -#if ENERGY -#include <dpu_probe.h> -#endif +#include "common.h" +#include "graph.h" +#include "params.h" +#include "timer.h" +#include "utils.h" #define DPU_BINARY "./bin/dpu_code" @@ 
-44,10 +52,6 @@ int main(int argc, char **argv) double tenergy = 0; #endif - printf - ("WITH_ALLOC_OVERHEAD=%d WITH_LOAD_OVERHEAD=%d WITH_FREE_OVERHEAD=%d\n", - WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD); - // Allocate DPUs and load binary struct dpu_set_t dpu_set, dpu; uint32_t numDPUs, numRanks; @@ -59,7 +63,7 @@ int main(int argc, char **argv) #if WITH_ALLOC_OVERHEAD stopTimer(&timer, 0); #else - timer.time[0] = 0; + zeroTimer(&timer, 0); #endif #if WITH_LOAD_OVERHEAD @@ -69,7 +73,7 @@ int main(int argc, char **argv) #if WITH_LOAD_OVERHEAD stopTimer(&timer, 0); #else - timer.time[1] = 0; + zeroTimer(&timer, 1); #endif DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &numDPUs)); @@ -86,10 +90,10 @@ int main(int argc, char **argv) uint32_t numNodes = csrGraph.numNodes; uint32_t *nodePtrs = csrGraph.nodePtrs; uint32_t *neighborIdxs = csrGraph.neighborIdxs; - uint32_t *nodeLevel = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) - uint64_t *visited = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node - uint64_t *currentFrontier = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node - uint64_t *nextFrontier = calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node + uint32_t *nodeLevel = (uint32_t*)calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) + uint64_t *visited = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node + uint64_t *currentFrontier = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node + uint64_t *nextFrontier = (uint64_t*)calloc(numNodes / 64, sizeof(uint64_t)); // Bit vector with one bit per node setBit(nextFrontier[0], 0); // Initialize frontier to first node uint32_t level = 1; @@ -182,19 +186,27 @@ int main(int argc, char **argv) PRINT_INFO(p.verbosity >= 2, " Copying data to DPU"); startTimer(&timer, 2, 
t0ini++); - copyToDPU(dpu, (uint8_t *) dpuNodePtrs_h, dpuNodePtrs_m, - (dpuNumNodes + 1) * sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t *) dpuNeighborIdxs_h, - dpuNeighborIdxs_m, - dpuNumNeighbors * sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t *) dpuNodeLevel_h, - dpuNodeLevel_m, - dpuNumNodes * sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t *) visited, dpuVisited_m, - numNodes / 64 * sizeof(uint64_t)); - copyToDPU(dpu, (uint8_t *) nextFrontier, - dpuNextFrontier_m, - numNodes / 64 * sizeof(uint64_t)); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuNodePtrs_m, (uint8_t *) dpuNodePtrs_h, + ROUND_UP_TO_MULTIPLE_OF_8((dpuNumNodes + 1) * sizeof(uint32_t)))); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuNeighborIdxs_m, (uint8_t *) dpuNeighborIdxs_h, + ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNeighbors * sizeof(uint32_t)))); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuNodeLevel_m, (uint8_t *) dpuNodeLevel_h, + ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNodes * sizeof(uint32_t)))); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuVisited_m, (uint8_t *) visited, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); + + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuNextFrontier_m, (uint8_t *) nextFrontier, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); + // NOTE: No need to copy current frontier because it is written before being read stopTimer(&timer, 2); //loadTime += getElapsedTime(timer); @@ -204,8 +216,9 @@ int main(int argc, char **argv) PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU"); startTimer(&timer, 2, t1ini++); - copyToDPU(dpu, (uint8_t *) & dpuParams[dpuIdx], - dpuParams_m[dpuIdx], sizeof(struct DPUParams)); + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams_m[dpuIdx], (uint8_t *) & dpuParams[dpuIdx], + ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)))); stopTimer(&timer, 2); //loadTime += getElapsedTime(timer); @@ -244,19 +257,15 @@ 
int main(int argc, char **argv) uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; if (dpuNumNodes > 0) { if (dpuIdx == 0) { - copyFromDPU(dpu, - dpuParams[dpuIdx]. - dpuNextFrontier_m, - (uint8_t *) currentFrontier, - numNodes / 64 * - sizeof(uint64_t)); + DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams[dpuIdx].dpuNextFrontier_m, + (uint8_t *) currentFrontier, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); } else { - copyFromDPU(dpu, - dpuParams[dpuIdx]. - dpuNextFrontier_m, - (uint8_t *) nextFrontier, - numNodes / 64 * - sizeof(uint64_t)); + DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams[dpuIdx].dpuNextFrontier_m, + (uint8_t *) nextFrontier, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); for (uint32_t i = 0; i < numNodes / 64; ++i) { currentFrontier[i] |= @@ -283,19 +292,15 @@ int main(int argc, char **argv) dpuParams[dpuIdx].dpuNumNodes; if (dpuNumNodes > 0) { // Copy current frontier to all DPUs (place in next frontier and DPU will update visited and copy to current frontier) - copyToDPU(dpu, - (uint8_t *) currentFrontier, - dpuParams[dpuIdx]. 
- dpuNextFrontier_m, - numNodes / 64 * - sizeof(uint64_t)); + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams[dpuIdx].dpuNextFrontier_m, + (uint8_t *) currentFrontier, + ROUND_UP_TO_MULTIPLE_OF_8(numNodes / 64 * sizeof(uint64_t)))); // Copy new level to DPU dpuParams[dpuIdx].level = level; - copyToDPU(dpu, - (uint8_t *) & - dpuParams[dpuIdx], - dpuParams_m[dpuIdx], - sizeof(struct DPUParams)); + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams_m[dpuIdx], (uint8_t *) &dpuParams[dpuIdx], + ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)))); ++dpuIdx; } } @@ -313,9 +318,10 @@ int main(int argc, char **argv) uint32_t dpuNumNodes = dpuParams[dpuIdx].dpuNumNodes; if (dpuNumNodes > 0) { uint32_t dpuStartNodeIdx = dpuIdx * numNodesPerDPU; - copyFromDPU(dpu, dpuParams[dpuIdx].dpuNodeLevel_m, - (uint8_t *) (nodeLevel + dpuStartNodeIdx), - dpuNumNodes * sizeof(float)); + DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams[dpuIdx].dpuNodeLevel_m, + (uint8_t *) (nodeLevel + dpuStartNodeIdx), + ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNodes * sizeof(float)))); } ++dpuIdx; } @@ -325,7 +331,7 @@ int main(int argc, char **argv) // Calculating result on CPU PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); - uint32_t *nodeLevelReference = calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) + uint32_t *nodeLevelReference = (uint32_t*) calloc(numNodes, sizeof(uint32_t)); // Node's BFS level (initially all 0 meaning not reachable) memset(nextFrontier, 0, numNodes / 64 * sizeof(uint64_t)); setBit(nextFrontier[0], 0); // Initialize frontier to first node nextFrontierEmpty = 0; @@ -395,7 +401,7 @@ int main(int argc, char **argv) #if WITH_FREE_OVERHEAD stopTimer(&timer, 7); #else - timer.time[7] = 0; + zeroTimer(&timer, 7); #endif // Verify the result @@ -412,9 +418,9 @@ int main(int argc, char **argv) } if (status) { - printf + dfatool_printf ("[::] BFS-UMEM | n_dpus=%d n_ranks=%d 
n_tasklets=%d e_type=%s n_elements=%d " - "| throughput_pim_MBps=%f throughput_MBps=%f", numDPUs, + "| throughput_pim_MBps=%f throughput_MBps=%f", numDPUs, numRanks, NR_TASKLETS, "uint32_t", numNodes, numNodes * sizeof(uint32_t) / (timer.time[2] + timer.time[3]), @@ -423,12 +429,12 @@ int main(int argc, char **argv) timer.time[2] + timer.time[3] + timer.time[4])); - printf(" throughput_pim_MOpps=%f throughput_MOpps=%f", + dfatool_printf(" throughput_pim_MOpps=%f throughput_MOpps=%f", numNodes / (timer.time[2] + timer.time[3]), numNodes / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4])); - printf + dfatool_printf (" latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_cpu_us=%f latency_free_us=%f\n", timer.time[0], timer.time[1], timer.time[2], timer.time[3], timer.time[4], timer.time[5], timer.time[6], diff --git a/BFS/host/mram-management.h b/BFS/host/mram-management.h index f2ee031..a953d6a 100644 --- a/BFS/host/mram-management.h +++ b/BFS/host/mram-management.h @@ -1,9 +1,7 @@ +#pragma once -#ifndef _MRAM_MANAGEMENT_H_ -#define _MRAM_MANAGEMENT_H_ - -#include "../support/common.h" -#include "../support/utils.h" +#include "common.h" +#include "utils.h" #define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB @@ -29,21 +27,3 @@ static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator, } return ret; } - -static void copyToDPU(struct dpu_set_t dpu, uint8_t *hostPtr, uint32_t mramIdx, - uint32_t size) -{ - DPU_ASSERT(dpu_copy_to - (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, - ROUND_UP_TO_MULTIPLE_OF_8(size))); -} - -static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, - uint8_t *hostPtr, uint32_t size) -{ - DPU_ASSERT(dpu_copy_from - (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, - ROUND_UP_TO_MULTIPLE_OF_8(size))); -} - -#endif diff --git a/BFS/support/common.h b/BFS/include/common.h index 5f2aa0d..5f2aa0d 100644 --- 
a/BFS/support/common.h +++ b/BFS/include/common.h diff --git a/BFS/include/dfatool_host.ah b/BFS/include/dfatool_host.ah new file mode 100644 index 0000000..b2677e1 --- /dev/null +++ b/BFS/include/dfatool_host.ah @@ -0,0 +1,30 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + + unsigned long input_size; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(uint32_t); + } + + advice call("% input_params(...)"): after() { + printf("[>>] BFS | n_dpus=%u\n", NR_DPUS); + } + + advice call("% coo2csr(...)") : after() { + struct CSRGraph *g = tjp->result(); + input_size = g->numNodes; + printf("[--] BFS | n_dpus=%u n_nodes=%lu\n", NR_DPUS, input_size); + } + + advice execution("% main(...)") : after() { + printf("[<<] BFS | n_dpus=%u n_nodes=%lu\n", NR_DPUS, input_size); + } +}; diff --git a/BFS/support/graph.h b/BFS/include/graph.h index 2a19f67..2a19f67 100644 --- a/BFS/support/graph.h +++ b/BFS/include/graph.h diff --git a/BFS/support/params.h b/BFS/include/params.h index f9169bc..f9169bc 100644 --- a/BFS/support/params.h +++ b/BFS/include/params.h diff --git a/BFS/include/timer.h b/BFS/include/timer.h new file mode 100644 index 0000000..e85490f --- /dev/null +++ b/BFS/include/timer.h @@ -0,0 +1,8 @@ +#pragma once + +#define N_TIMERS 8 +#define startTimer start +#define stopTimer stop +#define zeroTimer zero +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/BFS/support/utils.h b/BFS/include/utils.h index ccd8fbd..ccd8fbd 100644 --- a/BFS/support/utils.h +++ b/BFS/include/utils.h diff --git a/BFS/run-paper-strong-full.sh b/BFS/run-paper-strong-full.sh deleted file mode 100755 index 42806a2..0000000 --- a/BFS/run-paper-strong-full.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks BFS strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git 
describe --always)" - -# >2048 are not part of upstream -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then - # repetition is not part of upstream setup - for i in `seq 1 50`; do - timeout --foreground -k 1m 5m bin/host_code -f data/loc-gowalla_edges.txt || true - done - fi - done -done -) | tee log-paper-strong-full.txt diff --git a/BFS/run-paper-strong-rank.sh b/BFS/run-paper-strong-rank.sh deleted file mode 100755 index e01d18a..0000000 --- a/BFS/run-paper-strong-rank.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks BFS strong-rank (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >64 are not part of upstream -for nr_dpus in 128 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then - # repetition is not part of upstream setup - for i in `seq 1 50`; do - timeout --foreground -k 1m 5m bin/host_code -f data/loc-gowalla_edges.txt || true - done - fi - done -done -) | tee log-paper-strong-rank.txt diff --git a/BFS/run-paper-weak.sh b/BFS/run-paper-weak.sh deleted file mode 100755 index 121758a..0000000 --- a/BFS/run-paper-weak.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks BFS weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# 256 and 512 are not part of upstream -for nr_dpus in 256 512 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then - # repetition is not part of upstream setup - for i in `seq 1 50`; do - # upstream code uses some kind of generated rMat graphs, but does not provide instructions for reproduction - timeout --foreground -k 1m 3m bin/host_code -f data/loc-gowalla_edges.txt || true - done - fi - done -done | -) tee 
log-paper-weak.txt diff --git a/BFS/run.sh b/BFS/run.sh deleted file mode 100755 index 8f5bfb8..0000000 --- a/BFS/run.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -e - -# -f: input file (i.e., input size) -# bin/host_code -f data/loc-gowalla_edges.txt - -# input size depends on file -> strong scaling only - -echo "prim-benchmarks BFS (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 1 2 4 8 16 32 64 128 256 512; do - for nr_tasklets in 1 2 3 4 6 8 10 12 16 20 24; do - for f in loc-gowalla_edges roadNet-CA; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then - for i in `seq 1 20`; do - timeout --foreground -k 1m 30m bin/host_code -f data/${f}.txt || true - done - fi - done - done -done diff --git a/BFS/support/timer.h b/BFS/support/timer.h deleted file mode 100644 index 63b5567..0000000 --- a/BFS/support/timer.h +++ /dev/null @@ -1,31 +0,0 @@ - -#ifndef _TIMER_H_ -#define _TIMER_H_ - -#include <stdio.h> -#include <sys/time.h> - -typedef struct Timer { - struct timeval startTime[8]; - struct timeval stopTime[8]; - double time[8]; -} Timer; - -static void startTimer(Timer *timer, int i, int rep) -{ - if (rep == 0) { - timer->time[i] = 0.0; - } - gettimeofday(&timer->startTime[i], NULL); -} - -static void stopTimer(Timer *timer, int i) -{ - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += - (timer->stopTime[i].tv_sec - - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); -} - -#endif diff --git a/BS/Makefile b/BS/Makefile index f9c3002..f5f0c67 100644 --- a/BS/Makefile +++ b/BS/Makefile @@ -7,17 +7,34 @@ WITH_LOAD_OVERHEAD ?= 0 WITH_FREE_OVERHEAD ?= 0 WITH_DPUINFO ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` 
-DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DINPUT_SIZE=${INPUT_SIZE} -DPROBLEM_SIZE=${PROBLEM_SIZE} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS := -Wall -Wextra -g -Iinclude +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DINPUT_SIZE=${INPUT_SIZE} -DPROBLEM_SIZE=${PROBLEM_SIZE} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . 
--Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ -ifdef verbose +ifeq (${verbose}, 1) QUIET = endif @@ -26,10 +43,12 @@ all: bin/bs_host bin/bs_dpu bin: ${QUIET}mkdir -p bin -bin/bs_host: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +bin/bs_host: ${HOST_SOURCES} include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah -bin/bs_dpu: ${DPU_SOURCES} ${COMMON_INCLUDES} bin +bin/bs_dpu: ${DPU_SOURCES} include bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} clean: diff --git a/BS/baselines/cpu/bs_omp.c b/BS/baselines/cpu/bs_omp.c index 2e4c300..5084c41 100644 --- a/BS/baselines/cpu/bs_omp.c +++ b/BS/baselines/cpu/bs_omp.c @@ -50,7 +50,7 @@ void create_test_file(DTYPE *input, uint64_t nr_elements, DTYPE *querys, } for (uint64_t i = 0; i < n_querys; i++) { - querys[i] = input[rand() % nr_elements]; + querys[i] = input[rand() % (nr_elements - 2)]; } } diff --git a/BS/benchmark-scripts/ccmcc25-sim.sh b/BS/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..05e7f87 --- /dev/null +++ b/BS/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + INPUT_SIZE=${nr_elements} PROBLEM_SIZE=${nr_queries} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/bs_host -w 0 -e 5 2>&1 +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks BS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 nr_elements={nr_elements} nr_queries={nr_queries} \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: nr_elements 
$((2**18)) $((2**19)) $((2**20)) $((2**21)) $((2**22)) \ + ::: nr_queries 512 1024 2048 4096 \ +>> ${fn}.txt diff --git a/BS/benchmark-scripts/ccmcc25.sh b/BS/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..186baf6 --- /dev/null +++ b/BS/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + INPUT_SIZE=${nr_elements} PROBLEM_SIZE=${nr_queries} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/bs_host -w 0 -e 50 2>&1 +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks BS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any nr_elements={nr_elements} nr_queries={nr_queries} \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: nr_elements $((2**20)) $((2**21)) $((2**22)) \ + ::: nr_queries 524288 1048576 2097152 \ + >> ${fn}.txt + +done diff --git a/BS/benchmark-scripts/milos-hbm-cxl.sh b/BS/benchmark-scripts/milos-hbm-cxl.sh new file mode 100755 index 0000000..79d02c7 --- /dev/null +++ b/BS/benchmark-scripts/milos-hbm-cxl.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +cd baselines/cpu +make -B numa=1 + +mkdir -p log/$(hostname) +fn=log/$(hostname)/milos-hbm-cxl + +# * uint64 == 128 MiB +num_queries_hbm=16777216 + +run_benchmark() { + local "$@" + OMP_NUM_THREADS=${nr_threads} ./bs_omp ${input_size} ${num_queries} $ram $cpu 2>&1 + return $? 
+} + +export -f run_benchmark + +( + +echo "single-node execution, HBM ref (1/2)" >&2 + +# 4 GiB +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ + input_size=$(perl -E 'say 2 ** 29') num_queries=${num_queries_hbm} \ + ::: i $(seq 1 5) \ + ::: nr_threads 1 2 4 8 12 16 \ + ::: cpu $(seq 0 7) \ + ::: ram $(seq 0 16) + +echo "multi-node execution, HBM ref (2/2)" >&2 + +# 8 GiB +parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ + run_benchmark i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ + input_size=$(perl -E 'say 2 ** 30') num_queries=${num_queries_hbm} \ + ::: i $(seq 1 40) \ + ::: nr_threads 32 48 64 96 128 \ + ::: cpu -1 \ + ::: ram $(seq 0 16) + +) >> ${fn}.txt diff --git a/BS/host/app.c b/BS/host/app.c index 217ea99..90d016f 100644 --- a/BS/host/app.c +++ b/BS/host/app.c @@ -7,20 +7,28 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> -#include <dpu.h> -#include <dpu_log.h> #include <unistd.h> #include <getopt.h> #include <assert.h> #include <time.h> -#if ENERGY -#include <dpu_probe.h> +#if ASPECTC +extern "C" { #endif +#include <dpu.h> +#include <dpu_log.h> #include <dpu_management.h> #include <dpu_target_macros.h> +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #define XSTR(x) STR(x) #define STR(x) #x @@ -35,14 +43,12 @@ void create_test_file(DTYPE *input, DTYPE *querys, uint64_t nr_elements, uint64_t nr_querys) { - srand(time(NULL)); - input[0] = 1; for (uint64_t i = 1; i < nr_elements; i++) { - input[i] = input[i - 1] + (rand() % 10) + 1; + input[i] = input[i - 1] + 1; } for (uint64_t i = 0; i < nr_querys; i++) { - querys[i] = input[rand() % nr_elements]; + querys[i] = i; } } @@ -96,17 +102,17 @@ int main(int argc, char **argv) // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - timer.time[0] = 0; // alloc + zero(&timer, 0); // alloc #endif #if 
!WITH_LOAD_OVERHEAD DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + zero(&timer, 1); // load #endif #if !WITH_FREE_OVERHEAD - timer.time[6] = 0; // free + zero(&timer, 6); // free #endif #if ENERGY @@ -122,15 +128,15 @@ int main(int argc, char **argv) assert(num_querys % (NR_DPUS * NR_TASKLETS) == 0 && "Input dimension"); // Allocate input and querys vectors - DTYPE *input = malloc((input_size) * sizeof(DTYPE)); - DTYPE *querys = malloc((num_querys) * sizeof(DTYPE)); + DTYPE *input = (DTYPE*)malloc((input_size) * sizeof(DTYPE)); + DTYPE *querys = (DTYPE*)malloc((num_querys) * sizeof(DTYPE)); // Create an input file with arbitrary data create_test_file(input, querys, input_size, num_querys); // Create kernel arguments uint64_t slice_per_dpu = num_querys / NR_DPUS; - dpu_arguments_t input_arguments = { input_size, slice_per_dpu, 0 }; + dpu_arguments_t input_arguments = { input_size, slice_per_dpu, (enum kernel)0 }; for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { // Perform input transfers @@ -322,21 +328,21 @@ int main(int argc, char **argv) printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n"); if (rep >= p.n_warmup) { - printf + dfatool_printf ("[::] BS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu", NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(DTYPE), BLOCK_SIZE, input_size); - printf + dfatool_printf (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf + dfatool_printf ("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", timer.time[0], timer.time[1], timer.time[2], timer.time[3], timer.time[4], timer.time[5], 
timer.time[6]); - printf + dfatool_printf (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", num_querys * sizeof(DTYPE) / timer.time[2], num_querys * sizeof(DTYPE) / @@ -345,7 +351,7 @@ int main(int argc, char **argv) (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); - printf + dfatool_printf (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", num_querys * sizeof(DTYPE) / (timer.time[3] + timer.time[4] + @@ -357,7 +363,7 @@ int main(int argc, char **argv) (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); - printf + dfatool_printf (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", num_querys / timer.time[2], num_querys / (timer.time[4]), @@ -367,7 +373,7 @@ int main(int argc, char **argv) timer.time[4] + timer.time[5] + timer.time[6])); - printf + dfatool_printf (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", num_querys / (timer.time[3] + timer.time[4] + diff --git a/BS/support/common.h b/BS/include/common.h index 54adc39..d0b2865 100755..100644 --- a/BS/support/common.h +++ b/BS/include/common.h @@ -27,13 +27,15 @@ #define INPUT_SIZE 2048576 #endif +enum kernel { + kernel1 = 0, + nr_kernels = 1, +}; + typedef struct { uint64_t input_size; uint64_t slice_per_dpu; - enum kernels { - kernel1 = 0, - nr_kernels = 1, - } kernel; + enum kernel kernel; } dpu_arguments_t; // Structures used by both the host and the dpu to communicate information diff --git a/BS/include/dfatool_host.ah b/BS/include/dfatool_host.ah new file mode 100644 index 0000000..19019a5 --- /dev/null +++ b/BS/include/dfatool_host.ah @@ -0,0 +1,31 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + + unsigned long n_elements, n_queries; + unsigned int element_size; + + 
virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(uint32_t); + } + + advice call("% input_params(...)"): after() { + Params* p = tjp->result(); + n_elements = INPUT_SIZE; + n_queries = p->num_querys; + printf("[>>] BS | n_dpus=%u n_elements=%lu n_queries=%lu\n", NR_DPUS, n_elements, n_queries); + } + + advice call("% binarySearch(...)") : after() { + printf("[--] BS | n_dpus=%u n_elements=%lu n_queries=%lu\n", NR_DPUS, n_elements, n_queries); + } + + advice execution("% main(...)") : after() { + printf("[<<] BS | n_dpus=%u n_elements=%lu n_queries=%lu\n", NR_DPUS, n_elements, n_queries); + } +}; diff --git a/BS/support/params.h b/BS/include/params.h index c91202f..f970eda 100644 --- a/BS/support/params.h +++ b/BS/include/params.h @@ -20,7 +20,7 @@ void usage() "\n -e <E> # of timed repetition iterations (default=3)" "\n" "\nBenchmark-specific options:" - "\n -i <I> problem size (default=2 queries)" "\n"); + "\n -i <I> problem size (default=%d queries)" "\n", PROBLEM_SIZE); } struct Params input_params(int argc, char **argv) diff --git a/BS/include/timer.h b/BS/include/timer.h new file mode 100644 index 0000000..7b80823 --- /dev/null +++ b/BS/include/timer.h @@ -0,0 +1,5 @@ +#pragma once + +#define N_TIMERS 7 +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/BS/run-fgbs24a.sh b/BS/run-fgbs24a.sh deleted file mode 100755 index 06f8766..0000000 --- a/BS/run-fgbs24a.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -set -e - -mkdir -p $(hostname) - -ts=$(date +%Y%m%d) - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks BS (dfatool fgbs24a edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 2304 2048 2543; do - for nr_tasklets in 16; do - echo - if make -B NR_DPUS=${nr_dpus} 
NR_TASKLETS=${nr_tasklets} BL=10; then - timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i 16777216 || true - fi - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1; then - timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i 16777216 || true - fi - done -done -echo "Completed at $(date)" -) | tee "$(hostname)/${ts}-fgbs24a.txt" diff --git a/BS/run-paper-strong-full.sh b/BS/run-paper-strong-full.sh deleted file mode 100755 index a6129aa..0000000 --- a/BS/run-paper-strong-full.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks BS strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >2048 are not part of uptsream -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i 16777216 || true - fi - done -done -) | tee log-paper-strong-full.txt diff --git a/BS/run-paper-strong-rank.sh b/BS/run-paper-strong-rank.sh deleted file mode 100755 index c2d4f36..0000000 --- a/BS/run-paper-strong-rank.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks BS strong-rank (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 
verbose=1; then - timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i 262144 || true - fi - done -done -) | tee log-paper-strong-rank.txt diff --git a/BS/run-paper-weak.sh b/BS/run-paper-weak.sh deleted file mode 100755 index a27c547..0000000 --- a/BS/run-paper-weak.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements -# ... so the weak rank script might be bogus - -( - -echo "prim-benchmarks BS weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - # original Makefile sets PROBLEM_SIZE=2, for some reason. - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1 PROBLEM_SIZE=2; then - i=$(( nr_dpus * 262144 )) - timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i $i || true - fi - done -done -) | tee log-paper-weak.txt diff --git a/BS/run.sh b/BS/run.sh deleted file mode 100755 index 0c67c93..0000000 --- a/BS/run.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks BS (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for i in 262144 16777216; do - for nr_dpus in 1 4 8 16 32 64 128 256 512 768 1024 1536 2048 2304 2542; do - for nr_tasklets in 8 12 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then - timeout --foreground -k 1m 30m bin/bs_host -w 0 -e 100 -i $i || true - fi - done - done -done -echo "Completed at $(date)" -) | tee "log-$(hostname)-ndpus.txt" diff --git a/BS/support/timer.h 
b/BS/support/timer.h deleted file mode 100755 index 256447a..0000000 --- a/BS/support/timer.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2016 University of Cordoba and University of Illinois - * All rights reserved. - * - * Developed by: IMPACT Research Group - * University of Cordoba and University of Illinois - * http://impact.crhc.illinois.edu/ - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * > Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * > Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. - * > Neither the names of IMPACT Research Group, University of Cordoba, - * University of Illinois nor the names of its contributors may be used - * to endorse or promote products derived from this Software without - * specific prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. 
- * - */ - -#include <sys/time.h> -typedef struct Timer { - struct timeval startTime[7]; - struct timeval stopTime[7]; - double time[7]; -} Timer; - -void start(Timer *timer, int i, int rep) -{ - if (rep == 0) { - timer->time[i] = 0.0; - } - gettimeofday(&timer->startTime[i], NULL); -} - -void stop(Timer *timer, int i) -{ - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += - (timer->stopTime[i].tv_sec - - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); -} - -void print(Timer *timer, int i, int REP) -{ - printf("%f\t", timer->time[i] / (1000 * REP)); -} - -void printall(Timer *timer, int maxt) -{ - for (int i = 0; i <= maxt; i++) { - printf(" timer%d_us=%f", i, timer->time[i]); - } - printf("\n"); -} diff --git a/GEMV/Makefile b/GEMV/Makefile index 5f766ae..644278e 100644 --- a/GEMV/Makefile +++ b/GEMV/Makefile @@ -5,16 +5,31 @@ WITH_ALLOC_OVERHEAD ?= 0 WITH_LOAD_OVERHEAD ?= 0 WITH_FREE_OVERHEAD ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) -.PHONY: all clean test +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} +COMMON_FLAGS := -Wall -Wextra -g -Iinclude +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += 
-ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ ifdef verbose @@ -27,7 +42,9 @@ bin: ${QUIET}mkdir -p bin bin/gemv_host: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah bin/gemv_dpu: ${DPU_SOURCES} ${COMMON_INCLUDES} bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} @@ -37,3 +54,5 @@ clean: test: all bin/gemv_host -m 1024 -n 1024 + +.PHONY: all clean test diff --git a/GEMV/benchmark-scripts/ccmcc25-sim.sh b/GEMV/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..3f88fcf --- /dev/null +++ b/GEMV/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/gemv_host -w 0 -e 5 -n ${nr_cols} -m ${nr_rows} 2>&1 +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks GEMV $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 nr_cols={nr_cols} nr_rows={nr_rows} \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: nr_cols 256 512 768 1024 1536 \ + ::: nr_rows 512 68 1024 1536 2048 \ +>> ${fn}.txt diff --git a/GEMV/benchmark-scripts/ccmcc25.sh b/GEMV/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..591a200 --- /dev/null +++ b/GEMV/benchmark-scripts/ccmcc25.sh @@ -0,0 
+1,31 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/gemv_host -w 0 -e 50 -n ${nr_cols} -m ${nr_rows} 2>&1 +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks GEMV $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any nr_cols={nr_cols} nr_rows={nr_rows} \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: nr_cols 2048 4096 8192 \ + ::: nr_rows 40960 81920 163840 \ + >> ${fn}.txt + +done diff --git a/GEMV/dpu/task.c b/GEMV/dpu/task.c index 3bf52e8..120f134 100644 --- a/GEMV/dpu/task.c +++ b/GEMV/dpu/task.c @@ -10,7 +10,7 @@ #include <barrier.h> #include <seqread.h> -#include "../support/common.h" +#include "common.h" #define roundup(n, m) ((n / m) * m + m) diff --git a/GEMV/host/app.c b/GEMV/host/app.c index 6553774..9838eb4 100644 --- a/GEMV/host/app.c +++ b/GEMV/host/app.c @@ -8,25 +8,33 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> -#include <dpu.h> -#include <dpu_log.h> #include <unistd.h> #include <getopt.h> #include <assert.h> -#if ENERGY -#include <dpu_probe.h> +#if ASPECTC +extern "C" { #endif +#include <dpu.h> +#include <dpu_log.h> #include <dpu_management.h> #include <dpu_target_macros.h> +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #define XSTR(x) STR(x) #define STR(x) #x -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" +#include "common.h" +#include "timer.h" +#include "params.h" // Define the DPU Binary path as DPU_BINARY here #ifndef DPU_BINARY 
@@ -38,6 +46,8 @@ static T *B; static T *C; static T *C_dpu; +unsigned int kernel = 0; + // Create input arrays static void init_data(T *A, T *B, unsigned int m_size, unsigned int n_size) { @@ -85,18 +95,24 @@ int main(int argc, char **argv) // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); +#if DFATOOL_TIMING timer.time[0] = 0; // alloc #endif +#endif #if !WITH_LOAD_OVERHEAD DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); assert(nr_of_dpus == NR_DPUS); +#if DFATOOL_TIMING timer.time[1] = 0; // load #endif +#endif #if !WITH_FREE_OVERHEAD +#if DFATOOL_TIMING timer.time[8] = 0; // free #endif +#endif #if ENERGY struct dpu_probe_t probe; @@ -155,10 +171,10 @@ int main(int argc, char **argv) input_args[i].nr_rows = rows_per_dpu; } - A = malloc(max_rows_per_dpu * NR_DPUS * n_size_pad * sizeof(T)); - B = malloc(n_size_pad * sizeof(T)); - C = malloc(max_rows_per_dpu * NR_DPUS * sizeof(T)); - C_dpu = malloc(max_rows_per_dpu * NR_DPUS * sizeof(T)); + A = (T*)malloc(max_rows_per_dpu * NR_DPUS * n_size_pad * sizeof(T)); + B = (T*)malloc(n_size_pad * sizeof(T)); + C = (T*)malloc(max_rows_per_dpu * NR_DPUS * sizeof(T)); + C_dpu = (T*)malloc(max_rows_per_dpu * NR_DPUS * sizeof(T)); // Initialize data with arbitrary data init_data(A, B, m_size, n_size); @@ -347,26 +363,26 @@ int main(int argc, char **argv) printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); if (rep >= p.n_warmup) { - printf + dfatool_printf ("[::] GEMV-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d", NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, n_size * m_size); - printf + dfatool_printf (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf + dfatool_printf 
("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", timer.time[0], timer.time[1], timer.time[2], timer.time[3] + timer.time[6] + timer.time[7], timer.time[4], timer.time[5], timer.time[8]); - printf + dfatool_printf (" latency_write1_us=%f latency_write2_us=%f latency_write3_us=%f", timer.time[3], timer.time[6], timer.time[7] ); - printf + dfatool_printf (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", n_size * m_size * sizeof(T) / timer.time[2], @@ -377,7 +393,7 @@ int main(int argc, char **argv) timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5] + timer.time[8])); - printf + dfatool_printf (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", n_size * m_size * sizeof(T) / (timer.time[3] + timer.time[6] + @@ -392,7 +408,7 @@ int main(int argc, char **argv) timer.time[3] + timer.time[6] + timer.time[7] + timer.time[4] + timer.time[5])); - printf + dfatool_printf (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", n_size * m_size / timer.time[2], n_size * m_size / (timer.time[4]), @@ -404,7 +420,7 @@ int main(int argc, char **argv) timer.time[4] + timer.time[5] + timer.time[8])); - printf + dfatool_printf (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", n_size * m_size / (timer.time[3] + timer.time[6] + diff --git a/GEMV/support/common.h b/GEMV/include/common.h index 47a9628..47a9628 100755..100644 --- a/GEMV/support/common.h +++ b/GEMV/include/common.h diff --git a/GEMV/include/dfatool_host.ah b/GEMV/include/dfatool_host.ah new file mode 100644 index 0000000..84c1dd3 --- /dev/null +++ b/GEMV/include/dfatool_host.ah @@ -0,0 +1,30 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + unsigned int 
n_cols, n_rows; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(T); + } + + advice call("% input_params(...)") : after() { + Params* p = tjp->result(); + n_cols = p->n_size; + n_rows = p->m_size; + printf("[>>] GEMV | n_dpus=%u n_cols=%u n_rows=%u\n", NR_DPUS, n_cols, n_rows); + } + + advice call("% gemv_host(...)") : after() { + printf("[--] GEMV | n_dpus=%u n_cols=%u n_rows=%u\n", NR_DPUS, n_cols, n_rows); + } + + advice execution("% main(...)") : after() { + printf("[<<] GEMV | n_dpus=%u n_cols=%u n_rows=%u\n", NR_DPUS, n_cols, n_rows); + } +}; diff --git a/GEMV/support/params.h b/GEMV/include/params.h index c72b0c1..c72b0c1 100644 --- a/GEMV/support/params.h +++ b/GEMV/include/params.h diff --git a/GEMV/include/timer.h b/GEMV/include/timer.h new file mode 100644 index 0000000..313151d --- /dev/null +++ b/GEMV/include/timer.h @@ -0,0 +1,5 @@ +#pragma once + +#define N_TIMERS 9 +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/GEMV/run-fgbs24a.sh b/GEMV/run-fgbs24a.sh deleted file mode 100755 index 4135623..0000000 --- a/GEMV/run-fgbs24a.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -set -e - -mkdir -p $(hostname) - -ts=$(date +%Y%m%d) - -( - -echo "prim-benchmarks GEMV (dfatool fgbs24a edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 2304 2048 2543; do - for nr_tasklets in 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then - timeout --foreground -k 1m 30m bin/gemv_host -w 0 -e 100 -m 163840 -n 4096 || true - fi - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1; then - timeout --foreground -k 1m 30m bin/gemv_host -w 0 -e 100 -m 163840 -n 4096 || true - fi - done -done -echo "Completed at $(date)" -) | tee "$(hostname)/${ts}-fgbs24a.txt" diff --git a/GEMV/run-paper-strong-full.sh b/GEMV/run-paper-strong-full.sh 
deleted file mode 100755 index 38e6123..0000000 --- a/GEMV/run-paper-strong-full.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks GEMV strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >2048 are not part of upstream -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - timeout --foreground -k 1m 30m bin/gemv_host -w 1 -e 100 -m 163840 -n 4096 || true - fi - done -done -) | tee log-paper-strong-full.txt diff --git a/GEMV/run-paper-strong-rank.sh b/GEMV/run-paper-strong-rank.sh deleted file mode 100755 index 64f0751..0000000 --- a/GEMV/run-paper-strong-rank.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks GEMV strong-rank (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >64 are not part of upstream -for nr_dpus in 128 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - timeout --foreground -k 1m 30m bin/gemv_host -w 1 -e 100 -m 8192 -n 1024 || true - fi - done -done -) | tee log-paper-strong-rank.txt diff --git a/GEMV/run-paper-weak.sh b/GEMV/run-paper-weak.sh deleted file mode 100755 index 0632e71..0000000 --- a/GEMV/run-paper-weak.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks GEMV weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# 256 and 512 are not part of upstream config space -for nr_dpus in 512 256 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - i=$(( nr_dpus * 1024 )) - timeout --foreground -k 1m 30m bin/gemv_host -w 1 -e 100 -m $i -n 2048 || true - fi - done -done -) | tee log-paper-weak.txt diff --git 
a/GEMV/run.sh b/GEMV/run.sh deleted file mode 100755 index 68637dc..0000000 --- a/GEMV/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -m: number of rows -# -n: number of cols - -( - -echo "prim-benchmarks GEMV (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# run-paper-strong-full: m=163840 n=4096 -# run-paper-strong-rank: m=8192 n=1024 -# run-paper-weak: m=ndpus*1024 n=2048 -for n in 512 1024 2048 4096; do - for m in 512 1024 2048 4096 8192 163840; do - for nr_dpus in 1 4 8 16 32 64 128 256 512 768 1024 1536 2048; do - for nr_tasklets in 8 12 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then - timeout --foreground -k 1m 30m bin/gemv_host -w 0 -e 100 -m $m -n $n || true - fi - done - done - done -done -) | tee "log-$(hostname)-ndpus.txt" diff --git a/GEMV/support/timer.h b/GEMV/support/timer.h deleted file mode 100755 index b2b9148..0000000 --- a/GEMV/support/timer.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2016 University of Cordoba and University of Illinois - * All rights reserved. - * - * Developed by: IMPACT Research Group - * University of Cordoba and University of Illinois - * http://impact.crhc.illinois.edu/ - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * > Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. 
- * > Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. - * > Neither the names of IMPACT Research Group, University of Cordoba, - * University of Illinois nor the names of its contributors may be used - * to endorse or promote products derived from this Software without - * specific prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - */ - -#include <sys/time.h> -typedef struct Timer { - struct timeval startTime[9]; - struct timeval stopTime[9]; - double time[9]; -} Timer; - -void start(Timer *timer, int i, int rep) -{ - if (rep == 0) { - timer->time[i] = 0.0; - } - gettimeofday(&timer->startTime[i], NULL); -} - -void stop(Timer *timer, int i) -{ - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += - (timer->stopTime[i].tv_sec - - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); - - //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 + - // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000); -} - -void print(Timer *timer, int i, int REP) -{ - printf("%f\t", timer->time[i] / (1000 * REP)); -} - -void printall(Timer *timer, int maxt) -{ - for (int i = 0; i <= maxt; i++) { - printf(" timer%d_us=%f", i, timer->time[i]); - } - printf("\n"); -} diff --git a/HST-L/Makefile b/HST-L/Makefile index 1888b0a..45ba86c 
100644 --- a/HST-L/Makefile +++ b/HST-L/Makefile @@ -3,15 +3,35 @@ NR_TASKLETS ?= 16 NR_HISTO ?= 1 BL ?= 10 ENERGY ?= 0 +WITH_ALLOC_OVERHEAD ?= 0 +WITH_LOAD_OVERHEAD ?= 0 +WITH_FREE_OVERHEAD ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DNR_HISTO=${NR_HISTO} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY} +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DNR_HISTO=${NR_HISTO} +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing} DPU_FLAGS := ${COMMON_FLAGS} -O2 +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . 
--Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ ifdef verbose @@ -23,10 +43,12 @@ all: bin/host_code bin/dpu_code bin: ${QUIET}mkdir -p bin -bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +bin/host_code: ${HOST_SOURCES} include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah -bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin +bin/dpu_code: ${DPU_SOURCES} include bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} clean: diff --git a/HST-L/benchmark-scripts/ccmcc25-sim.sh b/HST-L/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..34e841a --- /dev/null +++ b/HST-L/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + dfatool_timing=0 aspectc=1 aspectc_timing=1 + bin/host_code -w 0 -e 5 -b ${bin_size} -i ${input_size} +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks HST-L $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} bin_size=256 \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: input_size $((256 * 256)) $((512 * 512)) $((768 * 768)) $((1024 * 1024)) \ +>> ${fn}.txt diff --git a/HST-L/benchmark-scripts/ccmcc25.sh b/HST-L/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..1c939f1 --- /dev/null +++ b/HST-L/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B 
NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + dfatool_timing=0 aspectc=1 aspectc_timing=1 + bin/host_code -w 0 -e 50 -b ${bin_size} -i ${input_size} +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks HST-L $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} bin_size=256 numa_rank={numa_rank} \ + ::: numa_rank any \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: input_size $((1024 * 1024)) $((1536 * 1024)) $((2048 * 1024)) \ + >> ${fn}.txt + +done diff --git a/HST-L/dpu/task.c b/HST-L/dpu/task.c index 356b2f9..26021bd 100644 --- a/HST-L/dpu/task.c +++ b/HST-L/dpu/task.c @@ -12,7 +12,7 @@ #include <atomic_bit.h> #include <mutex.h> -#include "../support/common.h" +#include "common.h" __host dpu_arguments_t DPU_INPUT_ARGUMENTS; diff --git a/HST-L/host/app.c b/HST-L/host/app.c index b9c07f9..ac7381b 100644 --- a/HST-L/host/app.c +++ b/HST-L/host/app.c @@ -8,15 +8,29 @@ #include <stdbool.h> #include <string.h> #include <math.h> + +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> + +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #include <unistd.h> #include <getopt.h> #include <assert.h> -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" +#include "common.h" +#include "timer.h" +#include "params.h" // Define the DPU Binary path as DPU_BINARY here #ifndef DPU_BINARY @@ -26,10 +40,6 @@ #define XSTR(x) STR(x) #define STR(x) #x -#if ENERGY -#include <dpu_probe.h> -#endif - // Pointer declaration static T* A; static unsigned int* histo_host; @@ -89,17 +99,29 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_probe_init("energy_probe", 
&probe)); #endif + // Timer declaration + Timer timer; + // Allocate DPUs and load binary +#if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + zero(&timer, 0); // alloc +#endif +#if !WITH_LOAD_OVERHEAD DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); - printf("Allocated %d DPU(s)\n", nr_of_dpus); + assert(nr_of_dpus == NR_DPUS); + zero(&timer, 1); // load +#endif +#if !WITH_FREE_OVERHEAD + zero(&timer, 6); // free +#endif unsigned int i = 0; unsigned int input_size; // Size of input image unsigned int dpu_s = p.dpu_s; if(p.exp == 0) - input_size = p.input_size * nr_of_dpus; // Size of input image + input_size = p.input_size * NR_DPUS; // Size of input image else if(p.exp == 1) input_size = p.input_size; // Size of input image else @@ -107,20 +129,20 @@ int main(int argc, char **argv) { const unsigned int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned - const unsigned int input_size_dpu = divceil(input_size, nr_of_dpus); // Input size per DPU (max.) + const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.) const unsigned int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ?
roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned // Input/output allocation - A = malloc(input_size_dpu_8bytes * nr_of_dpus * sizeof(T)); + A = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); T *bufferA = A; - histo_host = malloc(p.bins * sizeof(unsigned int)); - histo = malloc(nr_of_dpus * p.bins * sizeof(unsigned int)); + histo_host = (unsigned int*)malloc(p.bins * sizeof(unsigned int)); + histo = (unsigned int*)malloc(NR_DPUS * p.bins * sizeof(unsigned int)); // Create an input file with arbitrary data read_input(A, p); if(p.exp == 0){ - for(unsigned int j = 1; j < nr_of_dpus; j++){ + for(unsigned int j = 1; j < NR_DPUS; j++){ memcpy(&A[j * input_size_dpu_8bytes], &A[0], input_size_dpu_8bytes * sizeof(T)); } } @@ -129,40 +151,59 @@ int main(int argc, char **argv) { memcpy(&A[j * p.input_size], &A[0], p.input_size * sizeof(T)); } - // Timer declaration - Timer timer; - - printf("NR_TASKLETS\t%d\tBL\t%d\tinput_size\t%u\n", NR_TASKLETS, BL, input_size); - // Loop over main kernel for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { - memset(histo_host, 0, p.bins * sizeof(unsigned int)); - memset(histo, 0, nr_of_dpus * p.bins * sizeof(unsigned int)); - // Compute output on CPU (performance comparison and verification purposes) - if(rep >= p.n_warmup) +#if WITH_ALLOC_OVERHEAD + if(rep >= p.n_warmup) { start(&timer, 0, 0); - histogram_host(histo_host, A, p.bins, p.input_size, 1, nr_of_dpus); - if(rep >= p.n_warmup) + } + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); + if(rep >= p.n_warmup) { stop(&timer, 0); - - printf("Load input data\n"); - if(rep >= p.n_warmup) + } +#endif +#if WITH_LOAD_OVERHEAD + if(rep >= p.n_warmup) { start(&timer, 1, 0); + } + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); + if(rep >= p.n_warmup) { + stop(&timer, 1); + } + DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + assert(nr_of_dpus == NR_DPUS); +#endif + + memset(histo_host, 0, p.bins * sizeof(unsigned int)); + memset(histo, 0, 
NR_DPUS * p.bins * sizeof(unsigned int)); + + // Compute output on CPU (performance comparison and verification purposes) + if(rep >= p.n_warmup) { + start(&timer, 2, 0); + } + histogram_host(histo_host, A, p.bins, p.input_size, 1, NR_DPUS); + if(rep >= p.n_warmup) { + stop(&timer, 2); + } + + if(rep >= p.n_warmup) { + start(&timer, 3, 0); + } // Input arguments unsigned int kernel = 0; i = 0; dpu_arguments_t input_arguments[NR_DPUS]; - for(i=0; i<nr_of_dpus-1; i++) { + for(i=0; i<NR_DPUS-1; i++) { input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T); input_arguments[i].bins=p.bins; - input_arguments[i].kernel=kernel; + input_arguments[i].kernel = (enum kernels)kernel; } - input_arguments[nr_of_dpus-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); - input_arguments[nr_of_dpus-1].transfer_size=input_size_dpu_8bytes * sizeof(T); - input_arguments[nr_of_dpus-1].bins=p.bins; - input_arguments[nr_of_dpus-1].kernel=kernel; + input_arguments[NR_DPUS-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); + input_arguments[NR_DPUS-1].transfer_size=input_size_dpu_8bytes * sizeof(T); + input_arguments[NR_DPUS-1].bins=p.bins; + input_arguments[NR_DPUS-1].kernel = (enum kernels)kernel; // Copy input arrays i = 0; @@ -174,13 +215,13 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i)); } DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT)); - if(rep >= p.n_warmup) - stop(&timer, 1); + if(rep >= p.n_warmup) { + stop(&timer, 3); + } - printf("Run program on DPU(s) \n"); // Run DPU kernel if(rep >= p.n_warmup) { - start(&timer, 2, 0); + start(&timer, 4, 0); #if ENERGY DPU_ASSERT(dpu_probe_start(&probe)); #endif @@ -188,7 +229,7 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS)); if(rep 
>= p.n_warmup) { - stop(&timer, 2); + stop(&timer, 4); #if ENERGY DPU_ASSERT(dpu_probe_stop(&probe)); #endif @@ -206,10 +247,10 @@ int main(int argc, char **argv) { } #endif - printf("Retrieve results\n"); i = 0; - if(rep >= p.n_warmup) - start(&timer, 3, 0); + if(rep >= p.n_warmup) { + start(&timer, 5, 0); + } // PARALLEL RETRIEVE TRANSFER DPU_FOREACH(dpu_set, dpu, i) { DPU_ASSERT(dpu_prepare_xfer(dpu, histo + p.bins * i)); @@ -217,40 +258,60 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), p.bins * sizeof(unsigned int), DPU_XFER_DEFAULT)); // Final histogram merging - for(i = 1; i < nr_of_dpus; i++){ + for(i = 1; i < NR_DPUS; i++){ for(unsigned int j = 0; j < p.bins; j++){ histo[j] += histo[j + i * p.bins]; } } - if(rep >= p.n_warmup) - stop(&timer, 3); + if(rep >= p.n_warmup) { + stop(&timer, 5); + } + +#if WITH_ALLOC_OVERHEAD +#if WITH_FREE_OVERHEAD + if(rep >= p.n_warmup) { + start(&timer, 6, 0); + } +#endif + DPU_ASSERT(dpu_free(dpu_set)); +#if WITH_FREE_OVERHEAD + if(rep >= p.n_warmup) { + stop(&timer, 6); + } +#endif +#endif if (rep >= p.n_warmup) { - printf("[::] HST-L NMC | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%u n_bins=%d " - "| throughput_cpu_MBps=%f throughput_pim_MBps=%f throughput_MBps=%f", - nr_of_dpus, NR_TASKLETS, XSTR(T), input_size, p.bins, - input_size * sizeof(T) / timer.time[0], + dfatool_printf("[::] HST-L UPMEM | n_dpus=%d n_tasklets=%d e_type=%s n_elements=%d n_bins=%d ", + nr_of_dpus, NR_TASKLETS, XSTR(T), input_size, p.bins); + dfatool_printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", + timer.time[0], + timer.time[1], + timer.time[2], + timer.time[3], + timer.time[4], + timer.time[5], + timer.time[6]); + dfatool_printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", input_size * sizeof(T) / 
timer.time[2], - input_size * sizeof(T) / (timer.time[1] + timer.time[2] + timer.time[3])); - printf(" throughput_cpu_MOpps=%f throughput_pim_MOpps=%f throughput_MOpps=%f", - input_size / timer.time[0], + input_size * sizeof(T) / (timer.time[4]), + input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); + dfatool_printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", + input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]), + input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), + input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); + dfatool_printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", input_size / timer.time[2], - input_size / (timer.time[1] + timer.time[2] + timer.time[3])); - printall(&timer, 3); + input_size / (timer.time[4]), + input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); + dfatool_printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", + input_size / (timer.time[3] + timer.time[4] + timer.time[5]), + input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), + input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); } } - // Print timing results - printf("CPU "); - print(&timer, 0, p.n_reps); - printf("CPU-DPU "); - print(&timer, 1, p.n_reps); - printf("DPU Kernel "); - print(&timer, 2, p.n_reps); - printf("DPU-CPU "); - print(&timer, 3, p.n_reps); - #if ENERGY double energy; DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy)); @@ -279,10 +340,10 @@ int main(int argc, char **argv) { } else for (unsigned int j = 0; j < p.bins; j++) { - if(nr_of_dpus * histo_host[j] != histo[j]){ + if(NR_DPUS * 
histo_host[j] != histo[j]){ status = false; #if PRINT - printf("%u - %u: %u -- %u\n", j, j, nr_of_dpus * histo_host[j], histo[j]); + printf("%u - %u: %u -- %u\n", j, j, NR_DPUS * histo_host[j], histo[j]); #endif } } @@ -296,7 +357,10 @@ int main(int argc, char **argv) { free(A); free(histo_host); free(histo); + +#if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_free(dpu_set)); +#endif return status ? 0 : -1; } diff --git a/HST-L/support/common.h b/HST-L/include/common.h index 30df40d..438825e 100755..100644 --- a/HST-L/support/common.h +++ b/HST-L/include/common.h @@ -20,15 +20,17 @@ #define DEPTH 12 #define ByteSwap16(n) (((((unsigned int)n) << 8) & 0xFF00) | ((((unsigned int)n) >> 8) & 0x00FF)) +enum kernels { + kernel1 = 0, + nr_kernels = 1, +} kernel; + // Structures used by both the host and the dpu to communicate information typedef struct { uint32_t size; uint32_t transfer_size; uint32_t bins; - enum kernels { - kernel1 = 0, - nr_kernels = 1, - } kernel; + enum kernels kernel; } dpu_arguments_t; #ifndef ENERGY diff --git a/HST-L/include/dfatool_host.ah b/HST-L/include/dfatool_host.ah new file mode 100644 index 0000000..db4e441 --- /dev/null +++ b/HST-L/include/dfatool_host.ah @@ -0,0 +1,31 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + unsigned long n_pixels; + unsigned int n_bins; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(T); + } + + advice call("% input_params(...)") : after() { + Params* p = tjp->result(); + n_pixels = p->input_size; + n_bins = p->bins; + printf("[>>] HST-L | n_dpus=%u n_pixels=%lu n_bins=%u\n", NR_DPUS, n_pixels, n_bins); + } + + advice call("% histogram_host(...)") : after() { + printf("[--] HST-L | n_dpus=%u n_pixels=%lu n_bins=%u\n", NR_DPUS, n_pixels, n_bins); + } + + advice execution("% main(...)") : after() { + printf("[<<] HST-L | n_dpus=%u n_pixels=%lu n_bins=%u\n", NR_DPUS, 
n_pixels, n_bins); + } +}; diff --git a/HST-L/support/params.h b/HST-L/include/params.h index e29449b..d0c3129 100644 --- a/HST-L/support/params.h +++ b/HST-L/include/params.h @@ -21,7 +21,7 @@ static void usage() { "\n -h help" "\n -w <W> # of untimed warmup iterations (default=1)" "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1, 2) scaling (default=0)" + "\n -x <X> Weak (0) or strong (1, 2) scaling (default=1)" "\n" "\nBenchmark-specific options:" "\n -i <I> input size (default=1536*1024 elements)" @@ -36,7 +36,7 @@ struct Params input_params(int argc, char **argv) { p.bins = 256; p.n_warmup = 1; p.n_reps = 3; - p.exp = 0; + p.exp = 1; p.file_name = "./input/image_VanHateren.iml"; p.dpu_s = 64; diff --git a/HST-L/include/timer.h b/HST-L/include/timer.h new file mode 100644 index 0000000..7b80823 --- /dev/null +++ b/HST-L/include/timer.h @@ -0,0 +1,5 @@ +#pragma once + +#define N_TIMERS 7 +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/HST-L/run-paper-strong-full.sh b/HST-L/run-paper-strong-full.sh deleted file mode 100755 index 0108d40..0000000 --- a/HST-L/run-paper-strong-full.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks HST-S strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >2048 are not part of upstream -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -b 256 -x 2 || true - fi - done -done -) | tee log-paper-strong-full.txt diff --git a/HST-L/run-paper-strong-rank.sh b/HST-L/run-paper-strong-rank.sh deleted file mode 100755 index f2f80b1..0000000 --- a/HST-L/run-paper-strong-rank.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks HST-S strong-rank (dfatool edition)" -echo "Started at 
$(date)" -echo "Revision $(git describe --always)" - -# >64 are not part of upstream config space -for nr_dpus in 128 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -b 256 -x 1 || true - fi - done -done -) | tee log-paper-strong-rank.txt diff --git a/HST-L/run-paper-weak.sh b/HST-L/run-paper-weak.sh deleted file mode 100755 index 3ddd801..0000000 --- a/HST-L/run-paper-weak.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks HST-S weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# upstream does not include >64 -for nr_dpus in 256 512 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -b 256 -x 0 || true - fi - done -done -) | tee log-paper-weak.txt diff --git a/HST-L/run.sh b/HST-L/run.sh deleted file mode 100755 index d2a072f..0000000 --- a/HST-L/run.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -for i in 1 -do - for b in 64 128 256 512 1024 2048 4096 - do - for k in 1 2 4 8 16 - do - NR_DPUS=$i NR_TASKLETS=$k BL=10 make all - wait - ./bin/host_code -w 2 -e 5 -b ${b} > profile/HSTL_${b}_tl${k}_dpu${i}.txt - wait - make clean - wait - done - done -done diff --git a/HST-L/support/timer.h b/HST-L/support/timer.h deleted file mode 100755 index 5c00213..0000000 --- a/HST-L/support/timer.h +++ /dev/null @@ -1,66 +0,0 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[4];
- struct timeval stopTime[4];
- double time[4];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
diff --git a/MLP/Makefile b/MLP/Makefile index 944b3ca..1ce804d 100644 --- a/MLP/Makefile +++ b/MLP/Makefile @@ -1,44 +1,54 @@ -DPU_DIR := dpu -HOST_DIR := host -BUILDDIR ?= bin -NR_TASKLETS ?= 16 +NR_DPUS ?= 1 +NR_TASKLETS ?= 16 BL ?= 10 -NR_DPUS ?= 1 -define conf_filename - ${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3).conf -endef -CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL}) +HOST_SOURCES := $(wildcard host/*.c) +DPU_SOURCES := $(wildcard dpu/*.c) -HOST_TARGET := ${BUILDDIR}/mlp_host -DPU_TARGET := ${BUILDDIR}/mlp_dpu +aspectc ?= 0 +aspectc_timing ?= 0 -COMMON_INCLUDES := support -HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c) -DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c) +HOST_CC := ${CC} -.PHONY: all clean test +COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DASPECTC=${aspectc} +DPU_FLAGS := ${COMMON_FLAGS} -O2 + +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 -__dirs := $(shell mkdir -p ${BUILDDIR}) +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . 
--Xcompiler +else + HOST_FLAGS += -std=c11 +endif -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} +QUIET = @ -all: ${HOST_TARGET} ${DPU_TARGET} +ifdef verbose + QUIET = +endif -${CONF}: - $(RM) $(call conf_filename,*,*) - touch ${CONF} +all: bin/mlp_dpu bin/mlp_host -${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF} - $(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +bin: + ${QUIET}mkdir -p bin -${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF} - dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} +bin/mlp_host: ${HOST_SOURCES} include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah + +bin/mlp_dpu: ${DPU_SOURCES} include bin + ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} clean: - $(RM) -r $(BUILDDIR) + ${QUIET}$(RM) -r $(BUILDDIR) test: all - ./${HOST_TARGET} -m 1024 -n 1024 + bin/mlp_host -m 1024 -n 1024 + +.PHONY: all clean test diff --git a/MLP/benchmark-scripts/ccmcc25-sim.sh b/MLP/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..3abe82e --- /dev/null +++ b/MLP/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/mlp_host -w 0 -e 50 -m ${nr_rows} -n ${nr_cols} +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks MLP $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + 
run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 nr_cols={nr_cols} nr_rows={nr_rows} \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: nr_cols 1024 2048 3072 4096 \ + ::: nr_rows 512 768 1024 2048 \ +>> ${fn}.txt diff --git a/MLP/benchmark-scripts/ccmcc25.sh b/MLP/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..02063b9 --- /dev/null +++ b/MLP/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/mlp_host -w 0 -e 50 -m ${nr_rows} -n ${nr_cols} +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks MLP $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} nr_cols={nr_cols} nr_rows={nr_rows} \ + ::: numa_rank any \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: nr_cols 4096 8192 16384 \ + ::: nr_rows 1024 2048 4096 \ + >> ${fn}.txt + +done diff --git a/MLP/dpu/task.c b/MLP/dpu/task.c index 4f85024..ae400ae 100644 --- a/MLP/dpu/task.c +++ b/MLP/dpu/task.c @@ -10,7 +10,7 @@ #include <barrier.h> #include <seqread.h> -#include "../support/common.h" +#include "common.h" __host dpu_arguments_t DPU_INPUT_ARGUMENTS; diff --git a/MLP/host/app.c b/MLP/host/app.c index 24243bf..9c32ab8 100644 --- a/MLP/host/app.c +++ b/MLP/host/app.c @@ -8,19 +8,28 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> -#include <dpu.h> -#include <dpu_log.h> #include <unistd.h> #include <getopt.h> #include <assert.h> +#if ASPECTC +extern "C" { +#endif + +#include <dpu.h> +#include <dpu_log.h> + #if ENERGY 
#include <dpu_probe.h> #endif -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" +#if ASPECTC +} +#endif + +#include "common.h" +#include "timer.h" +#include "params.h" // Define the DPU Binary path as DPU_BINARY here #ifndef DPU_BINARY @@ -159,8 +168,8 @@ int main(int argc, char **argv) B = (T *) malloc(n_size * sizeof(T)); B_host = (T *) malloc(n_size * sizeof(T)); C = (T *) malloc(m_size * sizeof(T)); - C_dpu = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T)); - B_tmp = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T)); + C_dpu = (T*)malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T)); + B_tmp = (T*)malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T)); init_data(A, B, B_host, m_size, n_size); @@ -331,22 +340,9 @@ int main(int argc, char **argv) DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time)); #endif - // Print timing results - printf("CPU Version Time (ms): "); - print(&timer, 0, 1); - printf("CPU-DPU Time (ms): "); - print(&timer, 1, p.n_reps); - printf("DPU Kernel Time (ms): "); - print(&timer, 2, p.n_reps); - printf("Inter-DPU Time (ms): "); - print(&timer, 4, p.n_reps); - printf("DPU-CPU Time (ms): "); - print(&timer, 3, p.n_reps); - #if ENERGY printf("Energy (J): %f J\t", avg_energy); #endif - printf("\n\n"); // Check output bool status = true; diff --git a/MLP/support/common.h b/MLP/include/common.h index 4b5031b..4b5031b 100755..100644 --- a/MLP/support/common.h +++ b/MLP/include/common.h diff --git a/MLP/include/dfatool_host.ah b/MLP/include/dfatool_host.ah new file mode 100644 index 0000000..6ea4a18 --- /dev/null +++ b/MLP/include/dfatool_host.ah @@ -0,0 +1,33 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + + unsigned int n_rows, n_cols; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(uint32_t); + } + + advice call("% 
input_params(...)"): after() { + Params* p = tjp->result(); + n_rows = p->m_size; + n_cols = p->n_size; + printf("[>>] MLP | n_dpus=%u n_rows=%u n_cols=%u\n", NR_DPUS, n_rows, n_cols); + } + + advice call("% start(...)") : after() { + if (*(tjp->arg<1>()) == 1) { + printf("[--] MLP | n_dpus=%u n_rows=%u n_cols=%u\n", NR_DPUS, n_rows, n_cols); + } + } + + advice execution("% main(...)") : after() { + printf("[<<] MLP | n_dpus=%u n_rows=%u n_cols=%u\n", NR_DPUS, n_rows, n_cols); + } +}; diff --git a/MLP/support/params.h b/MLP/include/params.h index 4bfc2fc..4bfc2fc 100644 --- a/MLP/support/params.h +++ b/MLP/include/params.h diff --git a/MLP/include/timer.h b/MLP/include/timer.h new file mode 100644 index 0000000..bff638d --- /dev/null +++ b/MLP/include/timer.h @@ -0,0 +1,5 @@ +#pragma once + +#define N_TIMERS 5 +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/MLP/support/timer.h b/MLP/support/timer.h deleted file mode 100755 index 961ed11..0000000 --- a/MLP/support/timer.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2016 University of Cordoba and University of Illinois - * All rights reserved. - * - * Developed by: IMPACT Research Group - * University of Cordoba and University of Illinois - * http://impact.crhc.illinois.edu/ - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * > Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. 
- * > Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. - * > Neither the names of IMPACT Research Group, University of Cordoba, - * University of Illinois nor the names of its contributors may be used - * to endorse or promote products derived from this Software without - * specific prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - */ - -#include <sys/time.h> - -typedef struct Timer { - - struct timeval startTime[5]; - struct timeval stopTime[5]; - double time[5]; - -} Timer; - -void start(Timer *timer, int i, int rep) -{ - if (rep == 0) { - timer->time[i] = 0.0; - } - gettimeofday(&timer->startTime[i], NULL); -} - -void stop(Timer *timer, int i) -{ - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += - (timer->stopTime[i].tv_sec - - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); - //printf("Time (ms): %f\t",((timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 + - // (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec)) / 1000); - -} - -void print(Timer *timer, int i, int REP) -{ - printf("%f\t", timer->time[i] / (1000 * REP)); -} diff --git a/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh b/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh new file mode 100755 index 0000000..ee5ee99 --- /dev/null +++ 
b/Microbenchmarks/CPU-DPU/benchmark-scripts/upvec-transformation.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +./make-size.sh 0 + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1 + bin/host_code -a $numa_in -b $numa_out -c $numa_cpu -w 0 -e 20 -x 0 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i ${input_size} + return $? +} + +export -f run_benchmark_nmc + +# The benchmark allocates 3 * 64 * nr_ranks * 8B * input_size (one array for input, one array for output). +# With 1048576 elements (8 MiB per DPU), this gives a maximum allocation of 60 GiB, which will fit comfortably into system memory (128 GiB). +# With 2097152 elements (16 MiB per DPU), we may encounter OoM conditions, since the UPMEM SDK also allocates some memory. + +for sdk in 2025.1.0-orig 2025.1.0-notransform; do + source /opt/upmem/transformation-benchmarks/${sdk}/upmem_env.sh + fn=log/$(hostname)/upvec-${sdk} + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \ + ::: numa_rank any \ + ::: numa_in 1 \ + ::: numa_out 1 \ + ::: numa_cpu 1 \ + ::: nr_ranks $(seq 1 40) \ + ::: input_size 1048576 \ + >> ${fn}.txt +done diff --git a/Microbenchmarks/CPU-DPU/splc25-alloc.sh b/Microbenchmarks/CPU-DPU/splc25-alloc.sh new file mode 100755 index 0000000..6f4f055 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/splc25-alloc.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + ./make-size.sh ${size} + n_nops=$((size * 256)) + if make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 DPU_BINARY=\'\"bin/dpu_size\"\' NUMA=1; then + for l in $(seq 1 20); do + bin/host_code -c ${numa_cpu} -w 1 -e 0 -x 1 -i 65536 -N $n_nops 
-I $(size -A bin/dpu_size | awk '($1 == ".text") {print $2/8}') + done + fi + return $? +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + fn=log/$(hostname)/splc25-alloc-${sdk} + + parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_cpu={numa_cpu} size={size} \ + ::: i $(seq 1 5) \ + ::: numa_rank -1 \ + ::: numa_cpu 0 1 \ + ::: nr_ranks $(seq 1 40) \ + ::: size $(seq 0 15) \ + >> ${fn}.txt + +done diff --git a/Microbenchmarks/CPU-DPU/splc25-transfer.sh b/Microbenchmarks/CPU-DPU/splc25-transfer.sh new file mode 100755 index 0000000..0227cab --- /dev/null +++ b/Microbenchmarks/CPU-DPU/splc25-transfer.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +./make-size.sh 0 + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_RANKS=${nr_ranks} NR_TASKLETS=1 BL=10 TRANSFER=PUSH NUMA=1 + bin/host_code -a $numa_in -b $numa_out -c $numa_cpu -w 0 -e 5 -x 0 -N 0 -I $(size -A bin/dpu_code | awk '($1 == ".text") {print $2/8}') -i ${input_size} + return $? +} + +export -f run_benchmark_nmc + +# The benchmark allocates 3 * 64 * nr_ranks * 8B * input_size (one array for input, one array for output). +# With 1048576 elements (8 MiB per DPU), this gives a maximum allocation of 60 GiB, which will fit comfortably into system memory (128 GiB). +# With 2097152 elements (16 MiB per DPU), we may encounter OoM conditions, since the UPMEM SDK also allocates some memory. 
+ +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + fn=log/$(hostname)/splc25-transfer-${sdk} + + parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark_nmc nr_ranks={nr_ranks} numa_rank={numa_rank} numa_in={numa_in} numa_out={numa_out} numa_cpu={numa_cpu} input_size={input_size} \ + ::: i $(seq 1 10) \ + ::: numa_rank -1 \ + ::: numa_in 1 \ + ::: numa_out 1 \ + ::: numa_cpu 1 \ + ::: nr_ranks $(seq 1 40) \ + ::: input_size 1 1048576 \ + >> ${fn}.txt +done diff --git a/Microbenchmarks/CPU-DPU/util/upvec-read.pdf b/Microbenchmarks/CPU-DPU/util/upvec-read.pdf Binary files differnew file mode 100644 index 0000000..63af6cc --- /dev/null +++ b/Microbenchmarks/CPU-DPU/util/upvec-read.pdf diff --git a/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance b/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance new file mode 100755 index 0000000..b175b8d --- /dev/null +++ b/Microbenchmarks/CPU-DPU/util/upvec-transformation-relevance @@ -0,0 +1,41 @@ +#!/bin/sh + +data=$(mktemp -d) + +echo +echo SDK with transformation +echo + +DFATOOL_ULS_MIN_BOUND=0 DFATOOL_PARAM_RELEVANCE_THRESHOLD=0.9 DFATOOL_ULS_FUNCTIONS=roofline,linear \ +analyze-log.py \ +--filter-param='n_elements_per_dpu=1048576' \ +--ignore-param='n_dpus,n_nops,numa_node_cpu,numa_node_in,numa_node_out,numa_node_rank,n_elements' \ +--filter-observation='NMC-transfer:throughput_dram_mram_Bps,NMC-transfer:throughput_mram_dram_Bps' \ +--normalize-nfp='writeThroughputGBps=throughput_dram_mram_Bps=/1e9;readThroughputGBps=throughput_mram_dram_Bps=/1e9' \ +--export-pgf-unparam ${data}/orig- \ +--cross-validate=kfold:10 --progress \ +--show-model=param --show-model-error --show-model-precision=6 \ +log/tinos/upvec-2025.1.0-orig.txt + +echo +echo SDK without transformation +echo + +DFATOOL_ULS_MIN_BOUND=0 DFATOOL_PARAM_RELEVANCE_THRESHOLD=0.9 DFATOOL_ULS_FUNCTIONS=roofline,linear \ +analyze-log.py \ 
+--filter-param='n_elements_per_dpu=1048576' \ +--ignore-param='n_dpus,n_nops,numa_node_cpu,numa_node_in,numa_node_out,numa_node_rank,n_elements' \ +--filter-observation='NMC-transfer:throughput_dram_mram_Bps,NMC-transfer:throughput_mram_dram_Bps' \ +--normalize-nfp='writeThroughputGBps=throughput_dram_mram_Bps=/1e9;readThroughputGBps=throughput_mram_dram_Bps=/1e9' \ +--export-pgf-unparam ${data}/notransform- \ +--cross-validate=kfold:10 --progress \ +--show-model=param --show-model-error --show-model-precision=6 \ +log/tinos/upvec-2025.1.0-notransform.txt + +for op in read write; do + cp util/upvec-${op}.tex ${data} + lualatex -output-directory ${data} ${data}/upvec-${op} + cp ${data}/upvec-${op}.pdf util +done + +rm -rf ${data} diff --git a/Microbenchmarks/CPU-DPU/util/upvec-write.tex b/Microbenchmarks/CPU-DPU/util/upvec-write.tex new file mode 100644 index 0000000..f6d7bf5 --- /dev/null +++ b/Microbenchmarks/CPU-DPU/util/upvec-write.tex @@ -0,0 +1,38 @@ +\documentclass{standalone} + +\usepackage[T1]{fontenc} +\usepackage[default]{opensans} +\usepackage[scaled]{beramono} + +\usepackage{tikz} +\usepackage{pgfplots} +\pgfplotsset{compat=1.18} +\usepgfplotslibrary{statistics} + +\begin{document} + \begin{tikzpicture} + \begin{axis}[ + ylabel={write [GB/s]}, + xlabel={\# Ranks}, + x label style={font=\footnotesize, yshift=2mm}, + y label style={font=\footnotesize}, + tick label style={/pgf/number format/assume math mode=true}, + title={Benchmark Data}, + title style={yshift=-3mm}, + legend style={font=\footnotesize, legend columns=-1, column sep=1ex}, + legend pos=south east, + legend entries={upstream,,{no transformation},}, + reverse legend, + ymin=0,ymax=14, + xmin=0,xmax=41, + width=90mm,height=45mm + ] + \addplot[thick, color=red, domain=1:40] {5.042768 + 0.258673 * min(x, 31.536102)}; + \addplot[color=red,only marks,mark=*,mark size=0.9,opacity=.05] + table[x=n_ranks, y=value] {orig-NMC-transfer-writeThroughputGBps.txt}; + \addplot[thick, color=blue, 
domain=1:40] {5.049962 + 0.308594 * min(x, 25.657012)}; + \addplot[color=blue,only marks,mark=*,mark size=0.9,opacity=.05] + table[x=n_ranks, y=value] {notransform-NMC-transfer-writeThroughputGBps.txt}; + \end{axis} + \end{tikzpicture} +\end{document} diff --git a/NW/Makefile b/NW/Makefile index 68f495a..10276b1 100644 --- a/NW/Makefile +++ b/NW/Makefile @@ -1,46 +1,56 @@ -DPU_DIR := dpu -HOST_DIR := host -BUILDDIR ?= bin NR_TASKLETS ?= 13 -BL ?= 1024 -BL_IN ?= 4 -NR_DPUS ?= 1 +BL ?= 1024 +BL_IN ?= 4 +NR_DPUS ?= 1 ENERGY ?= 0 -define conf_filename - ${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3).conf -endef -CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL}) +HOST_SOURCES := $(wildcard host/*.c) +DPU_SOURCES := $(wildcard dpu/*.c) -HOST_TARGET := ${BUILDDIR}/nw_host -DPU_TARGET := ${BUILDDIR}/nw_dpu +aspectc ?= 0 +aspectc_timing ?= 0 -COMMON_INCLUDES := support -HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c) -DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c) +HOST_CC := ${CC} -.PHONY: all clean test +COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY} -DASPECTC=${aspectc} +DPU_FLAGS := ${COMMON_FLAGS} -O2 -DBL_IN=${BL_IN} + +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 -__dirs := $(shell mkdir -p ${BUILDDIR}) +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . 
--Xcompiler +else + HOST_FLAGS += -std=c11 +endif -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DENERGY=${ENERGY} -DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -DBL_IN=${BL_IN} +QUIET = @ -all: ${HOST_TARGET} ${DPU_TARGET} +ifdef verbose + QUIET = +endif -${CONF}: - $(RM) $(call conf_filename,*,*) - touch ${CONF} +all: bin/nw_host bin/nw_dpu -${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF} - $(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +bin: + ${QUIET}mkdir -p bin -${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF} - dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} +bin/nw_host: ${HOST_SOURCES} include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah + +bin/nw_dpu: ${DPU_SOURCES} include bin + ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} clean: $(RM) -r $(BUILDDIR) test: all - ./${HOST_TARGET} + bin/nw_host + +.PHONY: all clean test diff --git a/NW/benchmark-scripts/ccmcc25.sh b/NW/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..80df155 --- /dev/null +++ b/NW/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Generates huge logfiles and crashes frequently. Probably not helpful. 
+exit 1 + +mkdir -p log/$(hostname) +fn=log/$(hostname)/ccmcc25 + +source /opt/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=32 BL_IN=2 \ + dfatool_timing=0 aspectc=1 aspectc_timing=1 + bin/nw_host -w 0 -e 50 -n ${nr_rows} +} + +export -f run_benchmark_nmc + +echo "prim-benchmarks NW $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} nr_rows={nr_rows} \ + ::: numa_rank any \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: nr_rows 32768 65536 131072 \ +>> ${fn}.txt diff --git a/NW/dpu/task.c b/NW/dpu/task.c index c022f70..fab163a 100644 --- a/NW/dpu/task.c +++ b/NW/dpu/task.c @@ -10,7 +10,7 @@ #include <perfcounter.h> #include <barrier.h> -#include "../support/common.h" +#include "common.h" __host dpu_arguments_t DPU_INPUT_ARGUMENTS; diff --git a/NW/host/app.c b/NW/host/app.c index 0e899ec..9de2918 100644 --- a/NW/host/app.c +++ b/NW/host/app.c @@ -7,20 +7,30 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> + +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> -#include <unistd.h> -#include <getopt.h> -#include <assert.h> - -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" #if ENERGY #include <dpu_probe.h> #endif +#if ASPECTC +} +#endif + +#include <unistd.h> +#include <getopt.h> +#include <assert.h> + +#include "common.h" +#include "timer.h" +#include "params.h" + // Define the DPU Binary path as DPU_BINARY here #ifndef DPU_BINARY #define DPU_BINARY "./bin/nw_dpu" @@ -184,7 +194,7 @@ int main(int argc, char **argv) { struct Params p = input_params(argc, argv); struct dpu_set_t dpu_set, dpu; - uint32_t nr_of_dpus, max_dpus; + uint32_t nr_of_dpus, nr_of_ranks, 
max_dpus; #if ENERGY struct dpu_probe_t probe; @@ -195,6 +205,7 @@ int main(int argc, char **argv) { DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); + DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); printf("Allocated %d DPU(s)\n", nr_of_dpus); printf("Allocated %d TASKLET(s) per DPU\n", NR_TASKLETS); #if DYNAMIC @@ -822,28 +833,6 @@ int main(int argc, char **argv) { stop(&timer, 1); } - - // Print timing results - printf("CPU version "); - print(&timer, 0, p.n_reps); - printf("CPU-DPU "); - print(&timer, 2, p.n_reps); - printf("DPU Kernel "); - print(&timer, 3, p.n_reps); - printf("Inter-DPU "); - print(&timer, 1, p.n_reps); - printf("DPU-CPU "); - print(&timer, 4, p.n_reps); - printf("\n"); - printf("Longest Diagonal CPU-DPU "); - print(&long_diagonal_timer, 2, p.n_reps); - printf("Longest Diagonal DPU Kernel "); - print(&long_diagonal_timer, 3, p.n_reps); - printf("Longest Diagonal Inter-DPU "); - print(&long_diagonal_timer, 1, p.n_reps); - printf("Longest Diagonal DPU-CPU "); - print(&long_diagonal_timer, 4, p.n_reps); - printf("\n"); #if ENERGY printf("DPU Energy (J): %f \t ", tavg_energy / p.n_reps); diff --git a/NW/support/common.h b/NW/include/common.h index 69069e7..69069e7 100755..100644 --- a/NW/support/common.h +++ b/NW/include/common.h diff --git a/NW/include/dfatool_host.ah b/NW/include/dfatool_host.ah new file mode 100644 index 0000000..d45aef3 --- /dev/null +++ b/NW/include/dfatool_host.ah @@ -0,0 +1,30 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + + unsigned long n_elements; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(uint32_t); + } + + advice call("% input_params(...)"): after() { + Params* p = tjp->result(); + n_elements = p->max_rows; + printf("[>>] NW | n_dpus=%u 
n_elements=%lu\n", NR_DPUS, n_elements); + } + + advice call("% srand(...)") : after() { + printf("[--] NW | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements); + } + + advice execution("% main(...)") : after() { + printf("[<<] NW | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements); + } +}; diff --git a/NW/support/params.h b/NW/include/params.h index 8874248..8874248 100644 --- a/NW/support/params.h +++ b/NW/include/params.h diff --git a/NW/support/timer.h b/NW/include/timer.h index efaefcd..2fc798f 100755..100644 --- a/NW/support/timer.h +++ b/NW/include/timer.h @@ -1,59 +1,59 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[5];
- struct timeval stopTime[5];
- double time[5];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
+/* + * Copyright (c) 2016 University of Cordoba and University of Illinois + * All rights reserved. + * + * Developed by: IMPACT Research Group + * University of Cordoba and University of Illinois + * http://impact.crhc.illinois.edu/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * > Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * > Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * > Neither the names of IMPACT Research Group, University of Cordoba, + * University of Illinois nor the names of its contributors may be used + * to endorse or promote products derived from this Software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + */ + +#include <sys/time.h> + +typedef struct Timer{ + + struct timeval startTime[5]; + struct timeval stopTime[5]; + double time[5]; + +}Timer; + +void start(Timer *timer, int i, int rep) { + if(rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) { + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); +} + +void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); } @@ -3,7 +3,7 @@ This is an improved and extended version of the PrIM benchmark suite originally developed for **UPMEM PIM** (near-memory computing / processing-in-memory) evaluation by Gómez-Luna et al. The extension adds -* support for **NUMA**-aware **HBM** (high-bandwidth memory) and **DRAM** benchmarks, +* support for **NUMA**-aware **UPMEM**, **CXL** (Compute eXpress Link), **HBM** (high-bandwidth memory), and **DRAM** benchmarks, * a new **COUNT** benchmark, and * numerous bugfixes. 
@@ -61,6 +61,7 @@ Up-to-date source code is available on the following mirrors: The following benchmark adjustments have been made: +* A (AspectC++ support, including DFA trace generation) * B (significant bugfixes) * D (dfatool-compatible output of benchmark metrics) * E (efficiency improvements; may affect input/output format) @@ -69,39 +70,41 @@ The following benchmark adjustments have been made: CPU and DPU benchmarks in this repository have been adjusted as follows: -* BFS: DL -* BS: DLN +* BFS: ADL +* BS: ADLN * COUNT: DLN (new benchmark, based on SEL) -* GEMV: DLN -* HST-L: D +* GEMV: ADLN +* HST-L: AD * HST-S: DLN -* MLP: – -* NW: – +* MLP: A +* NW: A * RED: DLN * SCAN-SSA: D -* SCAN-RSS: DLN +* SCAN-RSS: ADLN * SEL: DLN -* SpMV: DL -* TRNS: BDLN -* TS: DLN +* SpMV: ADL +* TRNS: ABDLN +* TS: ADLN * UNI: DL -* VA: DLN +* VA: ADLN GPU versions are un-changed. The original README follows. +It contains minor adjustments to the directory structure; +benchmark how-tos that do not apply to this fork have been removed. --- # PrIM (Processing-In-Memory Benchmarks) -PrIM is the first benchmark suite for a real-world processing-in-memory (PIM) architecture. -PrIM is developed to evaluate, analyze, and characterize the first publicly-available real-world processing-in-memory (PIM) architecture, the [UPMEM](https://www.upmem.com/) PIM architecture. +PrIM is the first benchmark suite for a real-world processing-in-memory (PIM) architecture. +PrIM is developed to evaluate, analyze, and characterize the first publicly-available real-world processing-in-memory (PIM) architecture, the [UPMEM](https://www.upmem.com/) PIM architecture. The UPMEM PIM architecture combines traditional DRAM memory arrays with general-purpose in-order cores, called DRAM Processing Units (DPUs), integrated in the same chip. 
-PrIM provides a common set of workloads to evaluate the UPMEM PIM architecture with and can be useful for programming, architecture and system researchers all alike to improve multiple aspects of future PIM hardware and software. -The workloads have different characteristics, exhibiting heterogeneity in their memory access patterns, operations and data types, and communication patterns. -This repository also contains baseline CPU and GPU implementations of PrIM benchmarks for comparison purposes. +PrIM provides a common set of workloads to evaluate the UPMEM PIM architecture with and can be useful for programming, architecture and system researchers all alike to improve multiple aspects of future PIM hardware and software. +The workloads have different characteristics, exhibiting heterogeneity in their memory access patterns, operations and data types, and communication patterns. +This repository also contains baseline CPU and GPU implementations of PrIM benchmarks for comparison purposes. PrIM also includes a set of microbenchmarks can be used to assess various architecture limits such as compute throughput and memory bandwidth. @@ -155,18 +158,15 @@ Bibtex entries for citation: ## Repository Structure and Installation -We point out next the repository structure and some important folders and files. -All benchmark folders have similar structure to the one shown for BFS. -The microbenchmark folder contains eight different microbenchmarks, each with similar folder structure. +We point out next the repository structure and some important folders and files. +All benchmark folders have similar structure to the one shown for BFS. +The microbenchmark folder contains eight different microbenchmarks, each with similar folder structure. The repository also includes `run_*.py` scripts to run strong and weak scaling experiments for PrIM benchmarks. ``` . 
+-- LICENSE +-- README.md -+-- run_strong_full.py -+-- run_strong_rank.py -+-- run_weak.py +-- BFS/ | +-- baselines/ | | +-- cpu/ @@ -174,7 +174,7 @@ The repository also includes `run_*.py` scripts to run strong and weak scaling e | +-- data/ | +-- dpu/ | +-- host/ -| +-- support/ +| +-- include/ | +-- Makefile +-- BS/ | +-- ... @@ -219,60 +219,12 @@ The repository also includes `run_*.py` scripts to run strong and weak scaling e ### Prerequisites -Running PrIM requires installing the [UPMEM SDK](https://sdk.upmem.com). +Running PrIM requires installing the [UPMEM SDK](https://sdk.upmem.com). PrIM benchmarks and microbenchmarks are designed to run on a server with real UPMEM modules, but they also run on the functional simulator include in the UPMEM SDK. -### Getting Started - -Clone the repository: -```sh -git clone https://github.com/CMU-SAFARI/prim-benchmarks - -cd prim-benchmarks -./set-root-dir.sh -``` - ## Running PrIM -### PrIM Benchmarks - -The repository includes scripts to run weak scaling and strong scaling experiments: -* `run_weak.py`: Weak scaling experiments for 16 PrIM benchmarks using 1 rank of UPMEM DPUs (1 to 64 DPUs). -* `run_strong_rank.py`: Strong scaling experiments for 16 PrIM benchmarks using 1 rank of UPMEM DPUs (1 to 64 DPUs). -* `run_strong_full.py`: Strong scaling experiments for 16 PrIM benchmarks using 4 to 32 ranks of UPMEM DPUs (256 to 2048 DPUs). - -To run weak scaling experiments for BFS or SpMV, update the paths to input files in `run_weak.py`. -The scripts save the results in a folder called `profile` inside each benchmark folder. - -```sh -cd prim-benchmarks - -# Weak scaling experiments for BFS -python3 run_weak.py BFS -``` - -Inside each PrIM benchmark folder, one can compile and run each benchmark with different input parameters. -Choose a benchmark and compile. 
Every Makefile accepts several input parameters: -```sh -cd BFS - -# Compile BFS for 32 DPUs and 16 tasklets (i.e., software threads) per DPU -NR_DPUS=32 NR_TASKLETS=16 make all -``` - -For help instructions: -```sh -./bin/host_code -h -``` - -Run the benchmark: -```sh -./bin/host_code -v 0 -f data/loc-gowalla_edges.txt -``` - -Several benchmark folders (HST-S, HST-L, RED, SCAN-SSA, SCAN-RSS) contain a script (`run.sh`) that compiles and runs the benchmark for the experiments in the appendix of the [paper](https://arxiv.org/pdf/2105.03814.pdf). - -### Microbenchmarks +### Microbenchmarks Each microbenchmark folder contains a script (`run.sh`) that compiles and runs the microbenchmark for the experiments in the [paper](https://arxiv.org/pdf/2105.03814.pdf): @@ -284,10 +236,9 @@ cd Microbenchmarks/Arithmetic-Throughput ### Getting Help -If you have any suggestions for improvement, please contact el1goluj at gmail dot com. +If you have any suggestions for improvement, please contact el1goluj at gmail dot com. If you find any bugs or have further questions or requests, please post an issue at the [issue page](https://github.com/CMU-SAFARI/prim-benchmarks/issues). +## Acknowledgments -## Acknowledgments - -We thank UPMEM’s Fabrice Devaux, Rémy Cimadomo, Romaric Jodin, and Vincent Palatin for their valuable support. We acknowledge the support of SAFARI Research Group’s industrial partners, especially ASML, Facebook, Google, Huawei, Intel, Microsoft, VMware, and the Semiconductor Research Corporation. Izzat El Hajj acknowledges the support of the University Research Board of the American University of Beirut (URB-AUB-103951-25960). +We thank UPMEM’s Fabrice Devaux, Rémy Cimadomo, Romaric Jodin, and Vincent Palatin for their valuable support. We acknowledge the support of SAFARI Research Group’s industrial partners, especially ASML, Facebook, Google, Huawei, Intel, Microsoft, VMware, and the Semiconductor Research Corporation. 
Izzat El Hajj acknowledges the support of the University Research Board of the American University of Beirut (URB-AUB-103951-25960). diff --git a/RED/Makefile b/RED/Makefile index f20e1f7..c65df94 100644 --- a/RED/Makefile +++ b/RED/Makefile @@ -8,17 +8,34 @@ WITH_ALLOC_OVERHEAD ?= 0 WITH_LOAD_OVERHEAD ?= 0 WITH_FREE_OVERHEAD ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY} -DPERF=${PERF} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS := -Wall -Wextra -g -Iinclude +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY} -DPERF=${PERF} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE} -DPERF=${PERF} +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . 
--Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ -ifdef verbose +ifeq (${verbose}, 1) QUIET = endif @@ -27,10 +44,12 @@ all: bin/host_code bin/dpu_code bin: ${QUIET}mkdir -p bin -bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +bin/host_code: ${HOST_SOURCES} include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah -bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin +bin/dpu_code: ${DPU_SOURCES} include bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} clean: diff --git a/RED/benchmark-scripts/ccmcc25-sim.sh b/RED/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..bc97344 --- /dev/null +++ b/RED/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -w 0 -e 5 -i ${nr_elements} 2>&1 +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks BS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 nr_elements={nr_elements} \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: nr_elements $((2**20)) $((2**21)) $((2**22)) \ +>> ${fn}.txt diff --git a/RED/benchmark-scripts/ccmcc25.sh b/RED/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..074933d --- /dev/null +++ b/RED/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} 
NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -w 0 -e 5 -i ${nr_elements} 2>&1 +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks RED $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any nr_elements={nr_elements} \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: nr_elements $((2**27)) $((2**28)) $((2**29)) \ + >> ${fn}.txt + +done diff --git a/RED/dpu/task.c b/RED/dpu/task.c index 5536d4d..90386b2 100644 --- a/RED/dpu/task.c +++ b/RED/dpu/task.c @@ -11,8 +11,8 @@ #include <handshake.h> #include <barrier.h> -#include "../support/common.h" -#include "../support/cyclecount.h" +#include "common.h" +#include "cyclecount.h" __host dpu_arguments_t DPU_INPUT_ARGUMENTS; __host dpu_results_t DPU_RESULTS[NR_TASKLETS]; diff --git a/RED/host/app.c b/RED/host/app.c index 204f056..cb7a6ac 100644 --- a/RED/host/app.c +++ b/RED/host/app.c @@ -7,15 +7,31 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> + +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> +#include <dpu_management.h> +#include <dpu_target_macros.h> + +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #include <unistd.h> #include <getopt.h> #include <assert.h> -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" +#include "common.h" +#include "timer.h" +#include "params.h" // Define the DPU Binary path as DPU_BINARY here #ifndef DPU_BINARY @@ -25,13 +41,6 @@ #define XSTR(x) STR(x) #define STR(x) #x -#if ENERGY -#include <dpu_probe.h> -#endif - -#include <dpu_management.h> -#include <dpu_target_macros.h> - // Pointer declaration static T* A; @@ 
-70,17 +79,17 @@ int main(int argc, char **argv) { // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - timer.time[0] = 0; // alloc + zero(&timer, 0); // alloc #endif #if !WITH_LOAD_OVERHEAD DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + zero(&timer, 1); // load #endif #if !WITH_FREE_OVERHEAD - timer.time[6] = 0; // free + zero(&timer, 6); // free #endif #if ENERGY @@ -102,7 +111,7 @@ int main(int argc, char **argv) { ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned // Input/output allocation - A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + A = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); T *bufferA = A; T count = 0; T count_host = 0; @@ -168,12 +177,12 @@ int main(int argc, char **argv) { // Input arguments unsigned int kernel = 0; dpu_arguments_t input_arguments[NR_DPUS]; - for(i=0; i<NR_DPUS-1; i++) { - input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); - input_arguments[i].kernel=kernel; + for(int j=0; j<NR_DPUS-1; j++) { + input_arguments[j].size=input_size_dpu_8bytes * sizeof(T); + input_arguments[j].kernel=(enum kernels)kernel; } input_arguments[NR_DPUS-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T); - input_arguments[NR_DPUS-1].kernel=kernel; + input_arguments[NR_DPUS-1].kernel=(enum kernels)kernel; // Copy input arrays i = 0; DPU_FOREACH(dpu_set, dpu, i) { @@ -218,7 +227,7 @@ int main(int argc, char **argv) { //printf("Retrieve results\n"); dpu_results_t results[NR_DPUS]; - T* results_count = malloc(NR_DPUS * sizeof(T)); + T* results_count = (T*)malloc(NR_DPUS * sizeof(T)); if(rep >= p.n_warmup) start(&timer, 5, 0); i = 0; @@ -302,11 +311,11 @@ int main(int argc, char **argv) { if (status) { 
printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); if (rep >= p.n_warmup) { - printf("[::] RED UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d", + dfatool_printf("[::] RED UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d", NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size); - printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", + dfatool_printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", + dfatool_printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", timer.time[0], timer.time[1], timer.time[2], @@ -314,19 +323,19 @@ int main(int argc, char **argv) { timer.time[4], timer.time[5], timer.time[6]); - printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", + dfatool_printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", input_size * sizeof(T) / timer.time[2], input_size * sizeof(T) / (timer.time[4]), input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); - printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", + dfatool_printf(" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5]), input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + 
timer.time[4] + timer.time[5])); - printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", + dfatool_printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", input_size / timer.time[2], input_size / (timer.time[4]), input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); - printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", + dfatool_printf(" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", input_size / (timer.time[3] + timer.time[4] + timer.time[5]), input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5]), input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); diff --git a/RED/support/common.h b/RED/include/common.h index 121bf31..6cc1ae2 100755..100644 --- a/RED/support/common.h +++ b/RED/include/common.h @@ -38,19 +38,21 @@ #define DIV 1 // Shift right to divide by sizeof(T) #endif +enum kernels { + kernel1 = 0, + nr_kernels = 1, +}; + // Structures used by both the host and the dpu to communicate information typedef struct { - uint32_t size; - enum kernels { - kernel1 = 0, - nr_kernels = 1, - } kernel; - T t_count; + uint32_t size; + enum kernels kernel; + T t_count; } dpu_arguments_t; typedef struct { - uint64_t cycles; - T t_count; + uint64_t cycles; + T t_count; } dpu_results_t; #ifndef PERF diff --git a/RED/support/cyclecount.h b/RED/include/cyclecount.h index c4247b5..c4247b5 100644 --- a/RED/support/cyclecount.h +++ b/RED/include/cyclecount.h diff --git a/RED/include/dfatool_host.ah b/RED/include/dfatool_host.ah new file mode 100644 index 0000000..88dfbd8 --- /dev/null +++ b/RED/include/dfatool_host.ah @@ -0,0 +1,29 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + 
unsigned long n_elements; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(T); + } + + advice call("% input_params(...)") : after() { + Params* p = tjp->result(); + n_elements = p->input_size; + printf("[>>] RED | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements); + } + + advice call("% reduction_host(...)") : after() { + printf("[--] RED | n_dpus=%u n_elements=%lu\n", n_dpus, n_elements); + } + + advice execution("% main(...)") : after() { + printf("[<<] RED | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements); + } +}; diff --git a/RED/support/params.h b/RED/include/params.h index 97bc50a..ee90908 100644 --- a/RED/support/params.h +++ b/RED/include/params.h @@ -18,7 +18,7 @@ static void usage() { "\n -h help" "\n -w <W> # of untimed warmup iterations (default=1)" "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1) scaling (default=0)" + "\n -x <X> Weak (0) or strong (1) scaling (default=1)" "\n" "\nBenchmark-specific options:" "\n -i <I> input size (default=6553600 elements)" @@ -30,7 +30,7 @@ struct Params input_params(int argc, char **argv) { p.input_size = 6553600; p.n_warmup = 1; p.n_reps = 3; - p.exp = 0; + p.exp = 1; int opt; while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { diff --git a/RED/include/timer.h b/RED/include/timer.h new file mode 100644 index 0000000..7b80823 --- /dev/null +++ b/RED/include/timer.h @@ -0,0 +1,5 @@ +#pragma once + +#define N_TIMERS 7 +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/RED/support/timer.h b/RED/support/timer.h deleted file mode 100755 index 4d597b9..0000000 --- a/RED/support/timer.h +++ /dev/null @@ -1,66 +0,0 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
diff --git a/SCAN-RSS/Makefile b/SCAN-RSS/Makefile index f1975e8..d55eb07 100644 --- a/SCAN-RSS/Makefile +++ b/SCAN-RSS/Makefile @@ -8,17 +8,34 @@ WITH_ALLOC_OVERHEAD ?= 0 WITH_LOAD_OVERHEAD ?= 0 WITH_FREE_OVERHEAD ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DUNROLL=${UNROLL} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DUNROLL=${UNROLL} +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc} DPU_FLAGS := ${COMMON_FLAGS} -O2 +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . 
--Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ -ifdef verbose +ifeq (${verbose}, 1) QUIET = endif @@ -27,10 +44,13 @@ all: bin/dpu_code bin/host_code bin: ${QUIET}mkdir -p bin -bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +# cp/rm are needed to work around AspectC++ not liking symlinks +bin/host_code: ${HOST_SOURCES} include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah -bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin +bin/dpu_code: ${DPU_SOURCES} include bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} clean: diff --git a/SCAN-RSS/benchmark-scripts/ccmcc25-sim.sh b/SCAN-RSS/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..2715db7 --- /dev/null +++ b/SCAN-RSS/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -w 0 -e 5 -i ${input_size} +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks SCAN-RSS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: input_size $((2**22)) $((2**23)) $((2**24)) \ +>> ${fn}.txt diff --git a/SCAN-RSS/benchmark-scripts/ccmcc25.sh b/SCAN-RSS/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..ff0a31e --- /dev/null +++ b/SCAN-RSS/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set 
-e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -w 0 -e 50 -i ${input_size} +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks SCAN-RSS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} numa_rank={numa_rank} \ + ::: numa_rank any \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: input_size $((2**27)) $((2**28)) $((2**29)) \ + >> ${fn}.txt + +done diff --git a/SCAN-RSS/dpu/task.c b/SCAN-RSS/dpu/task.c index 7a4b029..afc42c7 100644 --- a/SCAN-RSS/dpu/task.c +++ b/SCAN-RSS/dpu/task.c @@ -11,7 +11,7 @@ #include <handshake.h> #include <barrier.h> -#include "../support/common.h" +#include "common.h" __host dpu_arguments_t DPU_INPUT_ARGUMENTS; __host dpu_results_t DPU_RESULTS[NR_TASKLETS]; diff --git a/SCAN-RSS/host/app.c b/SCAN-RSS/host/app.c index 6771207..ffcc2cf 100644 --- a/SCAN-RSS/host/app.c +++ b/SCAN-RSS/host/app.c @@ -7,15 +7,31 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> + +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> +#include <dpu_management.h> +#include <dpu_target_macros.h> + +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #include <unistd.h> #include <getopt.h> #include <assert.h> -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" +#include "common.h" +#include "timer.h" +#include "params.h" // Define the DPU Binary path as DPU_BINARY here #ifndef DPU_BINARY @@ -25,12 +41,7 @@ #define XSTR(x) STR(x) #define STR(x) #x -#if ENERGY -#include <dpu_probe.h> -#endif - -#include 
<dpu_management.h> -#include <dpu_target_macros.h> +unsigned int kernel; // Pointer declaration static T* A; @@ -78,17 +89,17 @@ int main(int argc, char **argv) { // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); - timer.time[0] = 0; // alloc + zero(&timer, 0); #endif #if !WITH_LOAD_OVERHEAD DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); assert(nr_of_dpus == NR_DPUS); - timer.time[1] = 0; // load + zero(&timer, 1); #endif #if !WITH_FREE_OVERHEAD - timer.time[6] = 0; // free + zero(&timer, 6); #endif unsigned int i = 0; @@ -100,8 +111,8 @@ int main(int argc, char **argv) { (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_; // Input size per DPU (max.), 8-byte aligned // Input/output allocation - A = malloc(input_size_dpu_round * NR_DPUS * sizeof(T)); - C = malloc(input_size_dpu_round * NR_DPUS * sizeof(T)); + A = (T*) malloc(input_size_dpu_round * NR_DPUS * sizeof(T)); + C = (T*) malloc(input_size_dpu_round * NR_DPUS * sizeof(T)); T *bufferA = A; T *bufferC = C; @@ -167,8 +178,8 @@ int main(int argc, char **argv) { } // Input arguments const unsigned int input_size_dpu = input_size_dpu_round; - unsigned int kernel = 0; - dpu_arguments_t input_arguments = {input_size_dpu * sizeof(T), kernel, 0}; + kernel = 0; + dpu_arguments_t input_arguments = {(uint32_t)(input_size_dpu * sizeof(T)), (enum kernels)kernel, 0}; // Copy input arrays i = 0; DPU_FOREACH(dpu_set, dpu, i) { @@ -214,7 +225,7 @@ int main(int argc, char **argv) { //printf("Retrieve results\n"); dpu_results_t results[nr_of_dpus]; - T* results_scan = malloc(nr_of_dpus * sizeof(T)); + T* results_scan = (T*) malloc(nr_of_dpus * sizeof(T)); i = 0; accum = 0; @@ -251,7 +262,7 @@ int main(int argc, char **argv) { dpu_arguments_t input_arguments_2[NR_DPUS]; for(i=0; i<nr_of_dpus; i++) { 
input_arguments_2[i].size=input_size_dpu * sizeof(T); - input_arguments_2[i].kernel=kernel; + input_arguments_2[i].kernel=(enum kernels)kernel; input_arguments_2[i].t_count=results_scan[i]; } DPU_FOREACH(dpu_set, dpu, i) { @@ -332,11 +343,11 @@ int main(int argc, char **argv) { } if (status) { printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); - printf("[::] SCAN-RSS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d b_unroll=%d n_elements=%d", + dfatool_printf("[::] SCAN-RSS-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d b_unroll=%d n_elements=%d", NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, UNROLL, input_size); - printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", + dfatool_printf(" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_free_us=%f", + dfatool_printf("| latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_sync_us=%f latency_read_us=%f latency_free_us=%f", timer.time[0], timer.time[1], timer.time[2], @@ -345,20 +356,20 @@ int main(int argc, char **argv) { timer.time[5], // sync timer.time[7], // read timer.time[8]); - printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", + dfatool_printf(" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", input_size * sizeof(T) / timer.time[2], input_size * sizeof(T) / (timer.time[4] + timer.time[5] + timer.time[6]), input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8])); - printf(" 
throughput_upmem_s_MBps=%f throughput_upmem_wxsxr_MBps=%f throughput_upmem_lwxsxr_MBps=%f throughput_upmem_alwxsxr_MBps=%f", + dfatool_printf(" throughput_upmem_s_MBps=%f throughput_upmem_wxsxr_MBps=%f throughput_upmem_lwxsxr_MBps=%f throughput_upmem_alwxsxr_MBps=%f", input_size * sizeof(T) / (timer.time[5]), input_size * sizeof(T) / (timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7]), input_size * sizeof(T) / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7]), input_size * sizeof(T) / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7])); - printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", + dfatool_printf(" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", input_size / timer.time[2], input_size / (timer.time[4] + timer.time[5] + timer.time[6]), input_size / (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7] + timer.time[8])); - printf(" throughput_upmem_s_MOpps=%f throughput_upmem_wxsxr_MOpps=%f throughput_upmem_lwxsxr_MOpps=%f throughput_upmem_alwxsxr_MOpps=%f\n", + dfatool_printf(" throughput_upmem_s_MOpps=%f throughput_upmem_wxsxr_MOpps=%f throughput_upmem_lwxsxr_MOpps=%f throughput_upmem_alwxsxr_MOpps=%f\n", input_size / (timer.time[5]), input_size / (timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7]), input_size / (timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6] + timer.time[7]), diff --git a/SCAN-RSS/support/common.h b/SCAN-RSS/include/common.h index be19a8c..859a3fe 100755..100644 --- a/SCAN-RSS/support/common.h +++ b/SCAN-RSS/include/common.h @@ -40,15 +40,17 @@ #define REGS (BLOCK_SIZE >> DIV) +enum kernels { + kernel1 = 0, + kernel2 = 1, + nr_kernels = 2, +}; + // Structures used by both the host and the dpu to 
communicate information typedef struct { - uint32_t size; - enum kernels { - kernel1 = 0, - kernel2 = 1, - nr_kernels = 2, - } kernel; - T t_count; + uint32_t size; + enum kernels kernel; + T t_count; } dpu_arguments_t; typedef struct { diff --git a/SCAN-RSS/include/dfatool_host.ah b/SCAN-RSS/include/dfatool_host.ah new file mode 100644 index 0000000..6d2fad5 --- /dev/null +++ b/SCAN-RSS/include/dfatool_host.ah @@ -0,0 +1,29 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + unsigned long n_elements; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(T); + } + + advice call("% input_params(...)") : after() { + Params* p = tjp->result(); + n_elements = p->input_size; + printf("[>>] SCAN-RSS | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements); + } + + advice call("% scan_host(...)") : after() { + printf("[--] SCAN-RSS | n_dpus=%u n_elements=%lu\n", n_dpus, n_elements); + } + + advice execution("% main(...)") : after() { + printf("[<<] SCAN-RSS | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements); + } +}; diff --git a/SCAN-RSS/support/params.h b/SCAN-RSS/include/params.h index 9f6aacc..a96b33f 100644 --- a/SCAN-RSS/support/params.h +++ b/SCAN-RSS/include/params.h @@ -18,7 +18,7 @@ static void usage() { "\n -h help" "\n -w <W> # of untimed warmup iterations (default=1)" "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1) scaling (default=0)" + "\n -x <X> Weak (0) or strong (1) scaling (default=1)" "\n" "\nBenchmark-specific options:" "\n -i <I> input size (default=3932160 elements)" @@ -30,7 +30,7 @@ struct Params input_params(int argc, char **argv) { p.input_size = 3932160; p.n_warmup = 1; p.n_reps = 3; - p.exp = 0; + p.exp = 1; int opt; while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { diff --git a/SCAN-RSS/include/timer.h b/SCAN-RSS/include/timer.h new file mode 100644 
index 0000000..313151d --- /dev/null +++ b/SCAN-RSS/include/timer.h @@ -0,0 +1,5 @@ +#pragma once + +#define N_TIMERS 9 +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/SCAN-RSS/run-paper-strong-full.sh b/SCAN-RSS/run-paper-strong-full.sh deleted file mode 100755 index a00e96d..0000000 --- a/SCAN-RSS/run-paper-strong-full.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks SCAN-RSS strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >2048 is not part of upstream -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i 251658240 -x 1 || true - fi - done -done -) | tee log-paper-strong-full.txt diff --git a/SCAN-RSS/run-paper-strong-rank.sh b/SCAN-RSS/run-paper-strong-rank.sh deleted file mode 100755 index 3391a1b..0000000 --- a/SCAN-RSS/run-paper-strong-rank.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks SCAN-RSS strong-rank (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >64 are not part of upstream config space -for nr_dpus in 128 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i 3932160 -x 1 || true - fi - done -done -) | tee log-paper-strong-rank.txt diff --git a/SCAN-RSS/run-paper-weak.sh b/SCAN-RSS/run-paper-weak.sh deleted file mode 100755 index 053d9a6..0000000 --- a/SCAN-RSS/run-paper-weak.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks SCAN-RSS weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# upstream does not include 256 and 512 in config space -for nr_dpus 
in 512 256 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 verbose=1; then - timeout --foreground -k 1m 30m bin/host_code -w 0 -e 100 -i 3932160 -x 0 || true - fi - done -done -) | tee log-paper-weak.txt diff --git a/SCAN-RSS/run.sh b/SCAN-RSS/run.sh deleted file mode 100755 index 1c39f7c..0000000 --- a/SCAN-RSS/run.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -for i in 2048 4096 8192 16384 65536 262144 1048576 3932160 -do - NR_DPUS=1 NR_TASKLETS=16 BL=10 VERSION=SINGLE make all - wait - ./bin/host_code -w 10 -e 100 -i ${i} > profile/out${i}_tl16_bl10_dpu11 - wait - make clean - wait -done diff --git a/SCAN-RSS/support/timer.h b/SCAN-RSS/support/timer.h deleted file mode 100755 index 3ec6d87..0000000 --- a/SCAN-RSS/support/timer.h +++ /dev/null @@ -1,66 +0,0 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[9];
- struct timeval stopTime[9];
- double time[9];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void print(Timer *timer, int i, int REP) { printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); }
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
diff --git a/SCAN-SSA/Makefile b/SCAN-SSA/Makefile index 319f2da..c741138 100644 --- a/SCAN-SSA/Makefile +++ b/SCAN-SSA/Makefile @@ -9,14 +9,31 @@ HOST_SOURCES := $(wildcard host/app.c) OMP_SOURCES := $(wildcard host/omp.c) DPU_SOURCES := $(wildcard dpu/*.c) -COMMON_INCLUDES := support -COMMON_FLAGS = -Wall -Wextra -O2 -I${COMMON_INCLUDES} -DNR_DPUS=${NR_DPUS} -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE} -DUNROLL=${UNROLL} -HOST_FLAGS = ${COMMON_FLAGS} -std=c11 `dpu-pkg-config --cflags --libs dpu` +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS = -Wall -Wextra -O2 -Iinclude -DNR_DPUS=${NR_DPUS} -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE} -DUNROLL=${UNROLL} +HOST_FLAGS = ${COMMON_FLAGS} `dpu-pkg-config --cflags --libs dpu` -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc} DPU_FLAGS = ${COMMON_FLAGS} +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . 
--Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ -ifdef verbose +ifeq (${verbose}, 1) QUIET = endif @@ -29,7 +46,9 @@ bin/dpu_code: ${DPU_SOURCES} bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} ${DPU_SOURCES} -o $@ bin/host_code: ${HOST_SOURCES} bin - ${QUIET}${CC} ${HOST_FLAGS} ${HOST_SOURCES} -o $@ + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah bin/omp_code: ${OMP_SOURCES} ${QUIET}${CC} ${HOST_FLAGS} -fopenmp ${OMP_SOURCES} -o $@ diff --git a/SCAN-SSA/benchmark-scripts/ccmcc25-sim.sh b/SCAN-SSA/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..2715db7 --- /dev/null +++ b/SCAN-SSA/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -w 0 -e 5 -i ${input_size} +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks SCAN-RSS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: input_size $((2**22)) $((2**23)) $((2**24)) \ +>> ${fn}.txt diff --git a/SCAN-SSA/benchmark-scripts/ccmcc25.sh b/SCAN-SSA/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..c9655c8 --- /dev/null +++ b/SCAN-SSA/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + 
bin/host_code -w 0 -e 50 -i ${input_size} +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks SCAN-SSA $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} numa_rank={numa_rank} \ + ::: numa_rank any \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: input_size $((2**27)) $((2**28)) $((2**29)) \ + >> ${fn}.txt + +done diff --git a/SCAN-SSA/dpu/task.c b/SCAN-SSA/dpu/task.c index 15411a4..76f393d 100644 --- a/SCAN-SSA/dpu/task.c +++ b/SCAN-SSA/dpu/task.c @@ -11,7 +11,7 @@ #include <handshake.h> #include <barrier.h> -#include "../support/common.h" +#include "common.h" __host dpu_arguments_t DPU_INPUT_ARGUMENTS; __host dpu_results_t DPU_RESULTS[NR_TASKLETS]; diff --git a/SCAN-SSA/host/app.c b/SCAN-SSA/host/app.c index 25471fe..8675f17 100644 --- a/SCAN-SSA/host/app.c +++ b/SCAN-SSA/host/app.c @@ -7,15 +7,29 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> + +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> + +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #include <unistd.h> #include <getopt.h> #include <assert.h> -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" +#include "common.h" +#include "timer.h" +#include "params.h" // Define the DPU Binary path as DPU_BINARY here #ifndef DPU_BINARY @@ -25,15 +39,13 @@ #define XSTR(x) STR(x) #define STR(x) #x -#if ENERGY -#include <dpu_probe.h> -#endif - // Pointer declaration static T* A; static T* C; static T* C2; +unsigned int kernel; + // Create input arrays static void read_input(T* A, unsigned int nr_elements, unsigned int nr_elements_round) { srand(0); @@ -95,9 +107,9 @@ int 
main(int argc, char **argv) { (input_size_dpu_ % (NR_TASKLETS * REGS) != 0) ? roundup(input_size_dpu_, (NR_TASKLETS * REGS)) : input_size_dpu_; // Input size per DPU (max.), 8-byte aligned // Input/output allocation - A = malloc(input_size_dpu_round * nr_of_dpus * sizeof(T)); - C = malloc(input_size_dpu_round * nr_of_dpus * sizeof(T)); - C2 = malloc(input_size_dpu_round * nr_of_dpus * sizeof(T)); + A = (T*) malloc(input_size_dpu_round * nr_of_dpus * sizeof(T)); + C = (T*) malloc(input_size_dpu_round * nr_of_dpus * sizeof(T)); + C2 = (T*) malloc(input_size_dpu_round * nr_of_dpus * sizeof(T)); T *bufferA = A; T *bufferC = C2; @@ -124,8 +136,8 @@ int main(int argc, char **argv) { start(&timer, 1, 0); // Input arguments const unsigned int input_size_dpu = input_size_dpu_round; - unsigned int kernel = 0; - dpu_arguments_t input_arguments = {input_size_dpu * sizeof(T), kernel, 0}; + kernel = 0; + dpu_arguments_t input_arguments = {(uint32_t)(input_size_dpu * sizeof(T)), (enum kernels)kernel, 0}; // Copy input arrays i = 0; DPU_FOREACH(dpu_set, dpu, i) { @@ -170,7 +182,7 @@ int main(int argc, char **argv) { //printf("Retrieve results\n"); dpu_results_t results[nr_of_dpus]; - T* results_scan = malloc(nr_of_dpus * sizeof(T)); + T* results_scan = (T*) malloc(nr_of_dpus * sizeof(T)); i = 0; accum = 0; @@ -207,7 +219,7 @@ int main(int argc, char **argv) { dpu_arguments_t input_arguments_2[NR_DPUS]; for(i=0; i<nr_of_dpus; i++) { input_arguments_2[i].size=input_size_dpu * sizeof(T); - input_arguments_2[i].kernel=kernel; + input_arguments_2[i].kernel=(enum kernels)kernel; input_arguments_2[i].t_count=results_scan[i]; } DPU_FOREACH(dpu_set, dpu, i) { @@ -272,17 +284,16 @@ int main(int argc, char **argv) { } if (status) { printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); - printf("[::] SCAN-SSA NMC | n_dpus=%d n_tasklets=%d e_type=%s block_size_B=%d b_unroll=%d n_elements=%u " - "| throughput_cpu_MBps=%f throughput_pim_MBps=%f throughput_MBps=%f\n", + 
dfatool_printf("[::] SCAN-SSA NMC | n_dpus=%d n_tasklets=%d e_type=%s block_size_B=%d b_unroll=%d n_elements=%u " + "| throughput_cpu_MBps=%f throughput_pim_MBps=%f throughput_MBps=%f", nr_of_dpus, NR_TASKLETS, XSTR(T), BLOCK_SIZE, UNROLL, input_size, input_size * sizeof(T) / timer.time[0], input_size * sizeof(T) / (timer.time[2] + timer.time[3] + timer.time[4]), input_size * sizeof(T) / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5])); - printf(" throughput_cpu_MOpps=%f throughput_pim_MOpps=%f throughput_MOpps=%f\n", + dfatool_printf(" throughput_cpu_MOpps=%f throughput_pim_MOpps=%f throughput_MOpps=%f\n", input_size / timer.time[0], input_size / (timer.time[2] + timer.time[3] + timer.time[4]), input_size / (timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5])); - printall(&timer, 5); } else { printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n"); } diff --git a/SCAN-SSA/host/omp.c b/SCAN-SSA/host/omp.c index efa5360..3e722dc 100644 --- a/SCAN-SSA/host/omp.c +++ b/SCAN-SSA/host/omp.c @@ -12,9 +12,9 @@ #include <getopt.h> #include <assert.h> -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" +#include "common.h" +#include "timer.h" +#include "params.h" #define XSTR(x) STR(x) #define STR(x) #x diff --git a/SCAN-SSA/support/common.h b/SCAN-SSA/include/common.h index 0bdf7ca..f395cc5 100644 --- a/SCAN-SSA/support/common.h +++ b/SCAN-SSA/include/common.h @@ -40,15 +40,17 @@ #define REGS (BLOCK_SIZE >> DIV) +enum kernels { + kernel1 = 0, + kernel2 = 1, + nr_kernels = 2, +}; + // Structures used by both the host and the dpu to communicate information typedef struct { - uint32_t size; - enum kernels { - kernel1 = 0, - kernel2 = 1, - nr_kernels = 2, - } kernel; - T t_count; + uint32_t size; + enum kernels kernel; + T t_count; } dpu_arguments_t; typedef struct { diff --git a/SCAN-SSA/include/dfatool_host.ah b/SCAN-SSA/include/dfatool_host.ah new file 
mode 100644 index 0000000..6d2fad5 --- /dev/null +++ b/SCAN-SSA/include/dfatool_host.ah @@ -0,0 +1,29 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + unsigned long n_elements; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(T); + } + + advice call("% input_params(...)") : after() { + Params* p = tjp->result(); + n_elements = p->input_size; + printf("[>>] SCAN-RSS | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements); + } + + advice call("% scan_host(...)") : after() { + printf("[--] SCAN-RSS | n_dpus=%u n_elements=%lu\n", n_dpus, n_elements); + } + + advice execution("% main(...)") : after() { + printf("[<<] SCAN-RSS | n_dpus=%u n_elements=%lu\n", NR_DPUS, n_elements); + } +}; diff --git a/SCAN-SSA/support/params.h b/SCAN-SSA/include/params.h index 9f6aacc..a96b33f 100644 --- a/SCAN-SSA/support/params.h +++ b/SCAN-SSA/include/params.h @@ -18,7 +18,7 @@ static void usage() { "\n -h help" "\n -w <W> # of untimed warmup iterations (default=1)" "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1) scaling (default=0)" + "\n -x <X> Weak (0) or strong (1) scaling (default=1)" "\n" "\nBenchmark-specific options:" "\n -i <I> input size (default=3932160 elements)" @@ -30,7 +30,7 @@ struct Params input_params(int argc, char **argv) { p.input_size = 3932160; p.n_warmup = 1; p.n_reps = 3; - p.exp = 0; + p.exp = 1; int opt; while((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { diff --git a/SCAN-SSA/include/timer.h b/SCAN-SSA/include/timer.h new file mode 100644 index 0000000..5b8eba3 --- /dev/null +++ b/SCAN-SSA/include/timer.h @@ -0,0 +1,5 @@ +#pragma once
+
+#define N_TIMERS 7
+#include "../../include/timer_base.h"
+#undef N_TIMERS
diff --git a/SCAN-SSA/run-omp.sh b/SCAN-SSA/run-omp.sh deleted file mode 100755 index ccbb1bd..0000000 --- a/SCAN-SSA/run-omp.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -e - -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i: input size (number of elements, not number of bytes!) - -echo "prim-benchmarks SCAN-SSA (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_threads in 1 2 4 6 8 12 16 20 24 32; do - for i in 2048 4096 8192 16384 65536 262144 1048576 3932160 15728640 31457280; do - for dt in UINT32 UINT64 INT32 INT64 FLOAT DOUBLE; do - echo - if make -B TYPE=${dt} bin/omp_code; then - OMP_NUM_THREADS=$nr_threads timeout -k 1m 30m bin/omp_code -w 0 -e 100 -i ${i} || true - fi - done - done -done diff --git a/SCAN-SSA/run.sh b/SCAN-SSA/run.sh deleted file mode 100755 index 54d5f93..0000000 --- a/SCAN-SSA/run.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i: input size (number of elements, not number of bytes!) 
- -echo "prim-benchmarks SCAN-SSA (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 1 2 4 8 16 32 64 128 256 512; do - for nr_tasklets in 1 2 3 4 6 8 10 12 16 20 24; do - for i in 2048 4096 8192 16384 65536 262144 1048576 3932160; do - for dt in UINT32 UINT64 INT32 INT64 FLOAT DOUBLE; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 TYPE=${dt} UNROLL=1 \ - || make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 TYPE=${dt} UNROLL=0; then - timeout -k 1m 30m bin/host_code -w 0 -e 100 -i ${i} || true - fi - done - done - done -done diff --git a/SCAN-SSA/support/timer.h b/SCAN-SSA/support/timer.h deleted file mode 100644 index 5411254..0000000 --- a/SCAN-SSA/support/timer.h +++ /dev/null @@ -1,64 +0,0 @@ -/*
- * Copyright (c) 2016 University of Cordoba and University of Illinois
- * All rights reserved.
- *
- * Developed by: IMPACT Research Group
- * University of Cordoba and University of Illinois
- * http://impact.crhc.illinois.edu/
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * with the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * > Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimers.
- * > Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimers in the
- * documentation and/or other materials provided with the distribution.
- * > Neither the names of IMPACT Research Group, University of Cordoba,
- * University of Illinois nor the names of its contributors may be used
- * to endorse or promote products derived from this Software without
- * specific prior written permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
- * THE SOFTWARE.
- *
- */
-
-#include <sys/time.h>
-
-typedef struct Timer{
-
- struct timeval startTime[7];
- struct timeval stopTime[7];
- double time[7];
-
-}Timer;
-
-void start(Timer *timer, int i, int rep) {
- if(rep == 0) {
- timer->time[i] = 0.0;
- }
- gettimeofday(&timer->startTime[i], NULL);
-}
-
-void stop(Timer *timer, int i) {
- gettimeofday(&timer->stopTime[i], NULL);
- timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
- (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
-}
-
-void printall(Timer *timer, int maxt) {
- for (int i = 0; i <= maxt; i++) {
- printf(" timer%d_us=%f", i, timer->time[i]);
- }
- printf("\n");
-}
diff --git a/SpMV/Makefile b/SpMV/Makefile index 0e7a70c..c2d9d50 100644 --- a/SpMV/Makefile +++ b/SpMV/Makefile @@ -1,21 +1,31 @@ NR_TASKLETS ?= 16 NR_DPUS ?= 1 -COMMON_INCLUDES := support -HOST_SOURCES := $(wildcard host/*.c) -DPU_SOURCES := $(wildcard dpu/*.c) -CPU_BASE_SOURCES := $(wildcard baselines/cpu/*.c) -GPU_BASE_SOURCES := $(wildcard baselines/gpu/*.cu) - -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS := -Wall -Wextra -g -Iinclude +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DASPECTC=${aspectc} -DDFATOOL_TIMING=${dfatool_timing} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -CPU_BASE_FLAGS := -O3 -fopenmp -GPU_BASE_FLAGS := -O3 + +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . 
--Xcompiler +else + HOST_FLAGS += -std=c11 +endif QUIET = @ -ifdef verbose +ifeq (${verbose}, 1) QUIET = endif @@ -24,19 +34,13 @@ all: bin/host_code bin/dpu_code bin: ${QUIET}mkdir -p bin -gpu: bin/gpu_baseline - -bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} - -bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} - -bin/cpu_baseline: ${CPU_BASE_SOURCES} - ${QUIET}${CC} -o $@ ${CPU_BASE_SOURCES} ${CPU_BASE_FLAGS} +bin/host_code: host/app.c include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ host/app.c ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah -bin/gpu_baseline: ${GPU_BASE_SOURCES} - ${QUIET}nvcc -o $@ ${GPU_BASE_SOURCES} ${GPU_BASE_FLAGS} +bin/dpu_code: dpu/task.c include bin + ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ dpu/task.c clean: ${QUIET}rm -rf bin diff --git a/SpMV/benchmark-scripts/ccmcc25-sim.sh b/SpMV/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..9d1af4e --- /dev/null +++ b/SpMV/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -v 0 -f data/${data} 2>&1 +} + +export -f run_benchmark_nmc + +cd data/generate +for i in 4 8 16; do + ./replicate ../bcsstk30.mtx $i ../bcsstk30.${i}.mtx +done +cd ../.. 
+ +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks SpMV $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} \ + ::: data bcsstk30.mtx bcsstk30.4.mtx bcsstk30.8.mtx bcsstk30.16.mtx \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ +>> ${fn}.txt + +rm -f data/bcsstk30.*.mtx diff --git a/SpMV/benchmark-scripts/ccmcc25.sh b/SpMV/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..176ea99 --- /dev/null +++ b/SpMV/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -v 0 -f data/${data} 2>&1 +} + +export -f run_benchmark_nmc + +cd data/generate +for i in 8 32 64; do + ./replicate ../bcsstk30.mtx $i ../bcsstk30.${i}.mtx +done +cd ../.. 
+ +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks SpMV $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 data={data} numa_rank={numa_rank} \ + ::: i $(seq 0 10) \ + ::: data bcsstk30.mtx bcsstk30.8.mtx bcsstk30.32.mtx bcsstk30.64.mtx \ + ::: numa_rank any \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + >> ${fn}.txt + +done + +rm -f data/bcsstk30.*.mtx diff --git a/SpMV/dpu/task.c b/SpMV/dpu/task.c index 501a62a..305a645 100644 --- a/SpMV/dpu/task.c +++ b/SpMV/dpu/task.c @@ -11,7 +11,7 @@ #include <perfcounter.h> #include <seqread.h> -#include "../support/common.h" +#include "common.h" #define PRINT_ERROR(fmt, ...) printf("\033[0;31mERROR:\033[0m "fmt"\n", ##__VA_ARGS__) diff --git a/SpMV/host/app.c b/SpMV/host/app.c index ffccb70..6cf2861 100644 --- a/SpMV/host/app.c +++ b/SpMV/host/app.c @@ -3,9 +3,24 @@ * SpMV Host Application Source File * */ +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> +#ifndef ENERGY +#define ENERGY 0 +#endif +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #include <assert.h> #include <getopt.h> #include <stdio.h> @@ -14,24 +29,17 @@ #include <unistd.h> #include "mram-management.h" -#include "../support/common.h" -#include "../support/matrix.h" -#include "../support/params.h" -#include "../support/timer.h" -#include "../support/utils.h" +#include "common.h" +#include "matrix.h" +#include "params.h" +#include "timer.h" +#include "utils.h" #define DPU_BINARY "./bin/dpu_code" #define XSTR(x) STR(x) #define STR(x) #x -#ifndef ENERGY -#define ENERGY 0 -#endif -#if ENERGY -#include <dpu_probe.h> -#endif - // Main of the Host Application int main(int argc, char **argv) { @@ -78,10 +86,10 @@ int main(int argc, char 
**argv) uint32_t *rowPtrs = csrMatrix.rowPtrs; struct Nonzero *nonzeros = csrMatrix.nonzeros; float *inVector = - malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float))); + (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float))); initVector(inVector, numCols); float *outVector = - malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows * sizeof(float))); + (float*)malloc(ROUND_UP_TO_MULTIPLE_OF_8(numRows * sizeof(float))); // Partition data structure across DPUs uint32_t numRowsPerDPU = @@ -158,22 +166,25 @@ int main(int argc, char **argv) PRINT_INFO(p.verbosity >= 2, " Copying data to DPU"); startTimer(&timer); - copyToDPU(dpu, (uint8_t *) dpuRowPtrs_h, dpuRowPtrs_m, - (dpuNumRows + 1) * sizeof(uint32_t)); - copyToDPU(dpu, (uint8_t *) dpuNonzeros_h, dpuNonzeros_m, - dpuNumNonzeros * sizeof(struct Nonzero)); - copyToDPU(dpu, (uint8_t *) inVector, dpuInVector_m, - numCols * sizeof(float)); + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuRowPtrs_m, (uint8_t *) dpuRowPtrs_h, + ROUND_UP_TO_MULTIPLE_OF_8((dpuNumRows + 1) * sizeof(uint32_t)))); + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuNonzeros_m, (uint8_t *) dpuNonzeros_h, + ROUND_UP_TO_MULTIPLE_OF_8(dpuNumNonzeros * sizeof(struct Nonzero)))); + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuInVector_m, (uint8_t *) inVector, + ROUND_UP_TO_MULTIPLE_OF_8(numCols * sizeof(float)))); stopTimer(&timer); writeTime += getElapsedTime(timer); - } // Send parameters to DPU PRINT_INFO(p.verbosity >= 2, " Copying parameters to DPU"); startTimer(&timer); - copyToDPU(dpu, (uint8_t *) & dpuParams[dpuIdx], dpuParams_m, - sizeof(struct DPUParams)); + DPU_ASSERT(dpu_copy_to(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams_m, (uint8_t *) & dpuParams[dpuIdx], + ROUND_UP_TO_MULTIPLE_OF_8(sizeof(struct DPUParams)))); stopTimer(&timer); writeTime += getElapsedTime(timer); @@ -204,13 +215,15 @@ int main(int argc, char **argv) PRINT_INFO(p.verbosity >= 1, "Copying back the result"); 
startTimer(&timer); dpuIdx = 0; + DPU_FOREACH(dpu_set, dpu) { unsigned int dpuNumRows = dpuParams[dpuIdx].dpuNumRows; if (dpuNumRows > 0) { uint32_t dpuStartRowIdx = dpuIdx * numRowsPerDPU; - copyFromDPU(dpu, dpuParams[dpuIdx].dpuOutVector_m, - (uint8_t *) (outVector + dpuStartRowIdx), - dpuNumRows * sizeof(float)); + DPU_ASSERT(dpu_copy_from(dpu, DPU_MRAM_HEAP_POINTER_NAME, + dpuParams[dpuIdx].dpuOutVector_m, + (uint8_t *) (outVector + dpuStartRowIdx), + ROUND_UP_TO_MULTIPLE_OF_8(dpuNumRows * sizeof(float)))); } ++dpuIdx; } @@ -220,7 +233,7 @@ int main(int argc, char **argv) // Calculating result on CPU PRINT_INFO(p.verbosity >= 1, "Calculating result on CPU"); - float *outVectorReference = malloc(numRows * sizeof(float)); + float *outVectorReference = (float*)malloc(numRows * sizeof(float)); for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) { float sum = 0.0f; for (uint32_t i = rowPtrs[rowIdx]; i < rowPtrs[rowIdx + 1]; ++i) { @@ -254,22 +267,22 @@ int main(int argc, char **argv) freeTime += getElapsedTime(timer); if (status) { - printf + dfatool_printf ("[::] SpMV UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s n_elements=%d ", numDPUs, numRanks, NR_TASKLETS, "float", csrMatrix.numNonzeros); - printf + dfatool_printf ("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", allocTime, loadTime, writeTime, dpuTime, readTime, freeTime); - printf + dfatool_printf (" throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit csrMatrix.numNonzeros * sizeof(float) / (dpuTime * 1e6), csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6)); - printf + dfatool_printf (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", csrMatrix.numNonzeros * sizeof(float) / ((writeTime + dpuTime + readTime) * 1e6), @@ -278,14 
+291,14 @@ int main(int argc, char **argv) csrMatrix.numNonzeros * sizeof(float) / ((allocTime + loadTime + writeTime + dpuTime + readTime) * 1e6)); - printf + dfatool_printf (" throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", // coomatrix / csrmatrix use uint32_t indexes and float values, so all 32bit csrMatrix.numNonzeros / (dpuTime * 1e6), csrMatrix.numNonzeros / ((allocTime + loadTime + writeTime + dpuTime + readTime + freeTime) * 1e6)); - printf + dfatool_printf (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", csrMatrix.numNonzeros / ((writeTime + dpuTime + readTime) * 1e6), diff --git a/SpMV/host/mram-management.h b/SpMV/host/mram-management.h index f2ee031..a953d6a 100644 --- a/SpMV/host/mram-management.h +++ b/SpMV/host/mram-management.h @@ -1,9 +1,7 @@ +#pragma once -#ifndef _MRAM_MANAGEMENT_H_ -#define _MRAM_MANAGEMENT_H_ - -#include "../support/common.h" -#include "../support/utils.h" +#include "common.h" +#include "utils.h" #define DPU_CAPACITY (64 << 20) // A DPU's capacity is 64 MiB @@ -29,21 +27,3 @@ static uint32_t mram_heap_alloc(struct mram_heap_allocator_t *allocator, } return ret; } - -static void copyToDPU(struct dpu_set_t dpu, uint8_t *hostPtr, uint32_t mramIdx, - uint32_t size) -{ - DPU_ASSERT(dpu_copy_to - (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, - ROUND_UP_TO_MULTIPLE_OF_8(size))); -} - -static void copyFromDPU(struct dpu_set_t dpu, uint32_t mramIdx, - uint8_t *hostPtr, uint32_t size) -{ - DPU_ASSERT(dpu_copy_from - (dpu, DPU_MRAM_HEAP_POINTER_NAME, mramIdx, hostPtr, - ROUND_UP_TO_MULTIPLE_OF_8(size))); -} - -#endif diff --git a/SpMV/support/common.h b/SpMV/include/common.h index 6118814..6118814 100644 --- a/SpMV/support/common.h +++ b/SpMV/include/common.h diff --git a/SpMV/include/dfatool_host.ah b/SpMV/include/dfatool_host.ah new file mode 100644 index 0000000..91d44bd --- /dev/null +++ b/SpMV/include/dfatool_host.ah @@ -0,0 +1,31 @@ +#pragma once + 
+#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + unsigned long n_rows, n_cols, n_nonzero; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(float); + } + + advice call("% input_params(...)"): after() { + printf("[>>] SpMV | n_dpus=%u\n", NR_DPUS); + } + + advice call("% readCOOMatrix(...)") : after() { + struct COOMatrix* c = tjp->result(); + n_rows = c->numRows; + n_cols = c->numCols; + n_nonzero = c->numNonzeros; + printf("[--] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero); + } + + advice execution("% main(...)") : after() { + printf("[<<] SpMV | n_dpus=%u n_rows=%lu n_cols=%lu n_nonzero=%lu\n", NR_DPUS, n_rows, n_cols, n_nonzero); + } +}; diff --git a/SpMV/support/matrix.h b/SpMV/include/matrix.h index ce8745e..ce8745e 100644 --- a/SpMV/support/matrix.h +++ b/SpMV/include/matrix.h diff --git a/SpMV/support/params.h b/SpMV/include/params.h index bf60e79..bf60e79 100644 --- a/SpMV/support/params.h +++ b/SpMV/include/params.h diff --git a/SpMV/support/timer.h b/SpMV/include/timer.h index 7367b11..cb513cb 100644 --- a/SpMV/support/timer.h +++ b/SpMV/include/timer.h @@ -1,10 +1,12 @@ - -#ifndef _TIMER_H_ -#define _TIMER_H_ +#pragma once #include <stdio.h> #include <sys/time.h> +#if DFATOOL_TIMING + +#define dfatool_printf(fmt, ...) do { printf(fmt, __VA_ARGS__); } while (0) + typedef struct Timer { struct timeval startTime; struct timeval endTime; @@ -27,4 +29,26 @@ static double getElapsedTime(Timer timer) timer.startTime.tv_usec) / 1.0e6)); } +#else + +#define dfatool_printf(fmt, ...) 
do {} while (0) + +typedef int Timer; + +static void startTimer(Timer* timer) +{ + (void)timer; +} + +static void stopTimer(Timer* timer) +{ + (void)timer; +} + +static double getElapsedTime(Timer timer) +{ + (void)timer; + return 0.0; +} + #endif diff --git a/SpMV/support/utils.h b/SpMV/include/utils.h index ccd8fbd..ccd8fbd 100644 --- a/SpMV/support/utils.h +++ b/SpMV/include/utils.h diff --git a/SpMV/run-paper-strong-full.sh b/SpMV/run-paper-strong-full.sh deleted file mode 100755 index 09b7085..0000000 --- a/SpMV/run-paper-strong-full.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks SpMV strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -cd data/generate -./replicate ../bcsstk30.mtx 64 ../bcsstk30.mtx.64.mtx -cd ../.. - -# >2048 is not in upstream -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then - # repetition is not part of upstream setup - for i in `seq 1 50`; do - timeout --foreground -k 1m 3m bin/host_code -v 0 -f data/bcsstk30.mtx.64.mtx || true - done - fi - done -done -) | tee log-paper-strong-full.txt - -rm -f data/bcsstk30.mtx.64.mtx diff --git a/SpMV/run-paper-strong-rank.sh b/SpMV/run-paper-strong-rank.sh deleted file mode 100755 index c73a6a0..0000000 --- a/SpMV/run-paper-strong-rank.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks SpMV strong-rank (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >64 are not part of upstream -for nr_dpus in 128 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then - # repetition is not part of upstream setup - for i in `seq 1 50`; do - timeout --foreground -k 1m 3m bin/host_code -v 0 || true - done - fi - done -done -) | tee log-paper-strong-rank.txt diff 
--git a/SpMV/run-paper-weak.sh b/SpMV/run-paper-weak.sh deleted file mode 100755 index 74683cc..0000000 --- a/SpMV/run-paper-weak.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -set -e - -( - -echo "prim-benchmarks SpMV weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 1 4 16 64; do - cd data/generate - make - ./replicate ../bcsstk30.mtx ${nr_dpus} /tmp/bcsstk30.mtx.${nr_dpus}.mtx - cd ../.. - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} verbose=1; then - # repetition is not part of upstream setup - for i in `seq 1 50`; do - timeout --foreground -k 1m 3m bin/host_code -v 0 -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx || true - done - fi - done - rm -f /tmp/bcsstk30.mtx.${nr_dpus}.mtx -done | -) tee log-paper-weak.txt diff --git a/TRNS/Makefile b/TRNS/Makefile index 427a332..302fcd6 100644 --- a/TRNS/Makefile +++ b/TRNS/Makefile @@ -2,7 +2,6 @@ NR_DPUS ?= 1 NR_TASKLETS ?= 16 ENERGY ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) @@ -12,12 +11,12 @@ dfatool_timing ?= 1 HOST_CC := ${CC} -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} +COMMON_FLAGS := -Wall -Wextra -g -Iinclude HOST_FLAGS := ${COMMON_FLAGS} -O3 -march=native `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DENERGY=${ENERGY} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc} DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} ifeq (${aspectc_timing}, 1) - ASPECTC_HOST_FLAGS += -a support/dfatool_host.ah + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah endif ASPECTC_HOST_FLAGS ?= -a0 @@ -39,8 +38,11 @@ all: bin/host_code bin/dpu_code bin: ${QUIET}mkdir -p bin +# cp/rm are needed to work around AspectC++ not liking symlinks bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include ${QUIET}${HOST_CC} -o $@ 
${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} diff --git a/TRNS/benchmark-scripts/ccmcc25-sim.sh b/TRNS/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..a7aa79c --- /dev/null +++ b/TRNS/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +# Args: -m m -n n -o M_ -p N_ +# +# Input: (M_ * m) × (N_ * n) matrix +# Output: (N_* n) × (M_ * m) matrix +# Step 1: transpose (M_ * m) × N_ matrix that consists of tiles of size n +# CPU version: explicit +# DPU version: implicit (M_ * m write operations of #DPUs * n elements to DPUs) +# Step 2: transpose m × n matrix; this happens N_ * M_ times. +# DPU version: Each tasklet transposes a single m × n matrix / tile. +# (16 × 8 tile takes up 1 KiB WRAM) +# Step 3: Transpose M_ × n matrix that consists of tiles of size m. +# +# Note for DPU version: if M_ > #DPUs, steps 1 through 3 are repeated. +# Number of repetitions == ceil(M_ / #DPUS) +# For Hetsim benchmarks, we set M_ == #DPUs to simplify the task graph (no repetitions that depend on the number of available DPUs). +# Just in case, there is also a configuration with M_ == 2048 independent of #DPUs +# +# input size: uint64(DPU)/double(CPU) * M_ * m * N_ * n +# output size: uint64(DPU)/double(CPU) * M_ * m * N_ * n -- on DPU only; CPU version operates in-place +# Upstream DPU version uses int64_t, -p 2048 -o 12288 -x 1 [implicit -m 16 -n 8] +# Upstream CPU version uses double, -p 2556 -o 4096 -m 16 -n 8 and fails with -o 12288 (allocation error) +# +# -p 2048 -o 2048 -m 16 -n 8 -> matrix size: 4 GiB +# -p [64 .. 2304] -o 2048 -m 16 -n 8 -> matrix size: 128 MiB .. 
4.5 GiB + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -w 0 -e 2 -p ${cols} -o ${rows} -m ${tile_rows} -n ${tile_cols} +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sdk${sdk}-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks TRNS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 cols={cols} rows={rows} tile_cols={tile_cols} tile_rows={tile_rows} \ + ::: nr_dpus 1 4 16 32 64 \ + ::: rows 64 128 256 512 \ + ::: cols 64 128 256 512 \ + ::: tile_rows 16 \ + ::: tile_cols 8 \ +>> ${fn}.txt diff --git a/TRNS/benchmark-scripts/ccmcc25.sh b/TRNS/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..7c66306 --- /dev/null +++ b/TRNS/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} dfatool_timing=0 aspectc=1 aspectc_timing=1 + bin/host_code -w 0 -e 4 -p ${cols} -o ${rows} -m ${tile_rows} -n ${tile_cols} +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks TRNS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank=any cols={cols} rows={rows} tile_cols={tile_cols} tile_rows={tile_rows} \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: rows 1024 2048 4096 \ + ::: cols 64 128 256 512 768 1024 1536 2048 2304 \ + ::: tile_rows 16 \ 
+ ::: tile_cols 8 \ + >> ${fn}.txt + +done diff --git a/TRNS/dpu/task.c b/TRNS/dpu/task.c index 0f5e4be..9c0e0a8 100644 --- a/TRNS/dpu/task.c +++ b/TRNS/dpu/task.c @@ -12,7 +12,7 @@ #include <mutex.h> #include <barrier.h> -#include "../support/common.h" +#include "common.h" __host dpu_arguments_t DPU_INPUT_ARGUMENTS; diff --git a/TRNS/host/app.c b/TRNS/host/app.c index 735d2a9..c178a19 100644 --- a/TRNS/host/app.c +++ b/TRNS/host/app.c @@ -30,9 +30,9 @@ extern "C" { #include <assert.h> #include <math.h> -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" +#include "common.h" +#include "timer.h" +#include "params.h" #define XSTR(x) STR(x) #define STR(x) #x @@ -47,7 +47,6 @@ static T* A_host; static T* A_backup; static T* A_result; -struct Params p; unsigned int kernel = 0; // Create input arrays @@ -77,7 +76,7 @@ static void trns_host(T* input, unsigned int A, unsigned int B, unsigned int b){ // Main of the Host Application int main(int argc, char **argv) { - p = input_params(argc, argv); + struct Params p = input_params(argc, argv); struct dpu_set_t dpu_set, dpu; uint32_t nr_of_dpus; diff --git a/TRNS/support/common.h b/TRNS/include/common.h index 6a94c62..6a94c62 100755..100644 --- a/TRNS/support/common.h +++ b/TRNS/include/common.h diff --git a/TRNS/include/dfatool_host.ah b/TRNS/include/dfatool_host.ah new file mode 100644 index 0000000..72978cc --- /dev/null +++ b/TRNS/include/dfatool_host.ah @@ -0,0 +1,36 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + + unsigned int n_rows_outer, n_rows_tile, n_cols_outer, n_cols_tile; + unsigned int element_size; + + virtual int getKernel() { return kernel; } + + DfatoolHostTiming() { + element_size = sizeof(T); + } + + advice call("% input_params(...)") : after() { + Params* p = tjp->result(); + /* + * Input: (M_ * m) × (N_ * n) matrix + */ + n_rows_outer = p->M_; + n_rows_tile = p->m; + 
n_cols_outer = p->N_; + n_cols_tile = p->n; + printf("[>>] TRNS | n_dpus=%u n_rows_outer=%u n_rows_tile=%u n_cols_outer=%u n_cols_tile=%u\n", NR_DPUS, n_rows_outer, n_rows_tile, n_cols_outer, n_cols_tile); + } + + advice call("% trns_host(...)") : after() { + printf("[--] TRNS | n_dpus=%u n_rows_outer=%u n_rows_tile=%u n_cols_outer=%u n_cols_tile=%u\n", NR_DPUS, n_rows_outer, n_rows_tile, n_cols_outer, n_cols_tile); + } + + advice execution("% main(...)") : after() { + printf("[<<] TRNS | n_dpus=%u n_rows_outer=%u n_rows_tile=%u n_cols_outer=%u n_cols_tile=%u\n", NR_DPUS, n_rows_outer, n_rows_tile, n_cols_outer, n_cols_tile); + } +}; diff --git a/TRNS/support/params.h b/TRNS/include/params.h index 6b7e6f2..385490e 100644 --- a/TRNS/support/params.h +++ b/TRNS/include/params.h @@ -21,7 +21,7 @@ static void usage() { "\n -h help" "\n -w <W> # of untimed warmup iterations (default=1)" "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1) scaling (default=0)" + "\n -x <X> Weak (0) or strong (1) scaling (default=1)" "\n" "\nBenchmark-specific options:" "\n -m <I> m (default=16 elements)" @@ -39,7 +39,7 @@ struct Params input_params(int argc, char **argv) { p.n = 8; p.n_warmup = 1; p.n_reps = 3; - p.exp = 0; + p.exp = 1; int opt; while((opt = getopt(argc, argv, "hw:e:x:m:n:o:p:")) >= 0) { diff --git a/TRNS/include/timer.h b/TRNS/include/timer.h new file mode 100644 index 0000000..8d5c3d5 --- /dev/null +++ b/TRNS/include/timer.h @@ -0,0 +1,5 @@ +#pragma once + +#define N_TIMERS 10 +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/TRNS/run-fgbs24a.sh b/TRNS/run-fgbs24a.sh deleted file mode 100755 index 6ba8993..0000000 --- a/TRNS/run-fgbs24a.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -e - -mkdir -p $(hostname) - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 
262144 elements - -( - -echo "prim-benchmarks TRNS strong-full (dfatool fgbs24a edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 2304 2048 2543; do - for nr_tasklets in 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then - # upstream uses -p 2048, but then the number of DPUs is always constant... - timeout --foreground -k 1m 180m bin/host_code -w 0 -e 100 -p $nr_dpus -o 12288 -x 1 || true - fi - done -done -echo "Completed at $(date)" -) | tee "$(hostname)/fgbs24a.txt" diff --git a/TRNS/run-paper-strong-full.sh b/TRNS/run-paper-strong-full.sh deleted file mode 100755 index 9d3792c..0000000 --- a/TRNS/run-paper-strong-full.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks TRNS strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >2048 is not in upstream -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then - # upstream uses -p 2048, but then the number of DPUs is always constant... 
- timeout --foreground -k 1m 90m bin/host_code -w 0 -e 40 -p $nr_dpus -o 12288 -x 1 || true - fi - done -done - -echo "Completed at $(date)" - -) | tee "log-$(hostname)-prim-strong-full.txt" diff --git a/TRNS/run-paper-strong-rank.sh b/TRNS/run-paper-strong-rank.sh deleted file mode 100755 index f5f00cb..0000000 --- a/TRNS/run-paper-strong-rank.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks TRNS strong-rank (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then - # upstream uses -p 64, but then the number of DPUs is always constant... - timeout --foreground -k 1m 60m bin/host_code -w 0 -e 40 -p $nr_dpus -o 12288 -x 1 || true - fi - done -done - -echo "Completed at $(date)" - -) | tee "log-$(hostname)-prim-strong-rank.txt" diff --git a/TRNS/run-paper-weak.sh b/TRNS/run-paper-weak.sh deleted file mode 100755 index f02d7d6..0000000 --- a/TRNS/run-paper-weak.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks TRNS weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then - timeout --foreground -k 1m 60m bin/host_code -w 0 -e 40 -p 1 -o 12288 -x 0 || true - fi - done -done | tee log-paper-weak.txt - -echo "Completed at $(date)" - -) | tee 
"log-$(hostname)-prim-weak.txt" diff --git a/TRNS/run-rank.sh b/TRNS/run-rank.sh deleted file mode 100755 index 00f6898..0000000 --- a/TRNS/run-rank.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i: input size (number of elements, not number of bytes!) - -( - -echo "prim-benchmarks TRNS (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 1 4 8 16 32 48 64; do - for nr_tasklets in 8 12 16; do - # 12288 run-paper-weak, run-paper-strong-full - for i in 12288; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then - # upstream uses -p 2048 in strong-full, but then the number of DPUs is always constant... - timeout --foreground -k 1m 90m bin/host_code -w 0 -e 40 -p 1 -o 12288 -x 0 || true - fi - done - done -done - -echo "Completed at $(date)" - -) | tee "log-$(hostname)-rank.txt" diff --git a/TRNS/run.sh b/TRNS/run.sh deleted file mode 100755 index 8d574a9..0000000 --- a/TRNS/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i: input size (number of elements, not number of bytes!) - -( - -echo "prim-benchmarks TRNS (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -for nr_dpus in 2542 2304 1 4 8 16 32 64 128 256 512 768 1024 1536 2048; do - for nr_tasklets in 8 12 16; do - # 12288 run-paper-weak, run-paper-strong-full - for i in 12288; do - echo - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets}; then - # upstream uses -p 2048 in strong-full, but then the number of DPUs is always constant... 
- timeout --foreground -k 1m 90m bin/host_code -w 0 -e 40 -p $nr_dpus -o 12288 -x 1 || true - fi - done - done -done - -echo "Completed at $(date)" - -) | tee "log-$(hostname).txt" diff --git a/TRNS/support/timer.h b/TRNS/support/timer.h deleted file mode 100755 index e04a202..0000000 --- a/TRNS/support/timer.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2016 University of Cordoba and University of Illinois - * All rights reserved. - * - * Developed by: IMPACT Research Group - * University of Cordoba and University of Illinois - * http://impact.crhc.illinois.edu/ - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * > Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * > Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. - * > Neither the names of IMPACT Research Group, University of Cordoba, - * University of Illinois nor the names of its contributors may be used - * to endorse or promote products derived from this Software without - * specific prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - */ - -#include <sys/time.h> - -#if DFATOOL_TIMING - -typedef struct Timer { - - struct timeval startTime[10]; - struct timeval stopTime[10]; - double time[10]; - -} Timer; - -#define dfatool_printf(fmt, ...) do { printf(fmt, __VA_ARGS__); } while (0) - -void start(Timer *timer, int i, int rep) -{ - if (rep == 0) { - timer->time[i] = 0.0; - } - gettimeofday(&timer->startTime[i], NULL); -} - -void stop(Timer *timer, int i) -{ - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += - (timer->stopTime[i].tv_sec - - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); -} - -#else - -#define dfatool_printf(fmt, ...) do {} while (0) - -typedef int Timer; - -void start(Timer *timer, int i, int rep) -{ - (void)timer; - (void)i; - (void)rep; -} - -void stop(Timer *timer, int i) -{ - (void)timer; - (void)i; -} - -#endif diff --git a/TS/Makefile b/TS/Makefile index ac081bd..2fce611 100644 --- a/TS/Makefile +++ b/TS/Makefile @@ -5,14 +5,31 @@ WITH_ALLOC_OVERHEAD ?= 0 WITH_LOAD_OVERHEAD ?= 0 WITH_FREE_OVERHEAD ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -DBL=${BL} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -lm +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS := -Wall -Wextra -g -Iinclude -DBL=${BL} +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` 
-DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc} -lm DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ ifdef verbose @@ -25,7 +42,9 @@ bin: ${QUIET}mkdir -p bin bin/ts_host: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah bin/ts_dpu: ${DPU_SOURCES} ${COMMON_INCLUDES} bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} diff --git a/TS/benchmark-scripts/ccmcc25-sim.sh b/TS/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..0df03d9 --- /dev/null +++ b/TS/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/ts_host -w 0 -e 5 -n ${ts_size} 2>&1 +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks TS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 ts_size={ts_size} \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: ts_size 2048 4096 8192 16384 32768 \ +>> ${fn}.txt diff 
--git a/TS/benchmark-scripts/ccmcc25.sh b/TS/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..74c8371 --- /dev/null +++ b/TS/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/ts_host -w 0 -e 50 -n ${ts_size} 2>&1 +} + +export -f run_benchmark_nmc + +for sdk in 2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks TS $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=8 numa_rank=any ts_size={ts_size} \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: ts_size 8388608 16777216 33554432 67108864 \ + >> ${fn}.txt + +done diff --git a/TS/host/app.c b/TS/host/app.c index a19232b..bfa14df 100644 --- a/TS/host/app.c +++ b/TS/host/app.c @@ -7,8 +7,18 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> + +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> + +#if ASPECTC +} +#endif + #include <unistd.h> #include <getopt.h> #include <assert.h> @@ -86,26 +96,26 @@ static void compute_ts_statistics(unsigned int timeSeriesLength, unsigned int ProfileLength, unsigned int queryLength) { - double *ACumSum = malloc(sizeof(double) * timeSeriesLength); + double *ACumSum = (double*)malloc(sizeof(double) * timeSeriesLength); ACumSum[0] = tSeries[0]; for (uint64_t i = 1; i < timeSeriesLength; i++) ACumSum[i] = tSeries[i] + ACumSum[i - 1]; - double *ASqCumSum = malloc(sizeof(double) * timeSeriesLength); + double *ASqCumSum = (double*)malloc(sizeof(double) * timeSeriesLength); ASqCumSum[0] = tSeries[0] * tSeries[0]; for (uint64_t i = 1; i 
< timeSeriesLength; i++) ASqCumSum[i] = tSeries[i] * tSeries[i] + ASqCumSum[i - 1]; - double *ASum = malloc(sizeof(double) * ProfileLength); + double *ASum = (double*)malloc(sizeof(double) * ProfileLength); ASum[0] = ACumSum[queryLength - 1]; for (uint64_t i = 0; i < timeSeriesLength - queryLength; i++) ASum[i + 1] = ACumSum[queryLength + i] - ACumSum[i]; - double *ASumSq = malloc(sizeof(double) * ProfileLength); + double *ASumSq = (double*)malloc(sizeof(double) * ProfileLength); ASumSq[0] = ASqCumSum[queryLength - 1]; for (uint64_t i = 0; i < timeSeriesLength - queryLength; i++) ASumSq[i + 1] = ASqCumSum[queryLength + i] - ASqCumSum[i]; - double *AMean_tmp = malloc(sizeof(double) * ProfileLength); + double *AMean_tmp = (double*)malloc(sizeof(double) * ProfileLength); for (uint64_t i = 0; i < ProfileLength; i++) AMean_tmp[i] = ASum[i] / queryLength; - double *ASigmaSq = malloc(sizeof(double) * ProfileLength); + double *ASigmaSq = (double*)malloc(sizeof(double) * ProfileLength); for (uint64_t i = 0; i < ProfileLength; i++) ASigmaSq[i] = ASumSq[i] / queryLength - AMean[i] * AMean[i]; for (uint64_t i = 0; i < ProfileLength; i++) { @@ -136,18 +146,24 @@ int main(int argc, char **argv) // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); +#if DFATOOL_TIMING timer.time[0] = 0; // alloc #endif +#endif #if !WITH_LOAD_OVERHEAD DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); assert(nr_of_dpus == NR_DPUS); +#if DFATOOL_TIMING timer.time[1] = 0; // load #endif +#endif #if !WITH_FREE_OVERHEAD +#if DFATOOL_TIMING timer.time[6] = 0; // free #endif +#endif #if ENERGY struct dpu_probe_t probe; @@ -195,8 +211,8 @@ int main(int argc, char **argv) unsigned int kernel = 0; dpu_arguments_t input_arguments = - { ts_size, query_length, query_mean, query_std, slice_per_dpu, 0, - kernel + { (uint32_t)ts_size, query_length, query_mean, 
query_std, slice_per_dpu, 0, + (enum kernels) kernel }; uint32_t mem_offset; @@ -208,6 +224,15 @@ int main(int argc, char **argv) for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) { + if (rep >= p.n_warmup) { + start(&timer, 6, 0); + } + streamp(tSeries, AMean, ASigma, ts_size - query_length - 1, + query, query_length, query_mean, query_std); + if (rep >= p.n_warmup) { + stop(&timer, 6); + } + #if WITH_ALLOC_OVERHEAD if (rep >= p.n_warmup) { start(&timer, 0, 0); @@ -234,16 +259,10 @@ int main(int argc, char **argv) start(&timer, 2, 0); } uint32_t i = 0; - - DPU_FOREACH(dpu_set, dpu) { - input_arguments.exclusion_zone = 0; - - DPU_ASSERT(dpu_copy_to - (dpu, "DPU_INPUT_ARGUMENTS", 0, - (const void *)&input_arguments, - sizeof(input_arguments))); - i++; + DPU_FOREACH(dpu_set, dpu, i) { + DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments)); } + DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT)); i = 0; mem_offset = 0; @@ -386,36 +405,27 @@ int main(int argc, char **argv) #endif #endif - if (rep >= p.n_warmup) { - start(&timer, 6, 0); - } - streamp(tSeries, AMean, ASigma, ts_size - query_length - 1, - query, query_length, query_mean, query_std); - if (rep >= p.n_warmup) { - stop(&timer, 6); - } - int status = (minHost == result.minValue); if (status) { printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n"); if (rep >= p.n_warmup) { - printf + dfatool_printf ("[::] TS UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu", NR_DPUS, nr_of_ranks, NR_TASKLETS, XSTR(DTYPE), BLOCK_SIZE, ts_size); - printf + dfatool_printf (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d ", WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD); - printf("| latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f latency_cpu_us=%f ", timer.time[0], // alloc + dfatool_printf("| 
latency_alloc_us=%f latency_load_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f latency_cpu_us=%f ", timer.time[0], // alloc timer.time[1], // load timer.time[2], // write timer.time[3], // kernel timer.time[4], // read timer.time[5], // free timer.time[6]); // CPU - printf + dfatool_printf (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", ts_size * sizeof(DTYPE) / timer.time[6], ts_size * sizeof(DTYPE) / (timer.time[3]), @@ -425,7 +435,7 @@ int main(int argc, char **argv) timer.time[3] + timer.time[4] + timer.time[5])); - printf + dfatool_printf (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", ts_size * sizeof(DTYPE) / (timer.time[2] + timer.time[3] + @@ -439,14 +449,14 @@ int main(int argc, char **argv) timer.time[2] + timer.time[3] + timer.time[4])); - printf + dfatool_printf (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", ts_size / timer.time[6], ts_size / (timer.time[3]), ts_size / (timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3] + timer.time[4] + timer.time[5])); - printf + dfatool_printf (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", ts_size / (timer.time[2] + timer.time[3] + timer.time[4]), diff --git a/TS/support/common.h b/TS/include/common.h index 7585b90..6d37bdc 100755..100644 --- a/TS/support/common.h +++ b/TS/include/common.h @@ -14,6 +14,11 @@ #define DTYPE int32_t #define DTYPE_MAX INT32_MAX +enum kernels { + kernel1 = 0, + nr_kernels = 1, +} kernel; + typedef struct { uint32_t ts_length; uint32_t query_length; @@ -21,10 +26,7 @@ typedef struct { DTYPE query_std; uint32_t slice_per_dpu; int32_t exclusion_zone; - enum kernels { - kernel1 = 0, - nr_kernels = 1, - } kernel; + enum kernels kernel; } dpu_arguments_t; typedef struct { diff --git a/TS/include/dfatool_host.ah b/TS/include/dfatool_host.ah new file mode 
100644 index 0000000..4192c73 --- /dev/null +++ b/TS/include/dfatool_host.ah @@ -0,0 +1,31 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + + unsigned long ts_size, query_length; + unsigned int element_size; + + virtual int getKernel() { return kernel; } + + DfatoolHostTiming() { + element_size = sizeof(DTYPE); + } + + advice call("% input_params(...)") : after() { + Params* p = tjp->result(); + ts_size = p->input_size_n; + query_length = p->input_size_m; + printf("[>>] TS | n_dpus=%u n_elements_ts=%lu n_elements_query=%lu\n", NR_DPUS, ts_size, query_length); + } + + advice call("% streamp(...)") : before() { + printf("[--] TS | n_dpus=%u n_elements_ts=%lu n_elements_query=%lu\n", NR_DPUS, ts_size, query_length); + } + + advice execution("% main(...)") : after() { + printf("[<<] TS | n_dpus=%u n_elements_ts=%lu n_elements_query=%lu\n", NR_DPUS, ts_size, query_length); + } +}; diff --git a/TS/support/params.h b/TS/include/params.h index b7d9763..b7d9763 100644 --- a/TS/support/params.h +++ b/TS/include/params.h diff --git a/TS/include/timer.h b/TS/include/timer.h new file mode 100644 index 0000000..7b80823 --- /dev/null +++ b/TS/include/timer.h @@ -0,0 +1,5 @@ +#pragma once + +#define N_TIMERS 7 +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/TS/run-paper-strong-full.sh b/TS/run-paper-strong-full.sh deleted file mode 100755 index 5b7656d..0000000 --- a/TS/run-paper-strong-full.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks TS strong-full (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >2048 is not part of upstream -# 12 tasklets are not part of upstream (code does not 
work with 16…) -for nr_dpus in 2543 2304 256 512 1024 2048; do - for nr_tasklets in 1 2 4 8 12 16; do - echo - # upstream code did not respect $BL in the makefile and used 256B (BL=8) instead. - # This appears to be faster than BL=10. - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8; then - timeout --foreground -k 1m 30m bin/ts_host -w 0 -e 100 -n 33554432 || true - fi - done -done -) | tee log-paper-strong-full.txt diff --git a/TS/run-paper-strong-rank.sh b/TS/run-paper-strong-rank.sh deleted file mode 100755 index 58ad641..0000000 --- a/TS/run-paper-strong-rank.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks TS strong-rank (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# >64 are not part of upstream config space -for nr_dpus in 128 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - # upstream code did not respect $BL in the makefile and used 256B (BL=8) instead. - # BL=10 appears to be slightly faster. 
- if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10; then - timeout --foreground -k 1m 60m bin/ts_host -w 0 -e 50 -n 524288 || true - fi - done -done -) | tee log-paper-strong-rank.txt diff --git a/TS/run-paper-weak.sh b/TS/run-paper-weak.sh deleted file mode 100755 index 64892f4..0000000 --- a/TS/run-paper-weak.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -e - -# BL: use 2^(BL) B blocks for MRAM <-> WRAM transfers on PIM module -# T: data type -# -w: number of un-timed warmup iterations -# -e: number of timed iterations -# -i; ignored, always uses 262144 elements - -( - -echo "prim-benchmarks TS weak (dfatool edition)" -echo "Started at $(date)" -echo "Revision $(git describe --always)" - -# 256 and 512 are not part of upstream -for nr_dpus in 1 4 16 64; do - for nr_tasklets in 1 2 4 8 16; do - echo - # upstream code did not respect $BL in the makefile and used 256B (BL=8) instead. - # BL=10 appears to be slightly faster. - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=8; then - i=$(( nr_dpus * 524288 )) - timeout --foreground -k 1m 30m bin/ts_host -w 0 -e 100 -n $i || true - fi - done -done -) | tee log-paper-weak.txt diff --git a/TS/support/timer.h b/TS/support/timer.h deleted file mode 100755 index c569de7..0000000 --- a/TS/support/timer.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2016 University of Cordoba and University of Illinois - * All rights reserved. 
- * - * Developed by: IMPACT Research Group - * University of Cordoba and University of Illinois - * http://impact.crhc.illinois.edu/ - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * > Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * > Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. - * > Neither the names of IMPACT Research Group, University of Cordoba, - * University of Illinois nor the names of its contributors may be used - * to endorse or promote products derived from this Software without - * specific prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. 
- * - */ - -#include <sys/time.h> - -typedef struct Timer { - - struct timeval startTime[7]; - struct timeval stopTime[7]; - double time[7]; - -} Timer; - -void start(Timer *timer, int i, int rep) -{ - if (rep == 0) { - timer->time[i] = 0.0; - } - gettimeofday(&timer->startTime[i], NULL); -} - -void stop(Timer *timer, int i) -{ - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += - (timer->stopTime[i].tv_sec - - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); -} - -void print(Timer *timer, int i, int REP) -{ - printf("%f\t", timer->time[i] / (1000 * REP)); -} - -void printall(Timer *timer, int maxt) -{ - for (int i = 0; i <= maxt; i++) { - printf(" timer%d_us=%f", i, timer->time[i]); - } - printf("\n"); -} diff --git a/VA/Makefile b/VA/Makefile index 040dd4a..a67c600 100644 --- a/VA/Makefile +++ b/VA/Makefile @@ -8,17 +8,34 @@ WITH_LOAD_OVERHEAD ?= 0 WITH_FREE_OVERHEAD ?= 0 WITH_DPUINFO ?= 0 -COMMON_INCLUDES := support HOST_SOURCES := $(wildcard host/*.c) DPU_SOURCES := $(wildcard dpu/*.c) -COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES} -HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} +aspectc ?= 0 +aspectc_timing ?= 0 +dfatool_timing ?= 1 + +HOST_CC := ${CC} + +COMMON_FLAGS := -Wall -Wextra -g -Iinclude +HOST_FLAGS := ${COMMON_FLAGS} -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY} -DWITH_ALLOC_OVERHEAD=${WITH_ALLOC_OVERHEAD} -DWITH_LOAD_OVERHEAD=${WITH_LOAD_OVERHEAD} -DWITH_FREE_OVERHEAD=${WITH_FREE_OVERHEAD} -DWITH_DPUINFO=${WITH_DPUINFO} -DDFATOOL_TIMING=${dfatool_timing} -DASPECTC=${aspectc} DPU_FLAGS := ${COMMON_FLAGS} -O2 
-DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE} +ifeq (${aspectc_timing}, 1) + ASPECTC_HOST_FLAGS += -ainclude/dfatool_host_dpu.ah -ainclude/dfatool_host.ah +endif + +ASPECTC_HOST_FLAGS ?= -a0 + +ifeq (${aspectc}, 1) + HOST_CC = ag++ -r repo.acp -v 0 ${ASPECTC_HOST_FLAGS} --c_compiler ${UPMEM_HOME}/bin/clang++ -p . --Xcompiler +else + HOST_FLAGS += -std=c11 +endif + QUIET = @ -ifdef verbose +ifeq (${verbose}, 1) QUIET = endif @@ -27,10 +44,13 @@ all: bin/host_code bin/dpu_code bin: ${QUIET}mkdir -p bin -bin/host_code: ${HOST_SOURCES} ${COMMON_INCLUDES} bin - ${QUIET}${CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} +# cp/rm are needed to work around AspectC++ not liking symlinks +bin/host_code: ${HOST_SOURCES} include bin + ${QUIET}cp ../include/dfatool_host_dpu.ah include + ${QUIET}${HOST_CC} -o $@ ${HOST_SOURCES} ${HOST_FLAGS} + ${QUIET}rm -f include/dfatool_host_dpu.ah -bin/dpu_code: ${DPU_SOURCES} ${COMMON_INCLUDES} bin +bin/dpu_code: ${DPU_SOURCES} include bin ${QUIET}dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES} clean: diff --git a/VA/baselines/cpu/Makefile b/VA/baselines/cpu/Makefile index 04aacb6..279b0f3 100644 --- a/VA/baselines/cpu/Makefile +++ b/VA/baselines/cpu/Makefile @@ -5,7 +5,7 @@ nop_sync ?= 0 numa ?= 0 numa_memcpy ?= 0 -CFLAGS = +CFLAGS = -DDFATOOL_TIMING=1 LDFLAGS = ifeq (${debug}, 1) diff --git a/VA/baselines/cpu/app_baseline.c b/VA/baselines/cpu/app_baseline.c index 7975200..fe5125d 100644 --- a/VA/baselines/cpu/app_baseline.c +++ b/VA/baselines/cpu/app_baseline.c @@ -15,7 +15,7 @@ #include <omp.h> #if WITH_BENCHMARK -#include "../../support/timer.h" +#include "../../include/timer.h" #else #define start(...) #define stop(...) 
@@ -109,7 +109,7 @@ struct Params input_params(int argc, char **argv) p.n_warmup = 1; p.n_reps = 3; p.exp = 1; - p.n_threads = 5; + p.n_threads = 8; #if NUMA p.bitmask_in = NULL; p.bitmask_out = NULL; @@ -213,9 +213,11 @@ int main(int argc, char **argv) C = (T *) malloc(input_size * sizeof(T)); #endif + omp_set_num_threads(p.n_threads); + #pragma omp parallel for for (unsigned long i = 0; i < input_size; i++) { - A[i] = (T) (rand()); - B[i] = (T) (rand()); + A[i] = (T) i % (1<<31) + 5; + B[i] = (T) i % (1<<31) + 6; } #if NUMA diff --git a/VA/benchmark-scripts/ccmcc25-sim.sh b/VA/benchmark-scripts/ccmcc25-sim.sh new file mode 100755 index 0000000..386cf90 --- /dev/null +++ b/VA/benchmark-scripts/ccmcc25-sim.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -w 0 -e 5 -i ${input_size} +} + +export -f run_benchmark_nmc + +fn=log/$(hostname)/ccmcc25-sim + +source ~/lib/local/upmem/upmem-2025.1.0-Linux-x86_64/upmem_env.sh simulator + +echo "prim-benchmarks VA $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} \ + ::: nr_dpus 1 2 4 8 16 32 48 64 \ + ::: input_size 327680 655360 1310720 2621440 \ +>> ${fn}.txt diff --git a/VA/benchmark-scripts/ccmcc25.sh b/VA/benchmark-scripts/ccmcc25.sh new file mode 100755 index 0000000..f6d441d --- /dev/null +++ b/VA/benchmark-scripts/ccmcc25.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +mkdir -p log/$(hostname) + +run_benchmark_nmc() { + local "$@" + set -e + sudo limit_ranks_to_numa_node ${numa_rank} + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} BL=10 \ + aspectc=1 aspectc_timing=1 dfatool_timing=0 + bin/host_code -w 0 -e 50 -i ${input_size} +} + +export -f run_benchmark_nmc + +for sdk in 
2023.2.0 2024.1.0 2024.2.0 2025.1.0; do + + fn=log/$(hostname)/ccmcc25-sdk${sdk} + + source /opt/upmem/upmem-${sdk}-Linux-x86_64/upmem_env.sh + + echo "prim-benchmarks VA $(git describe --all --long) $(git rev-parse HEAD) $(date -R)" >> ${fn}.txt + + parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 input_size={input_size} numa_rank={numa_rank} \ + ::: numa_rank any \ + ::: nr_dpus 64 128 256 512 768 1024 1536 2048 2304 \ + ::: input_size 83886080 167772160 335544320 671088640 \ + >> ${fn}.txt + +done diff --git a/VA/dpu/task.c b/VA/dpu/task.c index 9622911..91b1176 100644 --- a/VA/dpu/task.c +++ b/VA/dpu/task.c @@ -10,7 +10,7 @@ #include <perfcounter.h> #include <barrier.h> -#include "../support/common.h" +#include "common.h" __host dpu_arguments_t DPU_INPUT_ARGUMENTS; diff --git a/VA/host/app.c b/VA/host/app.c index 1a2cdfd..27a64f2 100644 --- a/VA/host/app.c +++ b/VA/host/app.c @@ -7,15 +7,31 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> + +#if ASPECTC +extern "C" { +#endif + #include <dpu.h> #include <dpu_log.h> +#include <dpu_management.h> +#include <dpu_target_macros.h> + +#if ENERGY +#include <dpu_probe.h> +#endif + +#if ASPECTC +} +#endif + #include <unistd.h> #include <getopt.h> #include <assert.h> -#include "../support/common.h" -#include "../support/timer.h" -#include "../support/params.h" +#include "common.h" +#include "timer.h" +#include "params.h" // Define the DPU Binary path as DPU_BINARY here #ifndef DPU_BINARY @@ -25,13 +41,6 @@ #define XSTR(x) STR(x) #define STR(x) #x -#if ENERGY -#include <dpu_probe.h> -#endif - -#include <dpu_management.h> -#include <dpu_target_macros.h> - // Pointer declaration static T *A; static T *B; @@ -39,19 +48,19 @@ static T *C; static T *C2; // Create input arrays -static void read_input(T *A, T *B, unsigned int nr_elements) +static void read_input(T *A, T *B, unsigned long int nr_elements) { srand(0); - for (unsigned int i = 0; i < 
nr_elements; i++) { + for (unsigned long int i = 0; i < nr_elements; i++) { A[i] = (T) (rand()); B[i] = (T) (rand()); } } // Compute output in the host -static void vector_addition_host(T *C, T *A, T *B, unsigned int nr_elements) +static void vector_addition_host(T *C, T *A, T *B, unsigned long int nr_elements) { - for (unsigned int i = 0; i < nr_elements; i++) { + for (unsigned long int i = 0; i < nr_elements; i++) { C[i] = A[i] + B[i]; } } @@ -79,31 +88,37 @@ int main(int argc, char **argv) // Allocate DPUs and load binary #if !WITH_ALLOC_OVERHEAD DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); +#if DFATOOL_TIMING timer.time[0] = 0; // alloc #endif +#endif #if !WITH_LOAD_OVERHEAD DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus)); DPU_ASSERT(dpu_get_nr_ranks(dpu_set, &nr_of_ranks)); assert(nr_of_dpus == NR_DPUS); +#if DFATOOL_TIMING timer.time[1] = 0; // load #endif +#endif #if !WITH_FREE_OVERHEAD +#if DFATOOL_TIMING timer.time[6] = 0; // free #endif +#endif unsigned int i = 0; - const unsigned int input_size = + const unsigned long int input_size = p.exp == 0 ? p.input_size * NR_DPUS : p.input_size; - const unsigned int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned - const unsigned int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.) - const unsigned int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned + const unsigned long int input_size_8bytes = ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned + const unsigned long int input_size_dpu = divceil(input_size, NR_DPUS); // Input size per DPU (max.) + const unsigned long int input_size_dpu_8bytes = ((input_size_dpu * sizeof(T)) % 8) != 0 ? 
roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned // Input/output allocation - A = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); - B = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); - C = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); - C2 = malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + A = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + B = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + C = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); + C2 = (T*)malloc(input_size_dpu_8bytes * NR_DPUS * sizeof(T)); T *bufferA = A; T *bufferB = B; T *bufferC = C2; @@ -192,14 +207,14 @@ int main(int argc, char **argv) input_size_dpu_8bytes * sizeof(T); input_arguments[i].transfer_size = input_size_dpu_8bytes * sizeof(T); - input_arguments[i].kernel = kernel; + input_arguments[i].kernel = (enum kernels)kernel; } input_arguments[nr_of_dpus - 1].size = (input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS - 1)) * sizeof(T); input_arguments[nr_of_dpus - 1].transfer_size = input_size_dpu_8bytes * sizeof(T); - input_arguments[nr_of_dpus - 1].kernel = kernel; + input_arguments[nr_of_dpus - 1].kernel = (enum kernels)kernel; // Copy input arrays i = 0; @@ -306,22 +321,22 @@ int main(int argc, char **argv) printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n"); if (rep >= p.n_warmup) { - printf - ("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%d n_elements_per_dpu=%d", + dfatool_printf + ("[::] VA-UPMEM | n_dpus=%d n_ranks=%d n_tasklets=%d e_type=%s block_size_B=%d n_elements=%lu n_elements_per_dpu=%lu", nr_of_dpus, nr_of_ranks, NR_TASKLETS, XSTR(T), BLOCK_SIZE, input_size, input_size / NR_DPUS); - printf + dfatool_printf (" b_with_alloc_overhead=%d b_with_load_overhead=%d b_with_free_overhead=%d numa_node_rank=%d ", WITH_ALLOC_OVERHEAD, WITH_LOAD_OVERHEAD, WITH_FREE_OVERHEAD, numa_node_rank); - printf + dfatool_printf ("| 
latency_alloc_us=%f latency_load_us=%f latency_cpu_us=%f latency_write_us=%f latency_kernel_us=%f latency_read_us=%f latency_free_us=%f", timer.time[0], timer.time[1], timer.time[2], timer.time[3], timer.time[4], timer.time[5], timer.time[6]); - printf + dfatool_printf (" throughput_cpu_MBps=%f throughput_upmem_kernel_MBps=%f throughput_upmem_total_MBps=%f", input_size * 3 * sizeof(T) / timer.time[2], input_size * 3 * sizeof(T) / @@ -330,7 +345,7 @@ int main(int argc, char **argv) (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5] + timer.time[6])); - printf + dfatool_printf (" throughput_upmem_wxr_MBps=%f throughput_upmem_lwxr_MBps=%f throughput_upmem_alwxr_MBps=%f", input_size * 3 * sizeof(T) / (timer.time[3] + timer.time[4] + @@ -342,7 +357,7 @@ int main(int argc, char **argv) (timer.time[0] + timer.time[1] + timer.time[3] + timer.time[4] + timer.time[5])); - printf + dfatool_printf (" throughput_cpu_MOpps=%f throughput_upmem_kernel_MOpps=%f throughput_upmem_total_MOpps=%f", input_size / timer.time[2], input_size / (timer.time[4]), @@ -352,7 +367,7 @@ int main(int argc, char **argv) timer.time[4] + timer.time[5] + timer.time[6])); - printf + dfatool_printf (" throughput_upmem_wxr_MOpps=%f throughput_upmem_lwxr_MOpps=%f throughput_upmem_alwxr_MOpps=%f\n", input_size / (timer.time[3] + timer.time[4] + diff --git a/VA/support/common.h b/VA/include/common.h index cee09e2..6ce6e23 100755..100644 --- a/VA/support/common.h +++ b/VA/include/common.h @@ -1,14 +1,20 @@ #ifndef _COMMON_H_ #define _COMMON_H_ +enum kernels { + kernel1 = 0, + nr_kernels = 1, +}; + // Structures used by both the host and the dpu to communicate information typedef struct { + /* + * Size per DPU cannot exceed 32 bit, as each DPU only has 64 MiB of memory + * (i.e., only needs 26 bit for addressing). 
+ */ uint32_t size; uint32_t transfer_size; - enum kernels { - kernel1 = 0, - nr_kernels = 1, - } kernel; + enum kernels kernel; } dpu_arguments_t; // Transfer size between MRAM and WRAM diff --git a/VA/include/dfatool_host.ah b/VA/include/dfatool_host.ah new file mode 100644 index 0000000..e74f466 --- /dev/null +++ b/VA/include/dfatool_host.ah @@ -0,0 +1,29 @@ +#pragma once + +#include <sys/time.h> +#include "dfatool_host_dpu.ah" + +aspect DfatoolHostTiming : public DfatoolHostDPUTiming { + unsigned long n_rows; + unsigned int element_size; + + virtual int getKernel() { return 1; } + + DfatoolHostTiming() { + element_size = sizeof(T); + } + + advice call("% input_params(...)") : after() { + Params* p = tjp->result(); + n_rows = p->input_size; + printf("[>>] VA | n_dpus=%u n_rows=%lu\n", NR_DPUS, n_rows); + } + + advice call("% vector_addition_host(...)") : after() { + printf("[--] VA | n_dpus=%u n_rows=%lu\n", n_dpus, n_rows); + } + + advice execution("% main(...)") : after() { + printf("[<<] VA | n_dpus=%u n_rows=%lu\n", NR_DPUS, n_rows); + } +}; diff --git a/VA/support/params.h b/VA/include/params.h index 47c10ef..31327d8 100644 --- a/VA/support/params.h +++ b/VA/include/params.h @@ -4,7 +4,7 @@ #include "common.h" typedef struct Params { - unsigned int input_size; + unsigned long int input_size; int n_warmup; int n_reps; int exp; @@ -19,7 +19,7 @@ static void usage() "\n -h help" "\n -w <W> # of untimed warmup iterations (default=1)" "\n -e <E> # of timed repetition iterations (default=3)" - "\n -x <X> Weak (0) or strong (1) scaling (default=0)" + "\n -x <X> Weak (0) or strong (1) scaling (default=1)" "\n" "\nBenchmark-specific options:" "\n -i <I> input size (default=2621440 elements)" "\n"); @@ -31,7 +31,7 @@ struct Params input_params(int argc, char **argv) p.input_size = 2621440; p.n_warmup = 1; p.n_reps = 3; - p.exp = 0; + p.exp = 1; int opt; while ((opt = getopt(argc, argv, "hi:w:e:x:")) >= 0) { @@ -41,7 +41,7 @@ struct Params input_params(int argc, char 
**argv) exit(0); break; case 'i': - p.input_size = atoi(optarg); + p.input_size = atol(optarg); break; case 'w': p.n_warmup = atoi(optarg); diff --git a/VA/include/timer.h b/VA/include/timer.h new file mode 100644 index 0000000..7b80823 --- /dev/null +++ b/VA/include/timer.h @@ -0,0 +1,5 @@ +#pragma once + +#define N_TIMERS 7 +#include "../../include/timer_base.h" +#undef N_TIMERS diff --git a/VA/support/timer.h b/VA/support/timer.h deleted file mode 100755 index df68334..0000000 --- a/VA/support/timer.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2016 University of Cordoba and University of Illinois - * All rights reserved. - * - * Developed by: IMPACT Research Group - * University of Cordoba and University of Illinois - * http://impact.crhc.illinois.edu/ - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * > Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * > Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. - * > Neither the names of IMPACT Research Group, University of Cordoba, - * University of Illinois nor the names of its contributors may be used - * to endorse or promote products derived from this Software without - * specific prior written permission. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - */ - -#include <sys/time.h> - -typedef struct Timer { - - struct timeval startTime[7]; - struct timeval stopTime[7]; - double time[7]; - -} Timer; - -void start(Timer *timer, int i, int rep) -{ - if (rep == 0) { - timer->time[i] = 0.0; - } - gettimeofday(&timer->startTime[i], NULL); -} - -void stop(Timer *timer, int i) -{ - gettimeofday(&timer->stopTime[i], NULL); - timer->time[i] += - (timer->stopTime[i].tv_sec - - timer->startTime[i].tv_sec) * 1000000.0 + - (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); -} - -void print(Timer *timer, int i, int REP) -{ - printf("Time (ms): %f\t", timer->time[i] / (1000 * REP)); -} - -void printall(Timer *timer, int maxt) -{ - for (int i = 0; i <= maxt; i++) { - printf(" timer%d_us=%f", i, timer->time[i]); - } - printf("\n"); -} diff --git a/TRNS/support/dfatool_host.ah b/include/dfatool_host_dpu.ah index c884c9d..c676f50 100644 --- a/TRNS/support/dfatool_host.ah +++ b/include/dfatool_host_dpu.ah @@ -2,7 +2,7 @@ #include <sys/time.h> -aspect DfatoolHostTiming { +aspect DfatoolHostDPUTiming { struct timeval starttime; struct timeval stoptime; uint32_t n_ranks = 0; @@ -10,6 +10,8 @@ aspect DfatoolHostTiming { double const M_to_Mi = 1.048576; /* 2^20 / 1e6 */ + virtual int getKernel() = 0; + advice call("% dpu_get_nr_dpus(...)") : after() { n_dpus = **(tjp->arg<1>()); } @@ -23,7 +25,9 @@ aspect DfatoolHostTiming { tjp->proceed(); gettimeofday(&stoptime, NULL); n_dpus = *(tjp->arg<0>()); - printf("[::] dpu_alloc | n_dpus=%u | 
latency_us=%f\n", + printf("[::] dpu_alloc @ %s:%d | n_dpus=%u | latency_us=%f\n", + tjp->filename(), + tjp->line(), n_dpus, (stoptime.tv_sec - starttime.tv_sec) * 1000000.0 + (stoptime.tv_usec - starttime.tv_usec) ); @@ -34,7 +38,9 @@ aspect DfatoolHostTiming { tjp->proceed(); gettimeofday(&stoptime, NULL); n_ranks = *(tjp->arg<0>()); - printf("[::] dpu_alloc_ranks | n_ranks=%u | latency_us=%f\n", + printf("[::] dpu_alloc_ranks @ %s:%d | n_ranks=%u | latency_us=%f\n", + tjp->filename(), + tjp->line(), n_ranks, (stoptime.tv_sec - starttime.tv_sec) * 1000000.0 + (stoptime.tv_usec - starttime.tv_usec) ); @@ -44,7 +50,9 @@ aspect DfatoolHostTiming { gettimeofday(&starttime, NULL); tjp->proceed(); gettimeofday(&stoptime, NULL); - printf("[::] dpu_load | n_dpus=%u n_ranks=%u | latency_us=%f\n", + printf("[::] dpu_load @ %s:%d | n_dpus=%u n_ranks=%u | latency_us=%f\n", + tjp->filename(), + tjp->line(), n_dpus, n_ranks, (stoptime.tv_sec - starttime.tv_sec) * 1000000.0 + (stoptime.tv_usec - starttime.tv_usec) ); @@ -54,7 +62,9 @@ aspect DfatoolHostTiming { gettimeofday(&starttime, NULL); tjp->proceed(); gettimeofday(&stoptime, NULL); - printf("[::] dpu_free | n_dpus=%u n_ranks=%u | latency_us=%f\n", + printf("[::] dpu_free @ %s:%d | n_dpus=%u n_ranks=%u | latency_us=%f\n", + tjp->filename(), + tjp->line(), n_dpus, n_ranks, (stoptime.tv_sec - starttime.tv_sec) * 1000000.0 + (stoptime.tv_usec - starttime.tv_usec) ); @@ -65,14 +75,44 @@ aspect DfatoolHostTiming { tjp->proceed(); gettimeofday(&stoptime, NULL); double latency_us = (stoptime.tv_sec - starttime.tv_sec) * 1000000.0 + (stoptime.tv_usec - starttime.tv_usec); - unsigned long input_size = p.M_ * p.m * p.N_ * p.n; - printf("[::] dpu_launch | n_dpus=%u n_ranks=%u e_kernel=kernel%d n_elements=%lu | latency_us=%f throughput_Mrps=%f throughput_MiBps=%f\n", + printf("[::] dpu_launch @ %s:%d | n_dpus=%u n_ranks=%u e_kernel=kernel%d | latency_us=%f\n", + tjp->filename(), + tjp->line(), + n_dpus, n_ranks, + getKernel(), + 
latency_us + ); + } + + advice call("% dpu_copy_to(...)") : around() { + size_t payload_size = *(tjp->arg<4>()); + gettimeofday(&starttime, NULL); + tjp->proceed(); + gettimeofday(&stoptime, NULL); + double time_us = (stoptime.tv_sec - starttime.tv_sec) * 1000000.0 + (stoptime.tv_usec - starttime.tv_usec); + printf("[::] dpu_copy_to @ %s:%d | n_dpus=%u n_ranks=%u payload_B=%lu | latency_us=%f throughput_MiBps=%f\n", + tjp->filename(), + tjp->line(), + n_dpus, n_ranks, + payload_size, + time_us, + payload_size / (time_us * M_to_Mi) + ); + } + + advice call("% dpu_copy_from(...)") : around() { + size_t payload_size = *(tjp->arg<4>()); + gettimeofday(&starttime, NULL); + tjp->proceed(); + gettimeofday(&stoptime, NULL); + double time_us = (stoptime.tv_sec - starttime.tv_sec) * 1000000.0 + (stoptime.tv_usec - starttime.tv_usec); + printf("[::] dpu_copy_from @ %s:%d | n_dpus=%u n_ranks=%u payload_B=%lu | latency_us=%f throughput_MiBps=%f\n", + tjp->filename(), + tjp->line(), n_dpus, n_ranks, - kernel + 1, - input_size, - latency_us, - input_size / latency_us, - input_size * sizeof(T) / (latency_us * M_to_Mi) + payload_size, + time_us, + payload_size / (time_us * M_to_Mi) ); } @@ -83,14 +123,18 @@ aspect DfatoolHostTiming { gettimeofday(&stoptime, NULL); double time_us = (stoptime.tv_sec - starttime.tv_sec) * 1000000.0 + (stoptime.tv_usec - starttime.tv_usec); if (*(tjp->arg<1>()) == DPU_XFER_TO_DPU) { - printf("[::] dpu_push_to_dpu | n_dpus=%u n_ranks=%u total_payload_B=%lu dpu_payload_B=%lu | latency_us=%f throughput_MiBps=%f\n", + printf("[::] dpu_push_to_dpu @ %s:%d | n_dpus=%u n_ranks=%u total_payload_B=%lu dpu_payload_B=%lu | latency_us=%f throughput_MiBps=%f\n", + tjp->filename(), + tjp->line(), n_dpus, n_ranks, payload_size * n_dpus, payload_size, time_us, payload_size * n_dpus / (time_us * M_to_Mi) ); } else if (*(tjp->arg<1>()) == DPU_XFER_FROM_DPU) { - printf("[::] dpu_push_from_dpu | n_dpus=%u n_ranks=%u total_payload_B=%lu dpu_payload_B=%lu | latency_us=%f 
throughput_MiBps=%f\n", + printf("[::] dpu_push_from_dpu @ %s:%d | n_dpus=%u n_ranks=%u total_payload_B=%lu dpu_payload_B=%lu | latency_us=%f throughput_MiBps=%f\n", + tjp->filename(), + tjp->line(), n_dpus, n_ranks, payload_size * n_dpus, payload_size, time_us, diff --git a/include/timer_base.h b/include/timer_base.h new file mode 100644 index 0000000..160136c --- /dev/null +++ b/include/timer_base.h @@ -0,0 +1,64 @@ +#pragma once + +#include <sys/time.h> + +#if DFATOOL_TIMING + +typedef struct Timer { + + struct timeval startTime[N_TIMERS]; + struct timeval stopTime[N_TIMERS]; + double time[N_TIMERS]; + +} Timer; + +#define dfatool_printf(fmt, ...) do { printf(fmt, __VA_ARGS__); } while (0) + +void start(Timer *timer, int i, int rep) +{ + if (rep == 0) { + timer->time[i] = 0.0; + } + gettimeofday(&timer->startTime[i], NULL); +} + +void stop(Timer *timer, int i) +{ + gettimeofday(&timer->stopTime[i], NULL); + timer->time[i] += + (timer->stopTime[i].tv_sec - + timer->startTime[i].tv_sec) * 1000000.0 + + (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec); +} + +void zero(Timer *timer, int i) +{ + timer->time[i] = 0; +} + +#else + +#define dfatool_printf(fmt, ...) 
do {} while (0) + +typedef int Timer; + +void start(Timer *timer, int i, int rep) +{ + (void)timer; + (void)i; + (void)rep; +} + +void stop(Timer *timer, int i) +{ + (void)timer; + (void)i; +} + +void zero(Timer *timer, int i) +{ + (void)timer; + (void)i; +} + +#endif diff --git a/run_strong_full.py b/run_strong_full.py deleted file mode 100644 index c65fecd..0000000 --- a/run_strong_full.py +++ /dev/null @@ -1,122 +0,0 @@ -import os -import sys -import getpass - -rootdir = "/" # Include path to repo - -applications = {"VA" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 167772160 -x 1"], - "GEMV" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/gemv_host -m 163840 -n 4096"], - "SpMV" : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f data/bcsstk30.mtx.64.mtx"], - "SEL" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 251658240 -x 1"], - "UNI" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 251658240 -x 1"], - "BS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/bs_host -i 16777216"], - "TS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/ts_host -n 33554432"], - "BFS" : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f data/loc-gowalla_edges.txt"], - "MLP" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/mlp_host -m 163840 -n 4096"], - "NW" : ["NR_DPUS=X NR_TASKLETS=Y BL=32 BL_IN=2 make all", "./bin/nw_host -w 0 -e 1 -n 65536"], - "HST-S" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 2"], - "HST-L" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 2"], - "RED" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z VERSION=SINGLE make all", "./bin/host_code -w 0 -e 1 -i 419430400 -x 1"], - "SCAN-SSA" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 251658240 -x 1"], - "SCAN-RSS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 251658240 -x 1"], - "TRNS" : 
["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -w 0 -e 1 -p 2048 -o 12288 -x 1"],} - -def run(app_name): - - NR_TASKLETS = [1, 2, 4, 8, 16] - NR_DPUS = [256, 512, 1024, 2048] - BL = [10] - - if app_name in applications: - print ("------------------------ Running: "+app_name+"----------------------") - print ("--------------------------------------------------------------------") - if(len(applications[app_name]) > 1): - make = applications[app_name][0] - run_cmd = applications[app_name][1] - - os.chdir(rootdir + "/"+app_name) - os.getcwd() - - os.system("make clean") - - try: - os.mkdir(rootdir + "/"+ app_name +"/bin") - except OSError: - print ("Creation of the direction /bin failed") - - try: - os.mkdir(rootdir + "/"+ app_name +"/log") - except OSError: - print ("Creation of the direction /log failed") - - try: - os.mkdir(rootdir + "/"+ app_name +"/log/host") - except OSError: - print ("Creation of the direction /log/host failed") - - try: - os.mkdir(rootdir + "/"+ app_name +"/profile") - except OSError: - print ("Creation of the direction /profile failed") - - - for r in NR_DPUS: - for t in NR_TASKLETS: - for b in BL: - m = make.replace("X", str(r)) - m = m.replace("Y", str(t)) - m = m.replace("Z", str(b)) - print ("Running = " + m) - try: - os.system(m) - except: - pass - - r_cmd = run_cmd.replace("#ranks", str(r)) - r_cmd = r_cmd + " >> profile/outss_tl"+str(t)+"_bl"+str(b)+"_dpus"+str(r) - - print ("Running = " + app_name + " -> "+ r_cmd) - try: - os.system(r_cmd) - except: - pass - else: - make = applications[app_name] - - os.chdir(rootdir + "/"+app_name) - os.getcwd() - - try: - os.mkdir(rootdir + "/"+ app_name +"/bin") - os.mkdir(rootdir + "/"+ app_name +"/log") - os.mkdir(rootdir + "/"+ app_name +"/log/host") - os.mkdir(rootdir + "/"+ app_name +"/profile") - except OSError: - print ("Creation of the direction failed") - - print (make) - os.system(make + ">& profile/out") - - else: - print ( "Application "+app_name+" not available" ) - -def main(): - 
if(len(sys.argv) < 2): - print ("Usage: python run.py application") - print ("Applications available: ") - for key, value in applications.items(): - print (key ) - print ("All") - - else: - cmd = sys.argv[1] - print ("Application to run is: " + cmd ) - if cmd == "All": - for key, value in applications.items(): - run(key) - os.chdir(rootdir) - else: - run(cmd) - -if __name__ == "__main__": - main() diff --git a/run_strong_rank.py b/run_strong_rank.py deleted file mode 100644 index 68f401e..0000000 --- a/run_strong_rank.py +++ /dev/null @@ -1,133 +0,0 @@ -import os -import sys -import getpass - -rootdir = "/" # Include path to repo -print("Root dir: " + rootdir) - -applications = {"VA" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 2621440 -x 1"], - "GEMV" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/gemv_host -m 8192 -n 1024"], - "SpMV" : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0"], - "SEL" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 3932160 -x 1"], - "UNI" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 3932160 -x 1"], - "BS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/bs_host -i 262144"], - "TS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/ts_host -n 524288"], - "BFS" : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f data/loc-gowalla_edges.txt"], - "MLP" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/mlp_host -m 8192 -n 1024"], - "NW" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z BL_IN=2 make all", "./bin/nw_host -w 0 -e 1 -n 2560"], - "HST-S" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 1"], - "HST-L" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 1"], - "RED" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z VERSION=SINGLE make all", "./bin/host_code -w 0 -e 1 -i 6553600 -x 1"], - "SCAN-SSA" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 3932160 -x 1"], - 
"SCAN-RSS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i 3932160 -x 1"], - "TRNS" : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -w 0 -e 1 -p 64 -o 12288 -x 1"],} - -def run(app_name): - - NR_TASKLETS = [1, 2, 4, 8, 16] - NR_DPUS = [1, 4, 16, 64] - BL = [10] - - if app_name in applications: - print ("------------------------ Running: "+app_name+"----------------------") - print ("--------------------------------------------------------------------") - if(len(applications[app_name]) > 1): - make = applications[app_name][0] - run_cmd = applications[app_name][1] - - os.chdir(rootdir + "/"+app_name) - os.getcwd() - - os.system("make clean") - - try: - os.mkdir(rootdir + "/"+ app_name +"/bin") - except OSError: - print ("Creation of the direction /bin failed") - - try: - os.mkdir(rootdir + "/"+ app_name +"/log") - except OSError: - print ("Creation of the direction /log failed") - - try: - os.mkdir(rootdir + "/"+ app_name +"/log/host") - except OSError: - print ("Creation of the direction /log/host failed") - - try: - os.mkdir(rootdir + "/"+ app_name +"/profile") - except OSError: - print ("Creation of the direction /profile failed") - - - for r in NR_DPUS: - for t in NR_TASKLETS: - for b in BL: - m = make.replace("X", str(r)) - m = m.replace("Y", str(t)) - if (app_name == "NW"): - if (r == 1): - m = m.replace("Z", str(2560)) - elif (r == 4): - m = m.replace("Z", str(640)) - elif (r == 16): - m = m.replace("Z", str(160)) - elif (r == 64): - m = m.replace("Z", str(40)) - else: - m = m.replace("Z", str(b)) - print ("Running = " + m) - try: - os.system(m) - except: - pass - - r_cmd = run_cmd.replace("#ranks", str(r)) - r_cmd = r_cmd + " >> profile/outs_tl"+str(t)+"_bl"+str(b)+"_dpus"+str(r) - - print ("Running = " + app_name + " -> "+ r_cmd) - try: - os.system(r_cmd) - except: - pass - else: - make = applications[app_name] - - os.chdir(rootdir + "/"+app_name) - os.getcwd() - - try: - os.mkdir(rootdir + "/"+ app_name +"/bin") - 
os.mkdir(rootdir + "/"+ app_name +"/log") - os.mkdir(rootdir + "/"+ app_name +"/log/host") - os.mkdir(rootdir + "/"+ app_name +"/profile") - except OSError: - print ("Creation of the direction failed") - - print (make) - os.system(make + ">& profile/out") - - else: - print ( "Application "+app_name+" not available" ) - -def main(): - if(len(sys.argv) < 2): - print ("Usage: python run.py application") - print ("Applications available: ") - for key, value in applications.items(): - print (key ) - print ("All") - - else: - cmd = sys.argv[1] - print ("Application to run is: " + cmd ) - if cmd == "All": - for key, value in applications.items(): - run(key) - os.chdir(rootdir) - else: - run(cmd) - -if __name__ == "__main__": - main() diff --git a/run_weak.py b/run_weak.py deleted file mode 100644 index a613675..0000000 --- a/run_weak.py +++ /dev/null @@ -1,173 +0,0 @@ -import os -import sys -import getpass - -rootdir = "/" # Include path to repo - -applications = {"VA" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"], - "GEMV" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/gemv_host -m #elements -n 2048"], - "SpMV" : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f file_name"], - "SEL" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"], - "UNI" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"], - "BS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/bs_host -i #elements"], - "TS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/ts_host -n #elements"], - "BFS" : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -v 0 -f file_name"], - "MLP" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/mlp_host -m #elements -n 1024"], - "NW" : ["NR_DPUS=X NR_TASKLETS=Y BL=512 BL_IN=8 make all", "./bin/nw_host -w 0 -e 1 -n #elements"], - "HST-S" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 0"], - "HST-L" : 
["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -b 256 -x 0"], - "RED" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z VERSION=SINGLE make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"], - "SCAN-SSA" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"], - "SCAN-RSS" : ["NR_DPUS=X NR_TASKLETS=Y BL=Z make all", "./bin/host_code -w 0 -e 1 -i #elements -x 0"], - "TRNS" : ["NR_DPUS=X NR_TASKLETS=Y make all", "./bin/host_code -w 0 -e 1 -p #elements -o 12288 -x 0"],} - -def run(app_name): - - NR_DPUS = [1, 4, 16, 64] - NR_TASKLETS = [1, 2, 4, 8, 16] - size = 1 - BL = [10] - if(app_name == "VA"): - size = 2621440 - if(app_name == "GEMV"): - size = 1024 - if(app_name == "SEL" or app_name == "UNI" or app_name == "SCAN-SSA" or app_name == "SCAN-RSS"): - size = 3932160 - if(app_name == "TS"): - size = 524288 - if(app_name == "BS"): - size = 262144 - if(app_name == "MLP"): - size = 1024 - if(app_name == "RED"): - size = 6553600 - if(app_name == "TRNS"): - size = 1 - - - if app_name in applications: - print ("------------------------ Running: "+app_name+"----------------------") - print ("--------------------------------------------------------------------") - if(len(applications[app_name]) > 1): - make = applications[app_name][0] - run_cmd = applications[app_name][1] - - os.chdir(rootdir + "/"+app_name) - os.getcwd() - - os.system("make clean") - - try: - os.mkdir(rootdir + "/"+ app_name +"/bin") - except OSError: - print ("Creation of the direction /bin failed") - - try: - os.mkdir(rootdir + "/"+ app_name +"/log") - except OSError: - print ("Creation of the direction /log failed") - - try: - os.mkdir(rootdir + "/"+ app_name +"/log/host") - except OSError: - print ("Creation of the direction /log/host failed") - - try: - os.mkdir(rootdir + "/"+ app_name +"/profile") - except OSError: - print ("Creation of the direction /profile failed") - - - for r in NR_DPUS: - for t in NR_TASKLETS: - for b in BL: - m = make.replace("X", 
str(r)) - m = m.replace("Y", str(t)) - m = m.replace("Z", str(b)) - print ("Running = " + m) - try: - os.system(m) - except: - pass - - if(app_name == "NW"): - if(r == 1): - r_cmd = run_cmd.replace("#elements", str(512)) - if(r == 4): - r_cmd = run_cmd.replace("#elements", str(2048)) - if(r == 16): - r_cmd = run_cmd.replace("#elements", str(8192)) - if(r == 64): - r_cmd = run_cmd.replace("#elements", str(32768)) - elif(app_name == "GEMV" or app_name == "MLP" or app_name == "TS" or app_name == "BS"): - r_cmd = run_cmd.replace("#elements", str(r * size)) - else: - r_cmd = run_cmd.replace("#elements", str(size)) - if(app_name == "BFS"): - if(r == 1): - # Generate rMat graphs using: - # https://github.com/cmuparlay/pbbsbench/blob/master/testData/graphData/rMatGraph.html - # https://github.com/cmuparlay/pbbsbench/blob/master/testData/graphData/rMatGraph.C - r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - rMat graph - if(r == 4): - r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - rMat graph - if(r == 16): - r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - rMat graph - if(r == 64): - r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - rMat graph - if(app_name == "SpMV"): - if(r == 1): - r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - Check SpMV/data/generate - if(r == 4): - r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - Check SpMV/data/generate - if(r == 16): - r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - Check SpMV/data/generate - if(r == 64): - r_cmd = run_cmd.replace("file_name", "/") # Include path to input file - Check SpMV/data/generate - r_cmd = r_cmd + " >> profile/out_tl"+str(t)+"_bl"+str(b)+"_dpus"+str(r) - - print ("Running = " + app_name + " -> "+ r_cmd) - try: - os.system(r_cmd) - except: - pass - else: - make = applications[app_name] - - os.chdir(rootdir + "/"+app_name) - os.getcwd() - - 
try: - os.mkdir(rootdir + "/"+ app_name +"/bin") - os.mkdir(rootdir + "/"+ app_name +"/log") - os.mkdir(rootdir + "/"+ app_name +"/log/host") - os.mkdir(rootdir + "/"+ app_name +"/profile") - except OSError: - print ("Creation of the direction failed") - - print (make) - os.system(make + ">& profile/out") - - else: - print ( "Application "+app_name+" not available" ) - -def main(): - if(len(sys.argv) < 2): - print ("Usage: python run.py application") - print ("Applications available: ") - for key, value in applications.items(): - print (key ) - print ("All") - - else: - cmd = sys.argv[1] - print ("Application to run is: " + cmd ) - if cmd == "All": - for key, value in applications.items(): - run(key) - os.chdir(rootdir) - else: - run(cmd) - -if __name__ == "__main__": - main() diff --git a/set-root-dir.sh b/set-root-dir.sh deleted file mode 100755 index 35be69f..0000000 --- a/set-root-dir.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -sed -i 's!rootdir = "/"!rootdir = "'"$(pwd)"'"!' *.py |
