diff options
-rw-r--r-- | BS/baselines/cpu/Makefile | 3 | ||||
-rw-r--r-- | BS/baselines/cpu/bs_omp.c | 98 | ||||
-rwxr-xr-x | BS/dimes-hetsim-hbm.sh | 16 | ||||
-rwxr-xr-x | BS/dimes-hetsim-nmc.sh | 78 |
4 files changed, 153 insertions, 42 deletions
diff --git a/BS/baselines/cpu/Makefile b/BS/baselines/cpu/Makefile index bed2561..b67602f 100644 --- a/BS/baselines/cpu/Makefile +++ b/BS/baselines/cpu/Makefile @@ -1,4 +1,5 @@ NUMA ?= 0 +NUMA_MEMCPY ?= 0 FLAGS = ifeq (${NUMA}, 1) @@ -9,7 +10,7 @@ endif all: bs_omp bs_omp: bs_omp.c - gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} bs_omp.c -o bs_omp -fopenmp ${FLAGS} + gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} bs_omp.c -o bs_omp -fopenmp ${FLAGS} bs_omp_O0: bs_omp.c gcc bs_omp.c -o bs_omp_O0 -fopenmp diff --git a/BS/baselines/cpu/bs_omp.c b/BS/baselines/cpu/bs_omp.c index 1b9947c..874299b 100644 --- a/BS/baselines/cpu/bs_omp.c +++ b/BS/baselines/cpu/bs_omp.c @@ -21,6 +21,15 @@ int numa_node_in = -1; int numa_node_cpu = -1; #endif + +#if NUMA_MEMCPY +struct bitmask* bitmask_cpu; +int numa_node_cpu_memcpy = -1; +int numa_node_local = -1; +int numa_node_in_is_local = 0; +#endif + + #define DTYPE uint64_t /* * @brief creates a "test file" by filling a bufferwith values @@ -90,6 +99,10 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne bitmask_in = numa_parse_nodestring(argv[3]); numa_node_cpu = atoi(argv[4]); #endif +#if NUMA_MEMCPY + bitmask_cpu = numa_parse_nodestring(argv[5]); + numa_node_cpu_memcpy = atoi(argv[6]); +#endif printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys); @@ -106,10 +119,17 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne #endif #if NUMA +#if NUMA_MEMCPY + if (bitmask_cpu) { + numa_set_membind(bitmask_cpu); + numa_free_nodemask(bitmask_cpu); + } +#else struct bitmask *bitmask_all = numa_allocate_nodemask(); numa_bitmask_setall(bitmask_all); numa_set_membind(bitmask_all); numa_free_nodemask(bitmask_all); +#endif // NUMA_MEMCPY #endif DTYPE result_host = -1; @@ -137,10 +157,71 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne } #endif +#if NUMA_MEMCPY + numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1; +#endif + +#if NUMA_MEMCPY + DTYPE *input_local = input; + DTYPE *querys_local = querys; + start(&timer, 1, 0); + if (!numa_node_in_is_local) { + input_local = numa_alloc((input_size) * sizeof(DTYPE)); + querys_local = numa_alloc((n_querys) * sizeof(DTYPE)); + } + stop(&timer, 1); + if (!numa_node_in_is_local) { + if (numa_node_cpu_memcpy != -1) { + if (numa_run_on_node(numa_node_cpu_memcpy) == -1) { + perror("numa_run_on_node"); + numa_node_cpu_memcpy = -1; + } + } + } + start(&timer, 2, 0); + if (!numa_node_in_is_local) { + memcpy(input_local, input, input_size * sizeof(DTYPE)); + memcpy(querys_local, querys, n_querys * sizeof(DTYPE)); + } else { + input_local = input; + querys_local = querys; + } + stop(&timer, 2); + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } + mp_pages[0] = input_local; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(input_local)"); + } + else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } + else { + numa_node_local = mp_status[0]; + } +#endif + start(&timer, 0, 0); +#if NUMA_MEMCPY + result_host = binarySearch(input_local, input_size - 1, querys_local, n_querys); +#else result_host = binarySearch(input, input_size - 1, querys, n_querys); +#endif stop(&timer, 0); +#if NUMA_MEMCPY + start(&timer, 3, 0); + if (!numa_node_in_is_local) { + numa_free(input_local, input_size * sizeof(DTYPE)); + numa_free(querys_local, n_querys * sizeof(DTYPE)); + } + stop(&timer, 3); +#endif + unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic @@ -148,6 +229,17 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne int status = (result_host); if (status) { +#if NUMA_MEMCPY + printf("[::] BS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%lu" + " numa_node_in=%d numa_node_local=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d" + " | throughput_MBps=%f throughput_MOpps=%f" + " latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", + nr_threads, "uint64_t", input_size, + numa_node_in, numa_node_local, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), + n_querys * sizeof(DTYPE) / timer.time[0], n_querys / timer.time[0], + timer.time[0], timer.time[1], timer.time[2], timer.time[3], + timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]); +#else printf("[::] BS-CPU | n_threads=%d e_type=%s n_elements=%lu" #if NUMA " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d" @@ -158,9 +250,9 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), #endif n_querys * sizeof(DTYPE) / timer.time[0]); - printf(" throughput_MOpps=%f", - n_querys / timer.time[0]); - printall(&timer, 0); + printf(" throughput_MOpps=%f latency_us=%f\n", + n_querys / timer.time[0], timer.time[0]); +#endif } else { printf("[ERROR]\n"); } diff --git a/BS/dimes-hetsim-hbm.sh b/BS/dimes-hetsim-hbm.sh index 2aabe15..4e1500d 100755 --- a/BS/dimes-hetsim-hbm.sh +++ b/BS/dimes-hetsim-hbm.sh @@ -1,12 +1,10 @@ #!/bin/bash -echo PrIM BS - cd baselines/cpu make -B NUMA=1 mkdir -p log/$(hostname) -fn=log/$(hostname)/$(date +%Y%m%d) +fn=log/$(hostname)/dimes-hetsim-hbm # upstream DPU version uses 2048576 * uint64 ≈ 16 MiB (DPU max: 64 MiB) # upstream DPU version uses 2 queries @@ -32,7 +30,7 @@ export -f run_benchmark echo "single-node execution, DPU ref (1/4)" >&2 -parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ run_benchmark i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ input_size=${input_size_dpu} num_queries=${num_queries_dpu} \ ::: i $(seq 1 5) \ @@ -42,7 +40,7 @@ parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ echo "single-node execution, HBM ref (2/4)" >&2 -parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ run_benchmark i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ input_size=${input_size_hbm} num_queries=${num_queries_hbm} \ ::: i $(seq 1 5) \ @@ -52,7 +50,7 @@ parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ echo "multi-node execution, DPU ref (3/4)" >&2 -parallel -j1 --eta --joblog ${fn}.3.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \ run_benchmark i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ input_size=${input_size_dpu} num_queries=${num_queries_dpu} \ ::: i $(seq 1 40) \ @@ -62,7 +60,7 @@ parallel -j1 --eta --joblog ${fn}.3.joblog --header : \ echo "multi-node execution, HBM ref (4/4)" >&2 -parallel -j1 --eta --joblog ${fn}.4.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \ run_benchmark i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ input_size=${input_size_hbm} num_queries=${num_queries_hbm} \ ::: i $(seq 1 40) \ @@ -70,6 +68,4 @@ parallel -j1 --eta --joblog ${fn}.4.joblog --header : \ ::: cpu -1 \ ::: ram $(seq 0 15) -) > ${fn}.txt - -xz -f -v -9 -M 800M ${fn}.txt +) >> ${fn}.txt diff --git a/BS/dimes-hetsim-nmc.sh b/BS/dimes-hetsim-nmc.sh index 2313985..195334b 100755 --- a/BS/dimes-hetsim-nmc.sh +++ b/BS/dimes-hetsim-nmc.sh @@ -1,9 +1,7 @@ #!/bin/bash -echo PrIM BS - mkdir -p log/$(hostname) baselines/cpu/log/$(hostname) -fn=log/$(hostname)/$(date +%Y%m%d) +fn=log/$(hostname)/dimes-hetsim-nmc # upstream DPU version uses 2048576 * uint64 ≈ 16 MiB (DPU max: 64 MiB) # upstream DPU version uses 2 queries @@ -15,19 +13,18 @@ num_queries_dpu=1048576 run_benchmark_nmc() { local "$@" + set -e sudo limit_ranks_to_numa_node ${numa_rank} - if make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1 INPUT_SIZE=${input_size} PROBLEM_SIZE=${num_queries}; then - bin/bs_host -w 0 -e 100 2>&1 - fi - return $? + make -B NR_DPUS=${nr_dpus} NR_TASKLETS=${nr_tasklets} WITH_ALLOC_OVERHEAD=1 WITH_LOAD_OVERHEAD=1 WITH_FREE_OVERHEAD=1 INPUT_SIZE=${input_size} PROBLEM_SIZE=${num_queries} + bin/bs_host -w 0 -e 100 2>&1 } export -f run_benchmark_nmc run_benchmark_baseline() { local "$@" - OMP_NUM_THREADS=${nr_threads} ./bs_omp ${input_size} ${num_queries} ${ram} ${cpu} 2>&1 - return $? + set -e + OMP_NUM_THREADS=${nr_threads} ./bs_omp ${input_size} ${num_queries} ${ram} ${cpu} ${ram_local} ${cpu_memcpy} 2>&1 } export -f run_benchmark_baseline @@ -36,7 +33,7 @@ export -f run_benchmark_baseline echo "NMC single-node upstream-ref (1/4)" >&2 -parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} \ num_queries=${num_queries_upstream} input_size=${input_size_upstream} \ ::: numa_rank 0 1 \ @@ -44,7 +41,7 @@ parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ echo "NMC multi-node upstream-ref (2/4)" >&2 -parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} \ num_queries=${num_queries_upstream} input_size=${input_size_upstream} \ ::: numa_rank -1 \ @@ -52,7 +49,7 @@ parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ echo "NMC single-node DPU-ref (3/4)" >&2 -parallel -j1 --eta --joblog ${fn}.3.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} \ num_queries=${num_queries_dpu} input_size=${input_size_dpu} \ ::: numa_rank 0 1 \ @@ -60,24 +57,51 @@ parallel -j1 --eta --joblog ${fn}.3.joblog --header : \ echo "NMC multi-node DPU-ref (4/4)" >&2 -parallel -j1 --eta --joblog ${fn}.4.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \ run_benchmark_nmc nr_dpus={nr_dpus} nr_tasklets=16 numa_rank={numa_rank} \ num_queries=${num_queries_dpu} input_size=${input_size_dpu} \ ::: numa_rank -1 \ ::: nr_dpus 1536 2048 2304 -) > ${fn}.txt - -xz -f -v -9 -M 800M ${fn}.txt +) >> ${fn}.txt cd baselines/cpu -make -B NUMA=1 ( -echo "CPU single-node upstream-ref (1/4)" >&2 +make -B NUMA=1 NUMA_MEMCPY=1 -parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ +echo "CPU single-node upstream-ref with memcpy, copy node == input node (1/6)" >&2 + +parallel -j1 --eta --joblog ${fn}.1.joblog --resume --header : \ + run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ + ram_local={ram_local} cpu_memcpy={cpu_memcpy} \ + num_queries=${num_queries_upstream} input_size=${input_size_upstream} \ + ::: i $(seq 1 20) \ + ::: ram 0 1 \ + :::+ cpu_memcpy 0 1 \ + ::: ram_local 0 1 \ + :::+ cpu 0 1 \ + ::: nr_threads 1 2 4 8 12 16 + +echo "CPU single-node dpu-ref with memcpy, copy node == input node (2/6)" >&2 + +parallel -j1 --eta --joblog ${fn}.2.joblog --resume --header : \ + run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ + ram_local={ram_local} cpu_memcpy={cpu_memcpy} \ + num_queries=${num_queries_dpu} input_size=${input_size_dpu} \ + ::: i $(seq 1 20) \ + ::: ram 0 1 \ + :::+ cpu_memcpy 0 1 \ + ::: ram_local 0 1 \ + :::+ cpu 0 1 \ + ::: nr_threads 1 2 4 8 12 16 + +make -B NUMA=1 + +echo "CPU single-node upstream-ref (3/6)" >&2 + +parallel -j1 --eta --joblog ${fn}.3.joblog --resume --header : \ run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ num_queries=${num_queries_upstream} input_size=${input_size_upstream} \ ::: i $(seq 1 20) \ @@ -85,9 +109,9 @@ parallel -j1 --eta --joblog ${fn}.1.joblog --header : \ ::: ram 0 1 \ ::: nr_threads 1 2 4 8 12 16 -echo "CPU single-node DPU-ref (2/4)" >&2 +echo "CPU single-node DPU-ref (4/6)" >&2 -parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.4.joblog --resume --header : \ run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ num_queries=${num_queries_dpu} input_size=${input_size_dpu} \ ::: i $(seq 1 20) \ @@ -95,9 +119,9 @@ parallel -j1 --eta --joblog ${fn}.2.joblog --header : \ ::: ram 0 1 \ ::: nr_threads 1 2 4 8 12 16 -echo "CPU multi-node upstream-ref (3/4)" >&2 +echo "CPU multi-node upstream-ref (5/6)" >&2 -parallel -j1 --eta --joblog ${fn}.3.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.5.joblog --resume --header : \ run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ num_queries=${num_queries_upstream} input_size=${input_size_upstream} \ ::: i $(seq 1 20) \ @@ -105,9 +129,9 @@ parallel -j1 --eta --joblog ${fn}.3.joblog --header : \ ::: ram 0 1 \ ::: nr_threads 24 32 -echo "CPU multi-node DPU-ref (4/4)" >&2 +echo "CPU multi-node DPU-ref (6/6)" >&2 -parallel -j1 --eta --joblog ${fn}.4.joblog --header : \ +parallel -j1 --eta --joblog ${fn}.6.joblog --resume --header : \ run_benchmark_baseline i={i} nr_threads={nr_threads} ram={ram} cpu={cpu} \ num_queries=${num_queries_dpu} input_size=${input_size_dpu} \ ::: i $(seq 1 20) \ @@ -115,6 +139,4 @@ parallel -j1 --eta --joblog ${fn}.4.joblog --header : \ ::: ram 0 1 \ ::: nr_threads 24 32 -) > ${fn}.txt - -xz -f -v -9 -M 800M ${fn}.txt +) >> ${fn}.txt |