diff options
author | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-07-04 08:29:32 +0200 |
---|---|---|
committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-07-04 08:29:32 +0200 |
commit | 6ea34c34074793b4989614207ffe414cd3824e6b (patch) | |
tree | 3d8b79169e03b66fdcfef646eda3424bab1ec3d3 | |
parent | 0be67659d02b62cee3a34c19fa25a758eb2472d1 (diff) |
BS: Add NUMA variant for HBM
-rw-r--r-- | BS/baselines/cpu/Makefile | 9 | ||||
-rw-r--r-- | BS/baselines/cpu/bs_omp.c | 82 | ||||
-rwxr-xr-x | BS/baselines/cpu/run.sh | 7 | ||||
-rwxr-xr-x | BS/dimes-hetsim.sh | 40 |
4 files changed, 123 insertions, 15 deletions
diff --git a/BS/baselines/cpu/Makefile b/BS/baselines/cpu/Makefile index 8faf3c4..5c540d7 100644 --- a/BS/baselines/cpu/Makefile +++ b/BS/baselines/cpu/Makefile @@ -1,8 +1,15 @@ +NUMA ?= 0 +FLAGS = + +ifeq (${NUMA}, 1) + FLAGS += -lnuma +endif + .PHONY: all all: bs_omp bs_omp: bs_omp.c - gcc -O2 bs_omp.c -o bs_omp -fopenmp + gcc -Wall -Wextra -pedantic -O2 -DNUMA=${NUMA} bs_omp.c -o bs_omp -fopenmp ${FLAGS} bs_omp_O0: bs_omp.c gcc bs_omp.c -o bs_omp_O0 -fopenmp diff --git a/BS/baselines/cpu/bs_omp.c b/BS/baselines/cpu/bs_omp.c index 3775bce..f160c58 100644 --- a/BS/baselines/cpu/bs_omp.c +++ b/BS/baselines/cpu/bs_omp.c @@ -1,4 +1,3 @@ - #include <stdio.h> #include <stdlib.h> #include <stdbool.h> @@ -10,15 +9,24 @@ #include <stdint.h> #include "timer.h" +#if NUMA +#include <numaif.h> +#include <numa.h> + +void* mp_pages[1]; +int mp_status[1]; +int mp_nodes[1]; +struct bitmask* bitmask_in; +int numa_node_in = -1; +int numa_node_cpu = -1; +#endif + #define DTYPE uint64_t /* * @brief creates a "test file" by filling a bufferwith values */ void create_test_file(DTYPE * input, uint64_t nr_elements, DTYPE * querys, uint64_t n_querys) { - uint64_t max = UINT64_MAX; - uint64_t min = 0; - srand(time(NULL)); input[0] = 1; @@ -74,23 +82,63 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne * @brief Main of the Host Application. */ int main(int argc, char **argv) { - + (void)argc; Timer timer; uint64_t input_size = atol(argv[1]); uint64_t n_querys = atol(argv[2]); +#if NUMA + bitmask_in = numa_parse_nodestring(argv[3]); + numa_node_cpu = atoi(argv[4]); +#endif printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys); - + +#if NUMA + if (bitmask_in) { + numa_set_membind(bitmask_in); + numa_free_nodemask(bitmask_in); + } + DTYPE * input = numa_alloc((input_size) * sizeof(DTYPE)); + DTYPE * querys = numa_alloc((n_querys) * sizeof(DTYPE)); +#else DTYPE * input = malloc((input_size) * sizeof(DTYPE)); DTYPE * querys = malloc((n_querys) * sizeof(DTYPE)); +#endif + +#if NUMA + struct bitmask *bitmask_all = numa_allocate_nodemask(); + numa_bitmask_setall(bitmask_all); + numa_set_membind(bitmask_all); + numa_free_nodemask(bitmask_all); +#endif DTYPE result_host = -1; // Create an input file with arbitrary data. create_test_file(input, input_size, querys, n_querys); - + +#if NUMA + mp_pages[0] = input; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(A)"); + } + else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } + else { + numa_node_in = mp_status[0]; + } + + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } +#endif + start(&timer, 0, 0); - result_host = binarySearch(input, input_size - 1, querys, n_querys); + result_host = binarySearch(input, input_size - 1, querys, n_querys); stop(&timer, 0); unsigned int nr_threads = 0; @@ -100,18 +148,30 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne int status = (result_host); if (status) { - printf("[::] BS CPU | n_threads=%d e_type=%s n_elements=%d " - "| throughput_MBps=%f", + printf("[::] BS CPU | n_threads=%d e_type=%s n_elements=%lu" +#if NUMA + " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d" +#endif + " | throughput_MBps=%f", nr_threads, "uint64_t", input_size, +#if NUMA + numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), +#endif n_querys * sizeof(DTYPE) / timer.time[0]); printf(" throughput_MOpps=%f", - nr_threads, "uint64_t", input_size, n_querys / timer.time[0]); printall(&timer, 0); } else { printf("[ERROR]\n"); } + +#if NUMA + numa_free(input, input_size * sizeof(DTYPE)); + numa_free(querys, n_querys * sizeof(DTYPE)); +#else free(input); + free(querys); +#endif return status ? 0 : 1; diff --git a/BS/baselines/cpu/run.sh b/BS/baselines/cpu/run.sh index 022ce0e..56f8cb3 100755 --- a/BS/baselines/cpu/run.sh +++ b/BS/baselines/cpu/run.sh @@ -4,7 +4,7 @@ set -e HOST="$(hostname)" -echo $HOST +mkdir -p "$HOST" ( echo "prim-benchmarks BS CPU (dfatool edition)" @@ -13,7 +13,7 @@ echo "Revision $(git describe --always)" make -B verbose=1 -for nr_threads in 88 64 44 32 24 20 1 2 4 6 8 12 16; do +for nr_threads in 128 96 88 64 44 32 24 20 1 2 4 6 8 12 16; do #for vs in 262144 524288 1048576 2097152; do # NMC also uses 262144 elements for vs in 262144; do @@ -23,4 +23,5 @@ for nr_threads in 88 64 44 32 24 20 1 2 4 6 8 12 16; do done done done -) | tee "${HOST}-explore.txt" +echo "Completed at $(date)" +) | tee "${HOST}/explore.txt" diff --git a/BS/dimes-hetsim.sh b/BS/dimes-hetsim.sh new file mode 100755 index 0000000..0ca7481 --- /dev/null +++ b/BS/dimes-hetsim.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +cd baselines/cpu +make -B NUMA=1 + +mkdir -p log/$(hostname) +fn=log/$(hostname)/$(date +%Y%m%d)-baseline.txt + +( + +for i in `seq 1 20`; do + for nr_threads in 1 2 4 8 12 16; do + for cpu in 0 1 2 3 4 5 6 7; do + for ram in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do + # 2048576 is INPUT_SIZE for DPU version + # 2048576 * uint64 ≈ 16 MiB (fits into cache) + OMP_NUM_THREADS=$nr_threads ./bs_omp 2048576 16777216 $ram $cpu + # 2^27 * uint64 == 1 GiB + OMP_NUM_THREADS=$nr_threads ./bs_omp $(perl -E 'say 2 ** 27') 16777216 $ram $cpu + # 2^29 * uint64 == 4 GiB + OMP_NUM_THREADS=$nr_threads ./bs_omp $(perl -E 'say 2 ** 29') 16777216 $ram $cpu + done + done + done + for nr_threads in 32 48 64 96 128; do + for ram in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do + # 2048576 is INPUT_SIZE for DPU version + # 2048576 * uint64 ≈ 16 MiB (fits into cache) + OMP_NUM_THREADS=$nr_threads ./bs_omp 2048576 16777216 $ram -1 + # 2^27 * uint64 == 1 GiB + OMP_NUM_THREADS=$nr_threads ./bs_omp $(perl -E 'say 2 ** 27') 16777216 $ram -1 + # 2^29 * uint64 == 4 GiB + OMP_NUM_THREADS=$nr_threads ./bs_omp $(perl -E 'say 2 ** 29') 16777216 $ram -1 + done + done +done + +) | tee $fn + +xz -f -v -9 -M 800M $fn |