diff options
Diffstat (limited to 'BS/baselines/cpu')
-rw-r--r-- | BS/baselines/cpu/Makefile | 3 | ||||
-rw-r--r-- | BS/baselines/cpu/bs_omp.c | 98 |
2 files changed, 97 insertions, 4 deletions
diff --git a/BS/baselines/cpu/Makefile b/BS/baselines/cpu/Makefile index bed2561..b67602f 100644 --- a/BS/baselines/cpu/Makefile +++ b/BS/baselines/cpu/Makefile @@ -1,4 +1,5 @@ NUMA ?= 0 +NUMA_MEMCPY ?= 0 FLAGS = ifeq (${NUMA}, 1) @@ -9,7 +10,7 @@ endif all: bs_omp bs_omp: bs_omp.c - gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} bs_omp.c -o bs_omp -fopenmp ${FLAGS} + gcc -Wall -Wextra -pedantic -march=native -O2 -DNUMA=${NUMA} -DNUMA_MEMCPY=${NUMA_MEMCPY} bs_omp.c -o bs_omp -fopenmp ${FLAGS} bs_omp_O0: bs_omp.c gcc bs_omp.c -o bs_omp_O0 -fopenmp diff --git a/BS/baselines/cpu/bs_omp.c b/BS/baselines/cpu/bs_omp.c index 1b9947c..874299b 100644 --- a/BS/baselines/cpu/bs_omp.c +++ b/BS/baselines/cpu/bs_omp.c @@ -21,6 +21,15 @@ int numa_node_in = -1; int numa_node_cpu = -1; #endif + +#if NUMA_MEMCPY +struct bitmask* bitmask_cpu; +int numa_node_cpu_memcpy = -1; +int numa_node_local = -1; +int numa_node_in_is_local = 0; +#endif + + #define DTYPE uint64_t /* * @brief creates a "test file" by filling a bufferwith values @@ -90,6 +99,10 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne bitmask_in = numa_parse_nodestring(argv[3]); numa_node_cpu = atoi(argv[4]); #endif +#if NUMA_MEMCPY + bitmask_cpu = numa_parse_nodestring(argv[5]); + numa_node_cpu_memcpy = atoi(argv[6]); +#endif printf("Vector size: %lu, num searches: %lu\n", input_size, n_querys); @@ -106,10 +119,17 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne #endif #if NUMA +#if NUMA_MEMCPY + if (bitmask_cpu) { + numa_set_membind(bitmask_cpu); + numa_free_nodemask(bitmask_cpu); + } +#else struct bitmask *bitmask_all = numa_allocate_nodemask(); numa_bitmask_setall(bitmask_all); numa_set_membind(bitmask_all); numa_free_nodemask(bitmask_all); +#endif // NUMA_MEMCPY #endif DTYPE result_host = -1; @@ -137,10 +157,71 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne } #endif +#if NUMA_MEMCPY + numa_node_in_is_local = ((numa_node_cpu == numa_node_in) || (numa_node_cpu + 8 == numa_node_in)) * 1; +#endif + +#if NUMA_MEMCPY + DTYPE *input_local = input; + DTYPE *querys_local = querys; + start(&timer, 1, 0); + if (!numa_node_in_is_local) { + input_local = numa_alloc((input_size) * sizeof(DTYPE)); + querys_local = numa_alloc((n_querys) * sizeof(DTYPE)); + } + stop(&timer, 1); + if (!numa_node_in_is_local) { + if (numa_node_cpu_memcpy != -1) { + if (numa_run_on_node(numa_node_cpu_memcpy) == -1) { + perror("numa_run_on_node"); + numa_node_cpu_memcpy = -1; + } + } + } + start(&timer, 2, 0); + if (!numa_node_in_is_local) { + memcpy(input_local, input, input_size * sizeof(DTYPE)); + memcpy(querys_local, querys, n_querys * sizeof(DTYPE)); + } else { + input_local = input; + querys_local = querys; + } + stop(&timer, 2); + if (numa_node_cpu != -1) { + if (numa_run_on_node(numa_node_cpu) == -1) { + perror("numa_run_on_node"); + numa_node_cpu = -1; + } + } + mp_pages[0] = input_local; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(input_local)"); + } + else if (mp_status[0] < 0) { + printf("move_pages error: %d", mp_status[0]); + } + else { + numa_node_local = mp_status[0]; + } +#endif + start(&timer, 0, 0); +#if NUMA_MEMCPY + result_host = binarySearch(input_local, input_size - 1, querys_local, n_querys); +#else result_host = binarySearch(input, input_size - 1, querys, n_querys); +#endif stop(&timer, 0); +#if NUMA_MEMCPY + start(&timer, 3, 0); + if (!numa_node_in_is_local) { + numa_free(input_local, input_size * sizeof(DTYPE)); + numa_free(querys_local, n_querys * sizeof(DTYPE)); + } + stop(&timer, 3); +#endif + unsigned int nr_threads = 0; #pragma omp parallel #pragma omp atomic @@ -148,6 +229,17 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne int status = (result_host); if (status) { +#if NUMA_MEMCPY + printf("[::] BS-CPU-MEMCPY | n_threads=%d e_type=%s n_elements=%lu" + " numa_node_in=%d numa_node_local=%d numa_node_cpu=%d numa_node_cpu_memcpy=%d numa_distance_in_cpu=%d" + " | throughput_MBps=%f throughput_MOpps=%f" + " latency_kernel_us=%f latency_alloc_us=%f latency_memcpy_us=%f latency_free_us=%f latency_total_us=%f\n", + nr_threads, "uint64_t", input_size, + numa_node_in, numa_node_local, numa_node_cpu, numa_node_cpu_memcpy, numa_distance(numa_node_in, numa_node_cpu), + n_querys * sizeof(DTYPE) / timer.time[0], n_querys / timer.time[0], + timer.time[0], timer.time[1], timer.time[2], timer.time[3], + timer.time[0] + timer.time[1] + timer.time[2] + timer.time[3]); +#else printf("[::] BS-CPU | n_threads=%d e_type=%s n_elements=%lu" #if NUMA " numa_node_in=%d numa_node_cpu=%d numa_distance_in_cpu=%d" @@ -158,9 +250,9 @@ uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigne numa_node_in, numa_node_cpu, numa_distance(numa_node_in, numa_node_cpu), #endif n_querys * sizeof(DTYPE) / timer.time[0]); - printf(" throughput_MOpps=%f", - n_querys / timer.time[0]); - printall(&timer, 0); + printf(" throughput_MOpps=%f latency_us=%f\n", + n_querys / timer.time[0], timer.time[0]); +#endif } else { printf("[ERROR]\n"); } |