diff options
-rw-r--r-- | Makefile | 23 | ||||
-rwxr-xr-x | benchmark-scripts/milos-copy.sh | 27 | ||||
-rwxr-xr-x | benchmark-scripts/milos-read.sh | 27 | ||||
-rwxr-xr-x | benchmark-scripts/milos-write.sh | 27 | ||||
-rw-r--r-- | mbw.c | 262 | ||||
-rwxr-xr-x | milos-roofline.sh | 14 |
6 files changed, 304 insertions, 76 deletions
@@ -1,20 +1,35 @@ EXTRA_CFLAGS = EXTRA_LIBS = -ifdef pthread + +native ?= 1 +pthread ?= 0 +numa ?= 0 +avx512 ?= 0 +debug ?= 0 + +ifeq (${native}, 1) + EXTRA_CFLAGS += -march=native +endif + +ifeq (${pthread}, 1) EXTRA_CFLAGS += -DMULTITHREADED -pthread endif -ifdef numa +ifeq (${numa}, 1) EXTRA_CFLAGS += -DNUMA EXTRA_LIBS += -lnuma endif -ifdef avx512 +ifeq (${avx512}, 1) EXTRA_CFLAGS += -DHAVE_AVX512 endif +ifeq (${debug}, 1) + EXTRA_CFLAGS += -ggdb +endif + mbw: mbw.c - gcc -Wall -Wextra -pedantic -O3 -march=native ${EXTRA_CFLAGS} -o mbw mbw.c ${EXTRA_LIBS} + gcc -Wall -Wextra -pedantic -O3 ${EXTRA_CFLAGS} -o mbw mbw.c ${EXTRA_LIBS} .PHONY: clean clean: diff --git a/benchmark-scripts/milos-copy.sh b/benchmark-scripts/milos-copy.sh new file mode 100755 index 0000000..81c7764 --- /dev/null +++ b/benchmark-scripts/milos-copy.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +mkdir -p log/${HOST} + +make -B numa=1 pthread=1 avx512=1 + +fn=log/${HOST}/copy-memcpy +echo "\n${fn}\n" +echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t0 4096 \ + ::: ram_in $(seq 0 17) \ + ::: ram_out $(seq 0 17) \ + ::: cpu $(seq 0 7) \ + ::: nr_threads 1 2 4 6 8 10 12 14 16 \ +>> ${fn}.txt + +fn=log/${HOST}/copy-avx512 +echo "\n${fn}\n" +echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t3 4096 \ + ::: ram_in $(seq 0 17) \ + ::: ram_out $(seq 0 17) \ + ::: cpu $(seq 0 7) \ + ::: nr_threads 1 2 4 6 8 10 12 14 16 \ +>> ${fn}.txt diff --git a/benchmark-scripts/milos-read.sh b/benchmark-scripts/milos-read.sh new file mode 100755 index 0000000..1f77cfe --- /dev/null +++ b/benchmark-scripts/milos-read.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +mkdir -p log/${HOST} + +make -B numa=1 pthread=1 avx512=1 + +fn=log/${HOST}/read-64bit +echo "\n${fn}\n" +echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t4 4096 \ + ::: ram_in $(seq 0 17) \ + :::+ ram_out $(seq 0 17) \ + ::: cpu $(seq 0 7) \ + ::: nr_threads $(seq 1 16) \ +>> ${fn}.txt + +fn=log/${HOST}/read-avx512 +echo "\n${fn}\n" +echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t6 4096 \ + ::: ram_in $(seq 0 17) \ + :::+ ram_out $(seq 0 17) \ + ::: cpu $(seq 0 7) \ + ::: nr_threads $(seq 1 16) \ +>> ${fn}.txt diff --git a/benchmark-scripts/milos-write.sh b/benchmark-scripts/milos-write.sh new file mode 100755 index 0000000..ba50133 --- /dev/null +++ b/benchmark-scripts/milos-write.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +mkdir -p log/${HOST} + +make -B numa=1 pthread=1 avx512=1 + +fn=log/${HOST}/write-64bit +echo "\n${fn}\n" +echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t5 4096 \ + ::: ram_out $(seq 0 17) \ + :::+ ram_in $(seq 0 17) \ + ::: cpu $(seq 0 7) \ + ::: nr_threads $(seq 1 16) \ +>> ${fn}.txt + +fn=log/${HOST}/write-avx512 +echo "\n${fn}\n" +echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt +parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ + ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t7 4096 \ + ::: ram_out $(seq 0 17) \ + :::+ ram_in $(seq 0 17) \ + ::: cpu $(seq 0 7) \ + ::: nr_threads $(seq 1 16) \ +>> ${fn}.txt @@ -3,6 +3,7 @@ */ #define _GNU_SOURCE +#include <assert.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> @@ -41,7 +42,9 @@ #define TEST_AVX512 3 #define TEST_READ_PLAIN 4 #define TEST_WRITE_PLAIN 5 -#define MAX_TESTS 6 +#define TEST_READ_AVX512 6 +#define TEST_WRITE_AVX512 7 +#define MAX_TESTS 8 /* version number */ #define VERSION "1.5+smaug" @@ -74,12 +77,17 @@ pthread_t *threads; sem_t start_sem, stop_sem, sync_sem; #endif -long *arr_a, *arr_b; /* the two arrays to be copied from/to */ +long *arr_a = NULL; +long *arr_b = NULL; /* the two arrays to be copied from/to */ unsigned long long arr_size=0; /* array size (elements in array) */ unsigned int test_type; /* fixed memcpy block size for -t2 */ unsigned long long block_size=DEFAULT_BLOCK_SIZE; +int sanity_check = 0; +long arr_a_sum = 0; +long *partial_sum; + #ifdef NUMA void* mp_pages[1]; int mp_status[1]; @@ -189,27 +197,27 @@ static inline void rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) { __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7; - - while (n >= 512) { - zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64)); - n -= 512; - zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64)); - zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64)); - zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64)); - zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64)); - zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64)); - zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64)); - zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64)); - src = src + 512; - _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0); - _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1); - _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2); - _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3); - _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4); - _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5); - _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6); - _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7); - dst = dst + 512; + const uint8_t *end = src + n; + + while (src < end) { + zmm0 = _mm512_load_si512((const void *)(src + 0 * 64)); + zmm1 = _mm512_load_si512((const void *)(src + 1 * 64)); + zmm2 = _mm512_load_si512((const void *)(src + 2 * 64)); + zmm3 = _mm512_load_si512((const void *)(src + 3 * 64)); + zmm4 = _mm512_load_si512((const void *)(src + 4 * 64)); + zmm5 = _mm512_load_si512((const void *)(src + 5 * 64)); + zmm6 = _mm512_load_si512((const void *)(src + 6 * 64)); + zmm7 = _mm512_load_si512((const void *)(src + 7 * 64)); + _mm512_store_si512((void *)(dst + 0 * 64), zmm0); + _mm512_store_si512((void *)(dst + 1 * 64), zmm1); + _mm512_store_si512((void *)(dst + 2 * 64), zmm2); + _mm512_store_si512((void *)(dst + 3 * 64), zmm3); + _mm512_store_si512((void *)(dst + 4 * 64), zmm4); + _mm512_store_si512((void *)(dst + 5 * 64), zmm5); + _mm512_store_si512((void *)(dst + 6 * 64), zmm6); + _mm512_store_si512((void *)(dst + 7 * 64), zmm7); + src += 512; + dst += 512; } } @@ -339,14 +347,19 @@ void usage() printf("Options:\n"); printf(" -n: number of runs per test (0 to run forever)\n"); printf(" -a: Don't display average\n"); + printf(" -C: enable sanity checks\n"); printf(" -t%d: memcpy test\n", TEST_MEMCPY); printf(" -t%d: plain (b[i]=a[i] style) test\n", TEST_PLAIN); printf(" -t%d: memcpy test with fixed block size\n", TEST_MCBLOCK); #ifdef HAVE_AVX512 printf(" -t%d: AVX512 copy test\n", TEST_AVX512); #endif - printf(" -t%d: plain read test\n", TEST_READ_PLAIN); - printf(" -t%d: plain write test\n", TEST_WRITE_PLAIN); + printf(" -t%d: plain read test (sum)\n", TEST_READ_PLAIN); + printf(" -t%d: plain write test (const fill)\n", TEST_WRITE_PLAIN); +#ifdef HAVE_AVX512 + printf(" -t%d: AVX512 read test (sum)\n", TEST_READ_AVX512); + printf(" -t%d: AVX512 write test (const fill)\n", TEST_WRITE_AVX512); +#endif printf(" -b <size>: block size in bytes for -t2 (default: %d)\n", DEFAULT_BLOCK_SIZE); printf(" -q: quiet (print statistics only)\n"); #ifdef NUMA @@ -363,7 +376,7 @@ void usage() /* allocate a test array and fill it with data * so as to force Linux to _really_ allocate it */ -long *make_array() +long *make_array(long *sum) { unsigned long long t; unsigned int long_size=sizeof(long); @@ -384,6 +397,12 @@ long *make_array() for(t=0; t<arr_size; t++) { a[t]=0xaa; } + if (sum != NULL) { + *sum = 0; + for(t=0; t<arr_size; t++) { + *sum += 0xaa; + } + } return a; } @@ -426,14 +445,42 @@ void *thread_worker(void *arg) } else if(test_type==TEST_READ_PLAIN) { long tmp = 0; for(t=plain_start; t<plain_stop; t++) { - tmp ^= arr_a[t]; + tmp += arr_a[t]; + } + if (sanity_check) { + partial_sum[thread_id] = tmp; } - arr_b[plain_stop-1] = tmp; } else if(test_type==TEST_WRITE_PLAIN) { - long tmp = 0; + long tmp = 1374181804651713298; for(t=plain_start; t<plain_stop; t++) { - arr_b[t] = ++tmp; + arr_b[t] = tmp; + } +#ifdef HAVE_AVX512 + } else if(test_type==TEST_READ_AVX512) { + __m512i zmm0 = _mm512_setzero_epi32(); + __m512i zmm1; + uint8_t *src = (uint8_t*)(arr_a + (plain_start & ~0x0000000000000007)); + const uint8_t *end = (uint8_t*)(arr_a + (plain_stop & ~0x0000000000000007)); + long tmp = 0; + while (src < end) { + zmm1 = _mm512_load_si512((const void *)src); + zmm0 = _mm512_add_epi64(zmm0, zmm1); + src += 64; } + tmp += (long)_mm512_reduce_add_epi64(zmm0); + if (sanity_check) { + partial_sum[thread_id] = tmp; + } + } else if(test_type==TEST_WRITE_AVX512) { + const long src = 0x0707070707070707; + uint8_t *dst = (uint8_t*)(arr_b + (plain_start & ~0x0000000000000007)); + const uint8_t *end = (uint8_t*)(arr_b + (plain_stop & ~0x0000000000000007)); + __m512i zmm0 = _mm512_load_si512(&src); + while (dst < end) { + _mm512_store_si512((void*)(dst), zmm0); + dst += 64; + } +#endif // HAVE_AVX512 } if (sem_post(&stop_sem) != 0) { err(1, "sem_post(stop_sem)"); @@ -522,10 +569,12 @@ double worker() long tmp = 0; clock_gettime(CLOCK_MONOTONIC, &starttime); for(t=0; t<arr_size; t++) { - tmp ^= arr_a[t]; + tmp += arr_a[t]; } clock_gettime(CLOCK_MONOTONIC, &endtime); - arr_b[arr_size-1] = tmp; + if (sanity_check) { + assert(tmp == arr_a_sum); + } } else if(test_type==TEST_WRITE_PLAIN) { long tmp = 0; clock_gettime(CLOCK_MONOTONIC, &starttime); @@ -533,8 +582,42 @@ double worker() arr_b[t] = ++tmp; } clock_gettime(CLOCK_MONOTONIC, &endtime); +#ifdef HAVE_AVX512 + } else if(test_type==TEST_READ_AVX512) { + __m512i zmm0 = _mm512_setzero_epi32(); + __m512i zmm1; + long tmp = 0; + uint8_t *src = (uint8_t*)arr_a; + const uint8_t *end = src + arr_size * sizeof(long); + clock_gettime(CLOCK_MONOTONIC, &starttime); + while (src < end) { + zmm1 = _mm512_load_si512((const void *)src); + zmm0 = _mm512_add_epi64(zmm0, zmm1); + src += 64; + } + clock_gettime(CLOCK_MONOTONIC, &endtime); + tmp = (long)_mm512_reduce_add_epi64(zmm0); + if (sanity_check) { + if (tmp != arr_a_sum) { + printf("expected: arr_a_sum == %12ld (%016lx)\n", arr_a_sum, arr_a_sum); + printf("output: reduce_add == %12ld (%016lx)\n", tmp, tmp); + } + assert(tmp == arr_a_sum); + } + } else if(test_type==TEST_WRITE_AVX512) { + const uint8_t *src = (uint8_t*)arr_b; + uint8_t *dst = (uint8_t*)arr_b; + const uint8_t *end = dst + arr_size * sizeof(long); + __m512i zmm0 = _mm512_load_si512(src); + clock_gettime(CLOCK_MONOTONIC, &starttime); + while (dst < end) { + _mm512_store_si512((void*)(dst), zmm0); + dst += 64; + } + clock_gettime(CLOCK_MONOTONIC, &endtime); +#endif // HAVE_AVX512 } -#endif // MULTITHREADED +#endif // !MULTITHREADED te=((double)(endtime.tv_sec*1000000000-starttime.tv_sec*1000000000+endtime.tv_nsec-starttime.tv_nsec))/1000000000; @@ -590,8 +673,12 @@ int main(int argc, char **argv) tests[1]=0; tests[2]=0; tests[3]=0; + tests[4]=0; + tests[5]=0; + tests[6]=0; + tests[7]=0; - while((o=getopt(argc, argv, "ha:b:c:qn:N:t:B:")) != EOF) { + while((o=getopt(argc, argv, "ha:b:c:qn:N:t:B:C")) != EOF) { switch(o) { case 'h': usage(); @@ -631,6 +718,9 @@ int main(int argc, char **argv) exit(1); } break; + case 'C': + sanity_check = 1; + break; case 'q': /* quiet */ quiet=1; break; @@ -639,14 +729,36 @@ int main(int argc, char **argv) } } - /* default is to run all tests if no specific tests were requested */ - if( (tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]) == 0) { +#ifndef HAVE_AVX512 + if (tests[TEST_AVX512]) { + printf("Error: AVX512 memcpy requested, but this mbw build has been compiled without AVX512 support\n"); + exit(1); + } + if (tests[TEST_READ_AVX512]) { + printf("Error: AVX512 read requested, but this mbw build has been compiled without AVX512 support\n"); + exit(1); + } + if (tests[TEST_WRITE_AVX512]) { + printf("Error: AVX512 write requested, but this mbw build has been compiled without AVX512 support\n"); + exit(1); + } +#endif + + /* default is to run most tests if no specific tests were requested */ + if( (tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]+tests[6]+tests[7]) == 0) { tests[0]=1; tests[1]=1; tests[2]=1; + tests[4]=1; + tests[5]=1; +#if HAVE_AVX512 + tests[3]=1; + tests[6]=1; + tests[7]=1; +#endif } - if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]) != 1) ) { + if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]+tests[6]+tests[7]) != 1) ) { printf("Error: nr_loops can be zero if only one test selected!\n"); exit(1); } @@ -675,7 +787,6 @@ int main(int argc, char **argv) if(!quiet) { printf("Long uses %d bytes. ", long_size); - printf("Allocating 2*%lld elements = %lld bytes of memory.\n", arr_size, 2*arr_size*long_size); if(tests[2]) { printf("Using %lld bytes as blocks for memcpy block copy test.\n", block_size); } @@ -689,7 +800,12 @@ int main(int argc, char **argv) numa_free_nodemask(bitmask_a); } #endif - arr_a=make_array(); + if (tests[TEST_MEMCPY]+tests[TEST_PLAIN]+tests[TEST_MCBLOCK]+tests[TEST_AVX512]+tests[TEST_READ_PLAIN]+tests[TEST_READ_AVX512]) { + if (!quiet) { + printf("Allocating %lld elements = %lld MiB of input memory.\n", arr_size, arr_size*long_size / 1024 / 1024); + } + arr_a=make_array(&arr_a_sum); + } #ifdef NUMA if (bitmask_b) { @@ -697,7 +813,12 @@ int main(int argc, char **argv) numa_free_nodemask(bitmask_b); } #endif - arr_b=make_array(); + if (tests[TEST_MEMCPY]+tests[TEST_PLAIN]+tests[TEST_MCBLOCK]+tests[TEST_AVX512]+tests[TEST_WRITE_PLAIN]+tests[TEST_WRITE_AVX512]) { + if (!quiet) { + printf("Allocating %lld elements = %lld MiB of output memory.\n", arr_size, arr_size*long_size / 1024 / 1024); + } + arr_b=make_array(NULL); + } #ifdef NUMA numa_set_membind(bitmask_all); @@ -705,26 +826,30 @@ int main(int argc, char **argv) #endif #ifdef NUMA - mp_pages[0] = arr_a; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(arr_a)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_a = mp_status[0]; + if (arr_a != NULL) { + mp_pages[0] = arr_a; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(arr_a)"); + } + else if (mp_status[0] < 0) { + printf("move_pages(arr_a) error: %d\n", mp_status[0]); + } + else { + numa_node_a = mp_status[0]; + } } - mp_pages[0] = arr_b; - if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { - perror("move_pages(arr_b)"); - } - else if (mp_status[0] < 0) { - printf("move_pages error: %d", mp_status[0]); - } - else { - numa_node_b = mp_status[0]; + if (arr_b != NULL) { + mp_pages[0] = arr_b; + if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { + perror("move_pages(arr_b)"); + } + else if (mp_status[0] < 0) { + printf("move_pages(arr_b) error: %d\n", mp_status[0]); + } + else { + numa_node_b = mp_status[0]; + } } if (numa_node_cpu != -1) { @@ -751,7 +876,13 @@ int main(int argc, char **argv) err(1, "sem_init"); } threads = calloc(num_threads, sizeof(pthread_t)); + if (sanity_check) { + partial_sum = calloc(num_threads, sizeof(long)); + } for (i=0; i < num_threads; i++) { + if (sanity_check) { + partial_sum[i] = 0; + } if (pthread_create(&threads[i], NULL, thread_worker, (void*)(unsigned long)i) != 0) { err(1, "pthread_create"); } @@ -777,6 +908,10 @@ int main(int argc, char **argv) printf("[::] read"); } else if (test_type == TEST_WRITE_PLAIN) { printf("[::] write"); + } else if (test_type == TEST_READ_AVX512) { + printf("[::] read-avx512"); + } else if (test_type == TEST_WRITE_AVX512) { + printf("[::] write-avx512"); } printf(" | block_size_B=%llu array_size_B=%llu ", block_size, arr_size*long_size); #ifdef MULTITHREADED @@ -802,6 +937,17 @@ int main(int argc, char **argv) err(1, "pthread_join"); } } + if (sanity_check && (tests[TEST_READ_PLAIN] || tests[TEST_READ_AVX512])) { + long tmp = 0; + for (i=0; i < num_threads; i++) { + tmp += partial_sum[i]; + } + if (tmp != arr_a_sum) { + printf("expected: arr_a_sum == %12ld (%016lx)\n", arr_a_sum, arr_a_sum); + printf("output: sum(partial) == %12ld (%016lx)\n", tmp, tmp); + } + assert(tmp == arr_a_sum); + } #endif free(arr_a); diff --git a/milos-roofline.sh b/milos-roofline.sh deleted file mode 100755 index 092d147..0000000 --- a/milos-roofline.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -mkdir -p log -fn=log/${HOST}-roofline - -make -B numa=1 pthread=1 - -parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \ - ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t0 4096 \ - ::: ram_in $(seq 0 15) \ - ::: ram_out $(seq 0 15) \ - ::: cpu $(seq 0 7) \ - ::: nr_threads $(seq 1 16) \ ->> ${fn}.txt |