author     Birte Kristina Friesel <birte.friesel@uos.de>  2024-09-19 15:15:01 +0200
committer  Birte Kristina Friesel <birte.friesel@uos.de>  2024-09-19 15:15:01 +0200
commit     ae4d77d864fb89b4b2d2d39445da6017156f236c (patch)
tree       fa2aa0b86bd7db172483acb9b0e19f6cfa9d24a9
parent     9f814a130b40c455144c0c2f946ea9faebeeba34 (diff)
Add AVX512 copy variant. Not particularly efficient yet, might be missing something.
-rw-r--r--  Makefile    4
-rw-r--r--  mbw.c     277
2 files changed, 276 insertions(+), 5 deletions(-)
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,10 @@ ifdef numa
 	EXTRA_LIBS += -lnuma
 endif
 
+ifdef avx512
+	EXTRA_CFLAGS += -DHAVE_AVX512
+endif
+
 mbw: mbw.c
 	gcc -Wall -Wextra -pedantic -O3 -march=native ${EXTRA_CFLAGS} -o mbw mbw.c ${EXTRA_LIBS}
diff --git a/mbw.c b/mbw.c
--- a/mbw.c
+++ b/mbw.c
@@ -13,6 +13,11 @@
 #include <string.h>
 #include <unistd.h>
 
+#ifdef HAVE_AVX512
+#include <stdint.h>
+#include <immintrin.h>
+#endif
+
 #ifdef MULTITHREADED
 #include <pthread.h>
 #include <semaphore.h>
@@ -26,8 +31,8 @@
 /* how many runs to average by default */
 #define DEFAULT_NR_LOOPS 40
 
-/* we have 3 tests at the moment */
-#define MAX_TESTS 3
+/* we have 4 tests at the moment */
+#define MAX_TESTS 4
 
 /* default block size for test 2, in bytes */
 #define DEFAULT_BLOCK_SIZE 262144
@@ -36,6 +41,7 @@
 #define TEST_MEMCPY 0
 #define TEST_DUMB 1
 #define TEST_MCBLOCK 2
+#define TEST_AVX512 3
 
 /* version number */
 #define VERSION "1.5+smaug"
@@ -85,6 +91,247 @@
 struct bitmask* bitmask_a = NULL;
 struct bitmask* bitmask_b = NULL;
 #endif
 
+#ifdef HAVE_AVX512
+
+/**
+ * AVX512 implementation taken from
+ * <https://lore.kernel.org/all/1453086314-30158-4-git-send-email-zhihong.wang@intel.com/>
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm0;
+
+	xmm0 = _mm_loadu_si128((const __m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_loadu_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	__m512i zmm0;
+
+	zmm0 = _mm512_loadu_si512((const void *)src);
+	_mm512_storeu_si512((void *)dst, zmm0);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov64(dst + 0 * 64, src + 0 * 64);
+	rte_mov64(dst + 1 * 64, src + 1 * 64);
+}
+
+/**
+ * Copy 256 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov64(dst + 0 * 64, src + 0 * 64);
+	rte_mov64(dst + 1 * 64, src + 1 * 64);
+	rte_mov64(dst + 2 * 64, src + 2 * 64);
+	rte_mov64(dst + 3 * 64, src + 3 * 64);
+}
+
+/**
+ * Copy 128-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	__m512i zmm0, zmm1;
+
+	while (n >= 128) {
+		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+		n -= 128;
+		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+		src = src + 128;
+		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+		dst = dst + 128;
+	}
+}
+
+/**
+ * Copy 512-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+
+	while (n >= 512) {
+		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+		n -= 512;
+		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
+		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
+		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
+		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
+		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
+		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
+		src = src + 512;
+		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
+		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
+		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
+		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
+		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
+		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
+		dst = dst + 512;
+	}
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	uintptr_t dstu = (uintptr_t)dst;
+	uintptr_t srcu = (uintptr_t)src;
+	void *ret = dst;
+	size_t dstofss;
+	size_t bits;
+
+	/**
+	 * Copy less than 16 bytes
+	 */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dstu = *(const uint8_t *)srcu;
+			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+			dstu = (uintptr_t)((uint8_t *)dstu + 1);
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dstu = *(const uint16_t *)srcu;
+			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+			dstu = (uintptr_t)((uint16_t *)dstu + 1);
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dstu = *(const uint32_t *)srcu;
+			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+			dstu = (uintptr_t)((uint32_t *)dstu + 1);
+		}
+		if (n & 0x08)
+			*(uint64_t *)dstu = *(const uint64_t *)srcu;
+		return ret;
+	}
+
+	/**
+	 * Fast way when copy size doesn't exceed 512 bytes
+	 */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n,
+				(const uint8_t *)src - 32 + n);
+		return ret;
+	}
+	if (n <= 512) {
+		if (n >= 256) {
+			n -= 256;
+			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			n -= 128;
+			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+COPY_BLOCK_128_BACK63:
+		if (n > 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov64((uint8_t *)dst - 64 + n,
+					(const uint8_t *)src - 64 + n);
+			return ret;
+		}
+		if (n > 0)
+			rte_mov64((uint8_t *)dst - 64 + n,
+					(const uint8_t *)src - 64 + n);
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 512 bytes
+	 */
+	dstofss = ((uintptr_t)dst & 0x3F);
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 512-byte blocks.
+	 * Use copy block function for better instruction order control,
+	 * which is important when load is unaligned.
+	 */
+	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	bits = n;
+	n = n & 511;
+	bits -= n;
+	src = (const uint8_t *)src + bits;
+	dst = (uint8_t *)dst + bits;
+
+	/**
+	 * Copy 128-byte blocks.
+	 * Use copy block function for better instruction order control,
+	 * which is important when load is unaligned.
+	 */
+	if (n >= 128) {
+		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+		bits = n;
+		n = n & 127;
+		bits -= n;
+		src = (const uint8_t *)src + bits;
+		dst = (uint8_t *)dst + bits;
+	}
+
+	/**
+	 * Copy whatever left
+	 */
+	goto COPY_BLOCK_128_BACK63;
+}
+#endif
+
 void usage()
 {
 	printf("mbw memory benchmark v%s, https://github.com/raas/mbw\n", VERSION);
@@ -95,6 +342,9 @@ void usage()
 	printf(" -t%d: memcpy test\n", TEST_MEMCPY);
 	printf(" -t%d: dumb (b[i]=a[i] style) test\n", TEST_DUMB);
 	printf(" -t%d: memcpy test with fixed block size\n", TEST_MCBLOCK);
+#ifdef HAVE_AVX512
+	printf(" -t%d: AVX512 copy test\n", TEST_AVX512);
+#endif
 	printf(" -b <size>: block size in bytes for -t2 (default: %d)\n", DEFAULT_BLOCK_SIZE);
 	printf(" -q: quiet (print statistics only)\n");
 #ifdef NUMA
@@ -117,7 +367,11 @@ long *make_array()
 	unsigned int long_size=sizeof(long);
 	long *a;
 
+#ifdef HAVE_AVX512
+	a=aligned_alloc(64, arr_size * long_size);
+#else
 	a=calloc(arr_size, long_size);
+#endif
 	if(NULL==a) {
 		perror("Error allocating memory");
@@ -163,6 +417,10 @@ void *thread_worker(void *arg)
 		for(t=dumb_start; t<dumb_stop; t++) {
 			arr_b[t]=arr_a[t];
 		}
+#ifdef HAVE_AVX512
+	} else if(test_type==TEST_AVX512) {
+		rte_memcpy(arr_b, arr_a, array_bytes);
+#endif // HAVE_AVX512
 	}
 	if (sem_post(&stop_sem) != 0) {
 		err(1, "sem_post(stop_sem)");
@@ -241,8 +499,14 @@ double worker()
 			arr_b[t]=arr_a[t];
 		}
 		clock_gettime(CLOCK_MONOTONIC, &endtime);
+#ifdef HAVE_AVX512
+	} else if(test_type==TEST_AVX512) {
+		clock_gettime(CLOCK_MONOTONIC, &starttime);
+		rte_memcpy(arr_b, arr_a, array_bytes);
+		clock_gettime(CLOCK_MONOTONIC, &endtime);
+#endif // HAVE_AVX512
 	}
-#endif
+#endif // MULTITHREADED
 
 	te=((double)(endtime.tv_sec*1000000000-starttime.tv_sec*1000000000+endtime.tv_nsec-starttime.tv_nsec))/1000000000;
@@ -297,6 +561,7 @@ int main(int argc, char **argv)
 	tests[0]=0;
 	tests[1]=0;
 	tests[2]=0;
+	tests[3]=0;
 
 	while((o=getopt(argc, argv, "ha:b:c:qn:N:t:B:")) != EOF) {
 		switch(o) {
@@ -347,13 +612,13 @@ int main(int argc, char **argv)
 	}
 
 	/* default is to run all tests if no specific tests were requested */
-	if( (tests[0]+tests[1]+tests[2]) == 0) {
+	if( (tests[0]+tests[1]+tests[2]+tests[3]) == 0) {
 		tests[0]=1;
 		tests[1]=1;
 		tests[2]=1;
 	}
 
-	if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]) != 1) ) {
+	if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]) != 1) ) {
 		printf("Error: nr_loops can be zero if only one test selected!\n");
 		exit(1);
 	}
@@ -478,6 +743,8 @@ int main(int argc, char **argv)
 			printf("[::] copy");
 		} else if (test_type == TEST_MCBLOCK) {
 			printf("[::] mcblock");
+		} else if (test_type == TEST_AVX512) {
+			printf("[::] copy-avx512");
 		}
 		printf(" | block_size_B=%llu array_size_B=%llu ", block_size, arr_size*long_size);
 #ifdef MULTITHREADED
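
Since the commit message itself flags the new path as possibly incomplete, a quick cross-check against libc memcpy is a natural companion to this diff. The harness below is only a sketch and not part of the commit: the file name avx512_check.c, the fill pattern, and the size/offset sweep are invented for illustration, and it assumes the rte_mov16 .. rte_memcpy definitions from the hunk above are pasted in front of main(). It should exercise every dispatch path in rte_memcpy. (Within mbw itself, the new code path is built with `make avx512=1` and selected with `-t3`, e.g. `./mbw -t3 256` for a 256 MiB array.)

/*
 * avx512_check.c -- hypothetical self-test, NOT part of this commit.
 * Paste (or #include) the rte_mov16 .. rte_memcpy definitions from the
 * diff above before main(), then build with:
 *     gcc -O3 -march=native -DHAVE_AVX512 avx512_check.c -o avx512_check
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	enum { MAX = 4096, PAD = 64 };
	const size_t offs[] = { 0, 1, 7, 63 };   /* arbitrary misalignments */
	uint8_t *src = aligned_alloc(64, MAX + PAD);
	uint8_t *dst = aligned_alloc(64, MAX + PAD);
	uint8_t *ref = aligned_alloc(64, MAX + PAD);

	if (!src || !dst || !ref) {
		perror("aligned_alloc");
		return 1;
	}
	for (size_t i = 0; i < MAX + PAD; i++)
		src[i] = (uint8_t)(i * 131 + 7);     /* non-trivial fill pattern */

	/* sweep sizes and offsets to hit every branch of rte_memcpy: the
	 * <16 B scalar tail, the <=512 B fast paths, the store-alignment
	 * fixup, and the 512/128-byte block loops */
	for (size_t n = 0; n <= MAX - 63; n++) {
		for (size_t i = 0; i < sizeof(offs) / sizeof(offs[0]); i++) {
			size_t off = offs[i];
			memset(dst, 0xAA, MAX + PAD);
			memcpy(ref, src + off, n);
			rte_memcpy(dst + off, src + off, n);
			if (memcmp(dst + off, ref, n) != 0) {
				fprintf(stderr, "mismatch: n=%zu off=%zu\n", n, off);
				return 1;
			}
		}
	}
	puts("rte_memcpy matches memcpy for all tested sizes/offsets");
	free(src);
	free(dst);
	free(ref);
	return 0;
}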
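For reference, the measurement pattern the new TEST_AVX512 branch plugs into in worker() reduces to the sketch below: wall-clock one copy with CLOCK_MONOTONIC and convert to MiB/s. This is a simplification, not mbw's actual code -- the real worker() also dispatches the other test types and has a separate MULTITHREADED path, and the name copy_mibps is ours -- but the nanosecond arithmetic is the same te computation seen in the hunk above.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Time a single copy and return its throughput in MiB/s.
 * (Under HAVE_AVX512 the copy call would be rte_memcpy instead.) */
static double copy_mibps(void *dst, const void *src, size_t bytes)
{
	struct timespec starttime, endtime;
	double te;

	clock_gettime(CLOCK_MONOTONIC, &starttime);
	memcpy(dst, src, bytes);
	clock_gettime(CLOCK_MONOTONIC, &endtime);

	/* same nanosecond arithmetic as mbw's te computation */
	te = ((double)(endtime.tv_sec*1000000000-starttime.tv_sec*1000000000
			+endtime.tv_nsec-starttime.tv_nsec))/1000000000;
	return ((double)bytes / 1048576.0) / te;
}

int main(void)
{
	size_t bytes = 256UL * 1048576;   /* 256 MiB, as in "mbw 256" */
	void *a = malloc(bytes), *b = malloc(bytes);

	if (!a || !b) {
		perror("malloc");
		return 1;
	}
	/* fault the source pages in; note that unlike mbw, which fills both
	 * arrays up front, the destination here is first touched inside the
	 * timed region, so the first run includes page-fault overhead */
	memset(a, 1, bytes);
	printf("%.2f MiB/s\n", copy_mibps(b, a, bytes));
	free(a);
	free(b);
	return 0;
}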