author     Birte Kristina Friesel <birte.friesel@uos.de>  2024-09-19 15:15:01 +0200
committer  Birte Kristina Friesel <birte.friesel@uos.de>  2024-09-19 15:15:01 +0200
commit     ae4d77d864fb89b4b2d2d39445da6017156f236c (patch)
tree       fa2aa0b86bd7db172483acb9b0e19f6cfa9d24a9
parent     9f814a130b40c455144c0c2f946ea9faebeeba34 (diff)
Add AVX512 copy variant. Not particularly efficient yet, might be missing something.
-rw-r--r--  Makefile    4
-rw-r--r--  mbw.c     277
2 files changed, 276 insertions(+), 5 deletions(-)
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,10 @@ ifdef numa
 	EXTRA_LIBS += -lnuma
 endif
 
+ifdef avx512
+	EXTRA_CFLAGS += -DHAVE_AVX512
+endif
+
 mbw: mbw.c
 	gcc -Wall -Wextra -pedantic -O3 -march=native ${EXTRA_CFLAGS} -o mbw mbw.c ${EXTRA_LIBS}
diff --git a/mbw.c b/mbw.c
--- a/mbw.c
+++ b/mbw.c
@@ -13,6 +13,11 @@
 #include <string.h>
 #include <unistd.h>
 
+#ifdef HAVE_AVX512
+#include <stdint.h>
+#include <immintrin.h>
+#endif
+
 #ifdef MULTITHREADED
 #include <pthread.h>
 #include <semaphore.h>
@@ -26,8 +31,8 @@
 /* how many runs to average by default */
 #define DEFAULT_NR_LOOPS 40
 
-/* we have 3 tests at the moment */
-#define MAX_TESTS 3
+/* we have 4 tests at the moment */
+#define MAX_TESTS 4
 
 /* default block size for test 2, in bytes */
 #define DEFAULT_BLOCK_SIZE 262144
@@ -36,6 +41,7 @@
 #define TEST_MEMCPY 0
 #define TEST_DUMB 1
 #define TEST_MCBLOCK 2
+#define TEST_AVX512 3
 
 /* version number */
 #define VERSION "1.5+smaug"
@@ -85,6 +91,247 @@
 struct bitmask* bitmask_a = NULL;
 struct bitmask* bitmask_b = NULL;
 #endif
 
+#ifdef HAVE_AVX512
+
+/**
+ * AVX512 implementation taken from
+ * <https://lore.kernel.org/all/1453086314-30158-4-git-send-email-zhihong.wang@intel.com/>
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm0;
+
+	xmm0 = _mm_loadu_si128((const __m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_loadu_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	__m512i zmm0;
+
+	zmm0 = _mm512_loadu_si512((const void *)src);
+	_mm512_storeu_si512((void *)dst, zmm0);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov64(dst + 0 * 64, src + 0 * 64);
+	rte_mov64(dst + 1 * 64, src + 1 * 64);
+}
+
+/**
+ * Copy 256 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov64(dst + 0 * 64, src + 0 * 64);
+	rte_mov64(dst + 1 * 64, src + 1 * 64);
+	rte_mov64(dst + 2 * 64, src + 2 * 64);
+	rte_mov64(dst + 3 * 64, src + 3 * 64);
+}
+
+/**
+ * Copy 128-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	__m512i zmm0, zmm1;
+
+	while (n >= 128) {
+		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+		n -= 128;
+		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+		src = src + 128;
+		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+		dst = dst + 128;
+	}
+}
+
+/**
+ * Copy 512-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+
+	while (n >= 512) {
+		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+		n -= 512;
+		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
+		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
+		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
+		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
+		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
+		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
+		src = src + 512;
+		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
+		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
+		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
+		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
+		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
+		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
+		dst = dst + 512;
+	}
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	uintptr_t dstu = (uintptr_t)dst;
+	uintptr_t srcu = (uintptr_t)src;
+	void *ret = dst;
+	size_t dstofss;
+	size_t bits;
+
+	/**
+	 * Copy less than 16 bytes
+	 */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dstu = *(const uint8_t *)srcu;
+			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+			dstu = (uintptr_t)((uint8_t *)dstu + 1);
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dstu = *(const uint16_t *)srcu;
+			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+			dstu = (uintptr_t)((uint16_t *)dstu + 1);
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dstu = *(const uint32_t *)srcu;
+			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+			dstu = (uintptr_t)((uint32_t *)dstu + 1);
+		}
+		if (n & 0x08)
+			*(uint64_t *)dstu = *(const uint64_t *)srcu;
+		return ret;
+	}
+
+	/**
+	 * Fast way when copy size doesn't exceed 512 bytes
+	 */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n,
+				(const uint8_t *)src - 32 + n);
+		return ret;
+	}
+	if (n <= 512) {
+		if (n >= 256) {
+			n -= 256;
+			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			n -= 128;
+			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+COPY_BLOCK_128_BACK63:
+		if (n > 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov64((uint8_t *)dst - 64 + n,
+					(const uint8_t *)src - 64 + n);
+			return ret;
+		}
+		if (n > 0)
+			rte_mov64((uint8_t *)dst - 64 + n,
+					(const uint8_t *)src - 64 + n);
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 512 bytes
+	 */
+	dstofss = ((uintptr_t)dst & 0x3F);
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 512-byte blocks.
+	 * Use copy block function for better instruction order control,
+	 * which is important when load is unaligned.
+	 */
+	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	bits = n;
+	n = n & 511;
+	bits -= n;
+	src = (const uint8_t *)src + bits;
+	dst = (uint8_t *)dst + bits;
+
+	/**
+	 * Copy 128-byte blocks.
+	 * Use copy block function for better instruction order control,
+	 * which is important when load is unaligned.
+	 */
+	if (n >= 128) {
+		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+		bits = n;
+		n = n & 127;
+		bits -= n;
+		src = (const uint8_t *)src + bits;
+		dst = (uint8_t *)dst + bits;
+	}
+
+	/**
+	 * Copy whatever left
+	 */
+	goto COPY_BLOCK_128_BACK63;
+}
+#endif
+
 void usage()
 {
 	printf("mbw memory benchmark v%s, https://github.com/raas/mbw\n", VERSION);
@@ -95,6 +342,9 @@ void usage()
 	printf(" -t%d: memcpy test\n", TEST_MEMCPY);
 	printf(" -t%d: dumb (b[i]=a[i] style) test\n", TEST_DUMB);
 	printf(" -t%d: memcpy test with fixed block size\n", TEST_MCBLOCK);
+#ifdef HAVE_AVX512
+	printf(" -t%d: AVX512 copy test\n", TEST_AVX512);
+#endif
 	printf(" -b <size>: block size in bytes for -t2 (default: %d)\n", DEFAULT_BLOCK_SIZE);
 	printf(" -q: quiet (print statistics only)\n");
 #ifdef NUMA
@@ -117,7 +367,11 @@ long *make_array()
 	unsigned int long_size=sizeof(long);
 	long *a;
 
+#ifdef HAVE_AVX512
+	a=aligned_alloc(64, arr_size * long_size);
+#else
 	a=calloc(arr_size, long_size);
+#endif
 	if(NULL==a) {
 		perror("Error allocating memory");
@@ -163,6 +417,10 @@ void *thread_worker(void *arg)
 		for(t=dumb_start; t<dumb_stop; t++) {
 			arr_b[t]=arr_a[t];
 		}
+#ifdef HAVE_AVX512
+	} else if(test_type==TEST_AVX512) {
+		rte_memcpy(arr_b, arr_a, array_bytes);
+#endif // HAVE_AVX512
 	}
 	if (sem_post(&stop_sem) != 0) {
 		err(1, "sem_post(stop_sem)");
@@ -241,8 +499,14 @@ double worker()
 			arr_b[t]=arr_a[t];
 		}
 		clock_gettime(CLOCK_MONOTONIC, &endtime);
+#ifdef HAVE_AVX512
+	} else if(test_type==TEST_AVX512) {
+		clock_gettime(CLOCK_MONOTONIC, &starttime);
+		rte_memcpy(arr_b, arr_a, array_bytes);
+		clock_gettime(CLOCK_MONOTONIC, &endtime);
+#endif // HAVE_AVX512
 	}
-#endif
+#endif // MULTITHREADED
 
 	te=((double)(endtime.tv_sec*1000000000-starttime.tv_sec*1000000000+endtime.tv_nsec-starttime.tv_nsec))/1000000000;
@@ -297,6 +561,7 @@ int main(int argc, char **argv)
 	tests[0]=0;
 	tests[1]=0;
 	tests[2]=0;
+	tests[3]=0;
 
 	while((o=getopt(argc, argv, "ha:b:c:qn:N:t:B:")) != EOF) {
 		switch(o) {
@@ -347,13 +612,13 @@ int main(int argc, char **argv)
 	}
 
 	/* default is to run all tests if no specific tests were requested */
-	if( (tests[0]+tests[1]+tests[2]) == 0) {
+	if( (tests[0]+tests[1]+tests[2]+tests[3]) == 0) {
 		tests[0]=1;
 		tests[1]=1;
 		tests[2]=1;
 	}
 
-	if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]) != 1) ) {
+	if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]) != 1) ) {
 		printf("Error: nr_loops can be zero if only one test selected!\n");
 		exit(1);
 	}
@@ -478,6 +743,8 @@ int main(int argc, char **argv)
 			printf("[::] copy");
 		} else if (test_type == TEST_MCBLOCK) {
 			printf("[::] mcblock");
+		} else if (test_type == TEST_AVX512) {
+			printf("[::] copy-avx512");
 		}
 		printf(" | block_size_B=%llu array_size_B=%llu ", block_size, arr_size*long_size);
 #ifdef MULTITHREADED
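
Since the commit message itself flags the new path as possibly incomplete, a quick cross-check against libc memcpy is a natural companion to this diff. The harness below is only a sketch and not part of the commit: the file name avx512_check.c, the fill pattern, and the size/offset sweep are invented for illustration, and it assumes the rte_mov16 .. rte_memcpy definitions from the hunk above are pasted in front of main(). It should exercise every dispatch path in rte_memcpy. (Within mbw itself, the new code path is built with `make avx512=1` and selected with `-t3`, e.g. `./mbw -t3 256` for a 256 MiB array.)

/*
 * avx512_check.c -- hypothetical self-test, NOT part of this commit.
 * Paste (or #include) the rte_mov16 .. rte_memcpy definitions from the
 * diff above before main(), then build with:
 *     gcc -O3 -march=native -DHAVE_AVX512 avx512_check.c -o avx512_check
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	enum { MAX = 4096, PAD = 64 };
	const size_t offs[] = { 0, 1, 7, 63 };   /* arbitrary misalignments */
	uint8_t *src = aligned_alloc(64, MAX + PAD);
	uint8_t *dst = aligned_alloc(64, MAX + PAD);
	uint8_t *ref = aligned_alloc(64, MAX + PAD);

	if (!src || !dst || !ref) {
		perror("aligned_alloc");
		return 1;
	}
	for (size_t i = 0; i < MAX + PAD; i++)
		src[i] = (uint8_t)(i * 131 + 7);     /* non-trivial fill pattern */

	/* sweep sizes and offsets to hit every branch of rte_memcpy: the
	 * <16 B scalar tail, the <=512 B fast paths, the store-alignment
	 * fixup, and the 512/128-byte block loops */
	for (size_t n = 0; n <= MAX - 63; n++) {
		for (size_t i = 0; i < sizeof(offs) / sizeof(offs[0]); i++) {
			size_t off = offs[i];
			memset(dst, 0xAA, MAX + PAD);
			memcpy(ref, src + off, n);
			rte_memcpy(dst + off, src + off, n);
			if (memcmp(dst + off, ref, n) != 0) {
				fprintf(stderr, "mismatch: n=%zu off=%zu\n", n, off);
				return 1;
			}
		}
	}
	puts("rte_memcpy matches memcpy for all tested sizes/offsets");
	free(src);
	free(dst);
	free(ref);
	return 0;
}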
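For reference, the measurement pattern the new TEST_AVX512 branch plugs into in worker() reduces to the sketch below: wall-clock one copy with CLOCK_MONOTONIC and convert to MiB/s. This is a simplification, not mbw's actual code -- the real worker() also dispatches the other test types and has a separate MULTITHREADED path, and the name copy_mibps is ours -- but the nanosecond arithmetic is the same te computation seen in the hunk above.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Time a single copy and return its throughput in MiB/s.
 * (Under HAVE_AVX512 the copy call would be rte_memcpy instead.) */
static double copy_mibps(void *dst, const void *src, size_t bytes)
{
	struct timespec starttime, endtime;
	double te;

	clock_gettime(CLOCK_MONOTONIC, &starttime);
	memcpy(dst, src, bytes);
	clock_gettime(CLOCK_MONOTONIC, &endtime);

	/* same nanosecond arithmetic as mbw's te computation */
	te = ((double)(endtime.tv_sec*1000000000-starttime.tv_sec*1000000000
			+endtime.tv_nsec-starttime.tv_nsec))/1000000000;
	return ((double)bytes / 1048576.0) / te;
}

int main(void)
{
	size_t bytes = 256UL * 1048576;   /* 256 MiB, as in "mbw 256" */
	void *a = malloc(bytes), *b = malloc(bytes);

	if (!a || !b) {
		perror("malloc");
		return 1;
	}
	/* fault the source pages in; note that unlike mbw, which fills both
	 * arrays up front, the destination here is first touched inside the
	 * timed region, so the first run includes page-fault overhead */
	memset(a, 1, bytes);
	printf("%.2f MiB/s\n", copy_mibps(b, a, bytes));
	free(a);
	free(b);
	return 0;
}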