summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBirte Kristina Friesel <birte.friesel@uos.de>2024-09-19 15:15:01 +0200
committerBirte Kristina Friesel <birte.friesel@uos.de>2024-09-19 15:15:01 +0200
commitae4d77d864fb89b4b2d2d39445da6017156f236c (patch)
treefa2aa0b86bd7db172483acb9b0e19f6cfa9d24a9
parent9f814a130b40c455144c0c2f946ea9faebeeba34 (diff)
Add AVX512 copy variant. Not particularly efficient yet, might be missing sth
-rw-r--r--Makefile4
-rw-r--r--mbw.c277
2 files changed, 276 insertions, 5 deletions
diff --git a/Makefile b/Makefile
index 139bb69..83f3f7d 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,10 @@ ifdef numa
EXTRA_LIBS += -lnuma
endif
+ifdef avx512
+ EXTRA_CFLAGS += -DHAVE_AVX512
+endif
+
mbw: mbw.c
gcc -Wall -Wextra -pedantic -O3 -march=native ${EXTRA_CFLAGS} -o mbw mbw.c ${EXTRA_LIBS}
diff --git a/mbw.c b/mbw.c
index 25cb37d..85eff8a 100644
--- a/mbw.c
+++ b/mbw.c
@@ -13,6 +13,11 @@
#include <string.h>
#include <unistd.h>
+#ifdef HAVE_AVX512
+#include <stdint.h>
+#include <immintrin.h>
+#endif
+
#ifdef MULTITHREADED
#include <pthread.h>
#include <semaphore.h>
@@ -26,8 +31,8 @@
/* how many runs to average by default */
#define DEFAULT_NR_LOOPS 40
-/* we have 3 tests at the moment */
-#define MAX_TESTS 3
+/* we have 4 tests at the moment */
+#define MAX_TESTS 4
/* default block size for test 2, in bytes */
#define DEFAULT_BLOCK_SIZE 262144
@@ -36,6 +41,7 @@
#define TEST_MEMCPY 0
#define TEST_DUMB 1
#define TEST_MCBLOCK 2
+#define TEST_AVX512 3
/* version number */
#define VERSION "1.5+smaug"
@@ -85,6 +91,247 @@ struct bitmask* bitmask_a = NULL;
struct bitmask* bitmask_b = NULL;
#endif
+#ifdef HAVE_AVX512
+
+/**
+ * AVX512 implementation taken from
+ * <https://lore.kernel.org/all/1453086314-30158-4-git-send-email-zhihong.wang@intel.com/>
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+ __m128i xmm0;
+
+ xmm0 = _mm_loadu_si128((const __m128i *)src);
+ _mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+ __m256i ymm0;
+
+ ymm0 = _mm256_loadu_si256((const __m256i *)src);
+ _mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+ __m512i zmm0;
+
+ zmm0 = _mm512_loadu_si512((const void *)src);
+ _mm512_storeu_si512((void *)dst, zmm0);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+ rte_mov64(dst + 0 * 64, src + 0 * 64);
+ rte_mov64(dst + 1 * 64, src + 1 * 64);
+}
+
+/**
+ * Copy 256 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+ rte_mov64(dst + 0 * 64, src + 0 * 64);
+ rte_mov64(dst + 1 * 64, src + 1 * 64);
+ rte_mov64(dst + 2 * 64, src + 2 * 64);
+ rte_mov64(dst + 3 * 64, src + 3 * 64);
+}
+
+/**
+ * Copy 128-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+ __m512i zmm0, zmm1;
+
+ while (n >= 128) {
+ zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+ n -= 128;
+ zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+ src = src + 128;
+ _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+ _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+ dst = dst + 128;
+ }
+}
+
+/**
+ * Copy 512-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+ __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+
+ while (n >= 512) {
+ zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+ n -= 512;
+ zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+ zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
+ zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
+ zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
+ zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
+ zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
+ zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
+ src = src + 512;
+ _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+ _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+ _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
+ _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
+ _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
+ _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
+ _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
+ _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
+ dst = dst + 512;
+ }
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+ uintptr_t dstu = (uintptr_t)dst;
+ uintptr_t srcu = (uintptr_t)src;
+ void *ret = dst;
+ size_t dstofss;
+ size_t bits;
+
+ /**
+ * Copy less than 16 bytes
+ */
+ if (n < 16) {
+ if (n & 0x01) {
+ *(uint8_t *)dstu = *(const uint8_t *)srcu;
+ srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+ dstu = (uintptr_t)((uint8_t *)dstu + 1);
+ }
+ if (n & 0x02) {
+ *(uint16_t *)dstu = *(const uint16_t *)srcu;
+ srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+ dstu = (uintptr_t)((uint16_t *)dstu + 1);
+ }
+ if (n & 0x04) {
+ *(uint32_t *)dstu = *(const uint32_t *)srcu;
+ srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+ dstu = (uintptr_t)((uint32_t *)dstu + 1);
+ }
+ if (n & 0x08)
+ *(uint64_t *)dstu = *(const uint64_t *)srcu;
+ return ret;
+ }
+
+ /**
+ * Fast way when copy size doesn't exceed 512 bytes
+ */
+ if (n <= 32) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov16((uint8_t *)dst - 16 + n,
+ (const uint8_t *)src - 16 + n);
+ return ret;
+ }
+ if (n <= 64) {
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov32((uint8_t *)dst - 32 + n,
+ (const uint8_t *)src - 32 + n);
+ return ret;
+ }
+ if (n <= 512) {
+ if (n >= 256) {
+ n -= 256;
+ rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+ src = (const uint8_t *)src + 256;
+ dst = (uint8_t *)dst + 256;
+ }
+ if (n >= 128) {
+ n -= 128;
+ rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+ src = (const uint8_t *)src + 128;
+ dst = (uint8_t *)dst + 128;
+ }
+COPY_BLOCK_128_BACK63:
+ if (n > 64) {
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov64((uint8_t *)dst - 64 + n,
+ (const uint8_t *)src - 64 + n);
+ return ret;
+ }
+ if (n > 0)
+ rte_mov64((uint8_t *)dst - 64 + n,
+ (const uint8_t *)src - 64 + n);
+ return ret;
+ }
+
+ /**
+ * Make store aligned when copy size exceeds 512 bytes
+ */
+ dstofss = ((uintptr_t)dst & 0x3F);
+ if (dstofss > 0) {
+ dstofss = 64 - dstofss;
+ n -= dstofss;
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ src = (const uint8_t *)src + dstofss;
+ dst = (uint8_t *)dst + dstofss;
+ }
+
+ /**
+ * Copy 512-byte blocks.
+ * Use copy block function for better instruction order control,
+ * which is important when load is unaligned.
+ */
+ rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
+ bits = n;
+ n = n & 511;
+ bits -= n;
+ src = (const uint8_t *)src + bits;
+ dst = (uint8_t *)dst + bits;
+
+ /**
+ * Copy 128-byte blocks.
+ * Use copy block function for better instruction order control,
+ * which is important when load is unaligned.
+ */
+ if (n >= 128) {
+ rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+ bits = n;
+ n = n & 127;
+ bits -= n;
+ src = (const uint8_t *)src + bits;
+ dst = (uint8_t *)dst + bits;
+ }
+
+ /**
+ * Copy whatever left
+ */
+ goto COPY_BLOCK_128_BACK63;
+}
+#endif
+
void usage()
{
printf("mbw memory benchmark v%s, https://github.com/raas/mbw\n", VERSION);
@@ -95,6 +342,9 @@ void usage()
printf(" -t%d: memcpy test\n", TEST_MEMCPY);
printf(" -t%d: dumb (b[i]=a[i] style) test\n", TEST_DUMB);
printf(" -t%d: memcpy test with fixed block size\n", TEST_MCBLOCK);
+#ifdef HAVE_AVX512
+ printf(" -t%d: AVX512 copy test\n", TEST_AVX512);
+#endif
printf(" -b <size>: block size in bytes for -t2 (default: %d)\n", DEFAULT_BLOCK_SIZE);
printf(" -q: quiet (print statistics only)\n");
#ifdef NUMA
@@ -117,7 +367,11 @@ long *make_array()
unsigned int long_size=sizeof(long);
long *a;
+#ifdef HAVE_AVX512
+ a=aligned_alloc(64, arr_size * long_size);
+#else
a=calloc(arr_size, long_size);
+#endif
if(NULL==a) {
perror("Error allocating memory");
@@ -163,6 +417,10 @@ void *thread_worker(void *arg)
for(t=dumb_start; t<dumb_stop; t++) {
arr_b[t]=arr_a[t];
}
+#ifdef HAVE_AVX512
+ } else if(test_type==TEST_AVX512) {
+ rte_memcpy(arr_b, arr_a, array_bytes);
+#endif // HAVE_AVX512
}
if (sem_post(&stop_sem) != 0) {
err(1, "sem_post(stop_sem)");
@@ -241,8 +499,14 @@ double worker()
arr_b[t]=arr_a[t];
}
clock_gettime(CLOCK_MONOTONIC, &endtime);
+#ifdef HAVE_AVX512
+ } else if(test_type==TEST_AVX512) {
+ clock_gettime(CLOCK_MONOTONIC, &starttime);
+ rte_memcpy(arr_b, arr_a, array_bytes);
+ clock_gettime(CLOCK_MONOTONIC, &endtime);
+#endif // HAVE_AVX512
}
-#endif
+#endif // MULTITHREADED
te=((double)(endtime.tv_sec*1000000000-starttime.tv_sec*1000000000+endtime.tv_nsec-starttime.tv_nsec))/1000000000;
@@ -297,6 +561,7 @@ int main(int argc, char **argv)
tests[0]=0;
tests[1]=0;
tests[2]=0;
+ tests[3]=0;
while((o=getopt(argc, argv, "ha:b:c:qn:N:t:B:")) != EOF) {
switch(o) {
@@ -347,13 +612,13 @@ int main(int argc, char **argv)
}
/* default is to run all tests if no specific tests were requested */
- if( (tests[0]+tests[1]+tests[2]) == 0) {
+ if( (tests[0]+tests[1]+tests[2]+tests[3]) == 0) {
tests[0]=1;
tests[1]=1;
tests[2]=1;
}
- if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]) != 1) ) {
+ if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]) != 1) ) {
printf("Error: nr_loops can be zero if only one test selected!\n");
exit(1);
}
@@ -478,6 +743,8 @@ int main(int argc, char **argv)
printf("[::] copy");
} else if (test_type == TEST_MCBLOCK) {
printf("[::] mcblock");
+ } else if (test_type == TEST_AVX512) {
+ printf("[::] copy-avx512");
}
printf(" | block_size_B=%llu array_size_B=%llu ", block_size, arr_size*long_size);
#ifdef MULTITHREADED