/*
 * vim: ai ts=4 sts=4 sw=4 cinoptions=>4 expandtab
 */
#define _GNU_SOURCE

/*
 * NOTE(review): the header name after every #include below is missing in
 * this copy of the file. The directives are kept exactly as found; restore
 * the header names from the upstream source before building.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* extra headers, only when built with AVX512 copy support */
#ifdef HAVE_AVX512
#include
#include
#endif

/* extra headers, only when built with worker-thread support */
#ifdef MULTITHREADED
#include
#include
#endif

/* extra headers, only when built with NUMA placement support */
#ifdef NUMA
#include
#include
#endif

/* how many runs to average by default */
#define DEFAULT_NR_LOOPS 40

/* default block size for test 2, in bytes */
#define DEFAULT_BLOCK_SIZE 262144

/*
 * test types
 *
 * These numbers are what usage() prints after "-t" for each test.
 */
#define TEST_MEMCPY 0
#define TEST_PLAIN 1
#define TEST_MCBLOCK 2
#define TEST_AVX512 3
#define TEST_READ_PLAIN 4
#define TEST_WRITE_PLAIN 5
/* number of test types; main() reports valid test numbers as 0..MAX_TESTS-1 */
#define MAX_TESTS 6

/* version number (printed by usage()) */
#define VERSION "1.5+smaug"

/*
 * MBW memory bandwidth benchmark
 *
 * 2006, 2012 Andras.Horvath@gmail.com
 * 2013 j.m.slocum@gmail.com
 * (Special thanks to Stephen Pasich)
 *
 * http://github.com/raas/mbw
 *
 * compile with:
 *          gcc -O -o mbw mbw.c
 *
 * run with eg.:
 *
 *          ./mbw 300
 *
 * or './mbw -h' for help
 *
 * watch out for swap usage (or turn off swap)
 *
 * Optional parts of this file are compiled in with the preprocessor
 * symbols HAVE_AVX512, MULTITHREADED and NUMA.
 */

#ifdef MULTITHREADED
/* number of worker threads; main() creates this many with pthread_create() */
unsigned long num_threads = 1;
/*
 * NOTE(review): presumably a flag telling the workers to finish; its use is
 * not visible in this part of the file - confirm.
 */
volatile unsigned int done = 0;
/* thread handles, allocated in main() with calloc(num_threads, ...) */
pthread_t *threads;
/* semaphores; each is initialised in main() with sem_init(..., 0, 0) */
sem_t start_sem, stop_sem, sync_sem;
#endif

long *arr_a, *arr_b; /* the two arrays to be copied from/to */
unsigned long long arr_size=0; /* array size (elements in array) */
/* test currently being run; one of the TEST_* values above */
unsigned int test_type;
/* fixed memcpy block size for -t2 */
unsigned long long block_size=DEFAULT_BLOCK_SIZE;

#ifdef NUMA
/*
 * One-element argument/result arrays for move_pages(): main() stores an
 * array address in mp_pages[0], calls move_pages() with a NULL node list and
 * copies mp_status[0] into numa_node_a / numa_node_b when the call succeeds.
 */
void* mp_pages[1];
int mp_status[1];
/* NOTE(review): no use of mp_nodes is visible in this part of the file */
int mp_nodes[1];
/* node reported for arr_a / arr_b by move_pages(); -1 until set */
int numa_node_a = -1;
int numa_node_b = -1;
/* node passed to numa_run_on_node() in main(); -1 means no node was set */
int numa_node_cpu = -1;
/*
 * Memory-binding masks: when non-NULL, main() passes them to
 * numa_set_membind() before allocating arr_a / arr_b and then frees them.
 */
struct bitmask* bitmask_a = NULL;
struct bitmask* bitmask_b = NULL;
#endif

#ifdef HAVE_AVX512
/**
 * AVX512 implementation taken from
 *
 * NOTE(review): the source reference that followed "taken from" is missing
 * in this copy of the file; the rte_ prefix suggests DPDK's rte_memcpy -
 * confirm and restore the reference.
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 *
 * Implemented as one unaligned 128-bit load (_mm_loadu_si128) followed by
 * one unaligned 128-bit store (_mm_storeu_si128).
 *
 * @param dst  destination; 16 bytes are written starting here
 * @param src  source; 16 bytes are read starting here
 */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
    __m128i xmm0;

    xmm0 = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
*/
/*
 * rte_mov32: one unaligned 256-bit load (_mm256_loadu_si256) followed by one
 * unaligned 256-bit store (_mm256_storeu_si256).
 *
 * dst: destination, 32 bytes are written starting here
 * src: source, 32 bytes are read starting here
 */
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
    __m256i ymm0;

    ymm0 = _mm256_loadu_si256((const __m256i *)src);
    _mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 *
 * Implemented as one unaligned 512-bit load (_mm512_loadu_si512) followed by
 * one unaligned 512-bit store (_mm512_storeu_si512).
 *
 * @param dst  destination; 64 bytes are written starting here
 * @param src  source; 64 bytes are read starting here
 */
static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
    __m512i zmm0;

    zmm0 = _mm512_loadu_si512((const void *)src);
    _mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 *
 * Done as two consecutive 64-byte rte_mov64() copies.
 *
 * @param dst  destination; 128 bytes are written starting here
 * @param src  source; 128 bytes are read starting here
 */
static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
    rte_mov64(dst + 0 * 64, src + 0 * 64);
    rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 *
 * Done as four consecutive 64-byte rte_mov64() copies.
 *
 * @param dst  destination; 256 bytes are written starting here
 * @param src  source; 256 bytes are read starting here
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
    rte_mov64(dst + 0 * 64, src + 0 * 64);
    rte_mov64(dst + 1 * 64, src + 1 * 64);
    rte_mov64(dst + 2 * 64, src + 2 * 64);
    rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 *
 * Copies as many whole 128-byte blocks as fit in n and then stops; the
 * remaining (n modulo 128) bytes are NOT copied here. dst, src and n are
 * passed by value, so the caller has to advance its own pointers and reduce
 * its own count afterwards.
 *
 * @param dst  destination of the first block
 * @param src  source of the first block
 * @param n    number of bytes available; only whole 128-byte blocks are copied
 */
static inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m512i zmm0, zmm1;

    while (n >= 128) {
        /* both loads of a block are issued before either store */
        zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
        n -= 128;
        zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
        src = src + 128;
        _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
        _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
        dst = dst + 128;
    }
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
*/
/*
 * rte_mov512blocks: copies as many whole 512-byte blocks as fit in n and
 * then stops; the remaining (n modulo 512) bytes are NOT copied here. dst,
 * src and n are passed by value, so the caller has to advance its own
 * pointers and reduce its own count afterwards.
 *
 * dst: destination of the first block
 * src: source of the first block
 * n:   number of bytes available; only whole 512-byte blocks are copied
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

    while (n >= 512) {
        /* all eight loads of a block are issued before the first store */
        zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
        n -= 512;
        zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
        zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
        zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
        zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
        zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
        zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
        zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
        src = src + 512;
        _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
        _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
        _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
        _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
        _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
        _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
        _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
        _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
        dst = dst + 512;
    }
}

/**
 * Copy n bytes from src to dst using the fixed-size helpers above.
 *
 * The size decides the strategy:
 *   - n < 16:         scalar stores of 1, 2, 4 and 8 bytes, one per set bit of n
 *   - 16 <= n <= 64:  two copies (first and last 16 or 32 bytes) that may
 *                     overlap in the middle
 *   - 64 < n <= 512:  256- and 128-byte copies, then the shared tail code
 *   - n > 512:        optional head copy so that dst becomes 64-byte aligned,
 *                     then 512-byte blocks, 128-byte blocks, and the same
 *                     shared tail code (reached with goto)
 *
 * @param dst  destination buffer
 * @param src  source buffer
 * @param n    number of bytes to copy
 * @return     the original value of dst
 */
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
    uintptr_t dstu = (uintptr_t)dst;
    uintptr_t srcu = (uintptr_t)src;
    void *ret = dst; /* dst is advanced below, so remember the start */
    size_t dstofss;
    size_t bits;

    /**
     * Copy less than 16 bytes
     *
     * Each set bit of n selects one scalar copy of that many bytes.
     */
    if (n < 16) {
        if (n & 0x01) {
            *(uint8_t *)dstu = *(const uint8_t *)srcu;
            srcu = (uintptr_t)((const uint8_t *)srcu + 1);
            dstu = (uintptr_t)((uint8_t *)dstu + 1);
        }
        if (n & 0x02) {
            *(uint16_t *)dstu = *(const uint16_t *)srcu;
            srcu = (uintptr_t)((const uint16_t *)srcu + 1);
            dstu = (uintptr_t)((uint16_t *)dstu + 1);
        }
        if (n & 0x04) {
            *(uint32_t *)dstu = *(const uint32_t *)srcu;
            srcu = (uintptr_t)((const uint32_t *)srcu + 1);
            dstu = (uintptr_t)((uint32_t *)dstu + 1);
        }
        if (n & 0x08)
            *(uint64_t *)dstu = *(const uint64_t *)srcu;
        return ret;
    }

    /**
     * Fast way when copy size doesn't exceed 512 bytes
     */
    if (n <= 32) {
        /* first 16 bytes, then the last 16 bytes (the two may overlap) */
        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 64) {
        /* first 32 bytes, then the last 32 bytes (the two may overlap) */
        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
        rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
        return ret;
    }
    if (n <= 512) {
        if (n >= 256) {
            n -= 256;
            rte_mov256((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 256;
            dst = (uint8_t *)dst + 256;
        }
        if (n >= 128) {
            n -= 128;
            rte_mov128((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 128;
            dst = (uint8_t *)dst + 128;
        }
/*
 * Shared tail: reached by falling through from above or by the goto at the
 * end of the function; on both paths fewer than 128 bytes remain.
 */
COPY_BLOCK_128_BACK63:
        if (n > 64) {
            /* first 64 bytes, then the last 64 bytes (the two may overlap) */
            rte_mov64((uint8_t *)dst, (const uint8_t *)src);
            rte_mov64((uint8_t *)dst - 64 + n, (const uint8_t *)src - 64 + n);
            return ret;
        }
        /*
         * 1..64 bytes left: copy the 64 bytes that END at the end of the
         * buffer. The copy starts before the current dst/src and so rewrites
         * bytes that an earlier step already copied.
         */
        if (n > 0)
            rte_mov64((uint8_t *)dst - 64 + n, (const uint8_t *)src - 64 + n);
        return ret;
    }

    /**
     * Make store aligned when copy size exceeds 512 bytes
     *
     * If dst is not on a 64-byte boundary, copy 64 bytes from the start and
     * advance both pointers only up to the next 64-byte boundary of dst; the
     * block copies that follow rewrite the overlapping part.
     */
    dstofss = ((uintptr_t)dst & 0x3F);
    if (dstofss > 0) {
        dstofss = 64 - dstofss;
        n -= dstofss;
        rte_mov64((uint8_t *)dst, (const uint8_t *)src);
        src = (const uint8_t *)src + dstofss;
        dst = (uint8_t *)dst + dstofss;
    }

    /**
     * Copy 512-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
    /* the helper took its arguments by value: skip what it copied */
    bits = n;
    n = n & 511;
    bits -= n;
    src = (const uint8_t *)src + bits;
    dst = (uint8_t *)dst + bits;

    /**
     * Copy 128-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    if (n >= 128) {
        rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
        /* again skip what the helper copied */
        bits = n;
        n = n & 127;
        bits -= n;
        src = (const uint8_t *)src + bits;
        dst = (uint8_t *)dst + bits;
    }

    /**
     * Copy whatever left
     */
    goto COPY_BLOCK_128_BACK63;
}
#endif

/*
 * Print the help text to standard output with printf().
 *
 * The AVX512 test line is printed only when built with HAVE_AVX512 and the
 * three NUMA option lines only when built with NUMA.
 *
 * NOTE(review): several option lines look as if an argument placeholder was
 * lost ("-b : ...", "-a : ...", "-c : ..."), and the NUMA lines reuse the
 * letters -a and -b that are already listed above with another meaning. The
 * option parser visible later in the file handles the block size under 'B'.
 * "NUME" in the last NUMA line looks like a typo for "NUMA". Confirm against
 * the option parsing code and fix the strings.
 */
void usage()
{
    printf("mbw memory benchmark v%s, https://github.com/raas/mbw\n", VERSION);
    printf("Usage: mbw [options] array_size_in_MiB\n");
    printf("Options:\n");
    printf(" -n: number of runs per test (0 to run forever)\n");
    printf(" -a: Don't display average\n");
    printf(" -t%d: memcpy test\n", TEST_MEMCPY);
    printf(" -t%d: plain (b[i]=a[i] style) test\n", TEST_PLAIN);
    printf(" -t%d: memcpy test with fixed block size\n", TEST_MCBLOCK);
#ifdef HAVE_AVX512
    printf(" -t%d: AVX512 copy test\n", TEST_AVX512);
#endif
    printf(" -t%d: plain read test\n", TEST_READ_PLAIN);
    printf(" -t%d: plain write test\n", TEST_WRITE_PLAIN);
    printf(" -b : block size in bytes for -t2 (default: %d)\n", DEFAULT_BLOCK_SIZE);
    printf(" -q: quiet (print statistics only)\n");
#ifdef NUMA
    printf(" -a : allocate source array on NUMA node\n");
    printf(" -b : allocate target array on NUMA node\n");
    printf(" -c : schedule task/threads on NUME node\n");
#endif
    printf("(will then use two arrays, watch out for swapping)\n");
    printf("'Bandwidth' is amount of data copied over the time this operation took.\n");
    printf("\nThe default is to run all tests available.\n");
}

/* ------------------------------------------------------ */

/* allocate a test array and fill it with data
 * so as to force Linux to _really_ allocate it
 *
 * The array holds arr_size elements of type long. With HAVE_AVX512 it is
 * obtained from aligned_alloc() with 64-byte alignment, otherwise from
 * calloc(). If the allocation fails, an error is reported with perror() and
 * the program exits with status 1.
 */
long *make_array()
{
    unsigned long long t;
    unsigned int long_size=sizeof(long);
    long *a;

#ifdef HAVE_AVX512
    a=aligned_alloc(64, arr_size * long_size);
#else
    a=calloc(arr_size, long_size);
#endif

    if(NULL==a) {
        perror("Error allocating memory");
        exit(1);
    }

    /* make sure both arrays are allocated, fill with pattern */
    /*
     * NOTE(review): from the loop header below onward the text in this copy
     * of the file is damaged (part of the loop condition and everything up
     * to the middle of a later block-copy loop appears to be missing), so
     * the end of make_array() and the start of the following function cannot
     * be read. The tokens are kept exactly as found; restore this region
     * from the upstream source.
     */
    for(t=0; t= block_size; t-=block_size, src+=block_size){
        dst=(char *) memcpy(dst, src, block_size) + block_size;
    }
    if(t) {
        dst=(char *)
memcpy(dst, src, t) + t; } } else if(test_type==TEST_PLAIN) { /* plain test */ for(t=plain_start; t= block_size; t-=block_size, src+=block_size){ dst=(char *) memcpy(dst, src, block_size) + block_size; } if(t) { dst=(char *) memcpy(dst, src, t) + t; } clock_gettime(CLOCK_MONOTONIC, &endtime); } else if(test_type==TEST_PLAIN) { /* plain test */ clock_gettime(CLOCK_MONOTONIC, &starttime); for(t=0; tMAX_TESTS-1) { printf("Error: test number must be between 0 and %d\n", MAX_TESTS-1); exit(1); } tests[testno]=1; break; case 'B': /* block size in bytes*/ block_size=strtoull(optarg, (char **)NULL, 10); if(0>=block_size) { printf("Error: what block size do you mean?\n"); exit(1); } break; case 'q': /* quiet */ quiet=1; break; default: break; } } #ifndef HAVE_AVX512 if (tests[TEST_AVX512]) { printf("Error: AVX512 memcpy requested, but this mbw build has been compiled without AVX512 support\n"); exit(1); } #endif /* default is to run most tests if no specific tests were requested */ if( (tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]) == 0) { tests[0]=1; tests[1]=1; tests[2]=1; tests[4]=1; tests[5]=1; } if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]) != 1) ) { printf("Error: nr_loops can be zero if only one test selected!\n"); exit(1); } if(optind=mt) { printf("Error: array size wrong!\n"); exit(1); } /* ------------------------------------------------------ */ long_size=sizeof(long); /* the size of long on this platform */ arr_size=1024*1024/long_size*mt; /* how many longs then in one array? */ if(arr_size*long_size < block_size) { printf("Error: array size larger than block size (%llu bytes)!\n", block_size); exit(1); } if(!quiet) { printf("Long uses %d bytes. 
", long_size); if(tests[2]) { printf("Using %lld bytes as blocks for memcpy block copy test.\n", block_size); } } #ifdef NUMA struct bitmask *bitmask_all = numa_allocate_nodemask(); numa_bitmask_setall(bitmask_all); if (bitmask_a) { numa_set_membind(bitmask_a); numa_free_nodemask(bitmask_a); } #endif if (tests[TEST_MEMCPY]+tests[TEST_PLAIN]+tests[TEST_MCBLOCK]+tests[TEST_AVX512]+tests[TEST_READ_PLAIN]) { if (!quiet) { printf("Allocating %lld elements = %lld MiB of input memory.\n", arr_size, arr_size*long_size / 1024 / 1024); } arr_a=make_array(); } #ifdef NUMA if (bitmask_b) { numa_set_membind(bitmask_b); numa_free_nodemask(bitmask_b); } #endif if (tests[TEST_MEMCPY]+tests[TEST_PLAIN]+tests[TEST_MCBLOCK]+tests[TEST_AVX512]+tests[TEST_WRITE_PLAIN]) { if (!quiet) { printf("Allocating %lld elements = %lld MiB of output memory.\n", arr_size, arr_size*long_size / 1024 / 1024); } arr_b=make_array(); } #ifdef NUMA numa_set_membind(bitmask_all); numa_free_nodemask(bitmask_all); #endif #ifdef NUMA mp_pages[0] = arr_a; if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { perror("move_pages(arr_a)"); } else if (mp_status[0] < 0) { printf("move_pages error: %d", mp_status[0]); } else { numa_node_a = mp_status[0]; } mp_pages[0] = arr_b; if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) { perror("move_pages(arr_b)"); } else if (mp_status[0] < 0) { printf("move_pages error: %d", mp_status[0]); } else { numa_node_b = mp_status[0]; } if (numa_node_cpu != -1) { if (numa_run_on_node(numa_node_cpu) == -1) { perror("numa_run_on_node"); numa_node_cpu = -1; } } #endif /* ------------------------------------------------------ */ if(!quiet) { printf("Getting down to business... 
Doing %d runs per test.\n", nr_loops); } #ifdef MULTITHREADED if (sem_init(&start_sem, 0, 0) != 0) { err(1, "sem_init"); } if (sem_init(&stop_sem, 0, 0) != 0) { err(1, "sem_init"); } if (sem_init(&sync_sem, 0, 0) != 0) { err(1, "sem_init"); } threads = calloc(num_threads, sizeof(pthread_t)); for (i=0; i < num_threads; i++) { if (pthread_create(&threads[i], NULL, thread_worker, (void*)(unsigned long)i) != 0) { err(1, "pthread_create"); } } #endif /* run all tests requested, the proper number of times */ for(test_type=0; test_type