6 files changed, 304 insertions, 76 deletions
diff --git a/Makefile b/Makefile
index 83f3f7d..750d35f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,20 +1,35 @@
 EXTRA_CFLAGS =
 EXTRA_LIBS =
-ifdef pthread
+
+native ?= 1
+pthread ?= 0
+numa ?= 0
+avx512 ?= 0
+debug ?= 0
+
+ifeq (${native}, 1)
+	EXTRA_CFLAGS += -march=native
+endif
+
+ifeq (${pthread}, 1)
 	EXTRA_CFLAGS += -DMULTITHREADED -pthread
 endif
 
-ifdef numa
+ifeq (${numa}, 1)
 	EXTRA_CFLAGS += -DNUMA
 	EXTRA_LIBS += -lnuma
 endif
 
-ifdef avx512
+ifeq (${avx512}, 1)
 	EXTRA_CFLAGS += -DHAVE_AVX512
 endif
 
+ifeq (${debug}, 1)
+	EXTRA_CFLAGS += -ggdb
+endif
+
 mbw: mbw.c
-	gcc -Wall -Wextra -pedantic -O3 -march=native ${EXTRA_CFLAGS} -o mbw mbw.c ${EXTRA_LIBS}
+	gcc -Wall -Wextra -pedantic -O3 ${EXTRA_CFLAGS} -o mbw mbw.c ${EXTRA_LIBS}
 
 .PHONY: clean
 clean:
diff --git a/benchmark-scripts/milos-copy.sh b/benchmark-scripts/milos-copy.sh
new file mode 100755
index 0000000..81c7764
--- /dev/null
+++ b/benchmark-scripts/milos-copy.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+mkdir -p log/${HOST}
+
+make -B numa=1 pthread=1 avx512=1
+
+fn=log/${HOST}/copy-memcpy
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t0 4096 \
+	::: ram_in $(seq 0 17) \
+	::: ram_out $(seq 0 17) \
+	::: cpu $(seq 0 7) \
+	::: nr_threads 1 2 4 6 8 10 12 14 16 \
+>> ${fn}.txt
+
+fn=log/${HOST}/copy-avx512
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t3 4096 \
+	::: ram_in $(seq 0 17) \
+	::: ram_out $(seq 0 17) \
+	::: cpu $(seq 0 7) \
+	::: nr_threads 1 2 4 6 8 10 12 14 16 \
+>> ${fn}.txt
diff --git a/benchmark-scripts/milos-read.sh b/benchmark-scripts/milos-read.sh
new file mode 100755
index 0000000..1f77cfe
--- /dev/null
+++ b/benchmark-scripts/milos-read.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+mkdir -p log/${HOST}
+
+make -B numa=1 pthread=1 avx512=1
+
+fn=log/${HOST}/read-64bit
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t4 4096 \
+	::: ram_in $(seq 0 17) \
+	:::+ ram_out $(seq 0 17) \
+	::: cpu $(seq 0 7) \
+	::: nr_threads $(seq 1 16) \
+>> ${fn}.txt
+
+fn=log/${HOST}/read-avx512
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t6 4096 \
+	::: ram_in $(seq 0 17) \
+	:::+ ram_out $(seq 0 17) \
+	::: cpu $(seq 0 7) \
+	::: nr_threads $(seq 1 16) \
+>> ${fn}.txt
diff --git a/benchmark-scripts/milos-write.sh b/benchmark-scripts/milos-write.sh
new file mode 100755
index 0000000..ba50133
--- /dev/null
+++ b/benchmark-scripts/milos-write.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+mkdir -p log/${HOST}
+
+make -B numa=1 pthread=1 avx512=1
+
+fn=log/${HOST}/write-64bit
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t5 4096 \
+	::: ram_out $(seq 0 17) \
+	:::+ ram_in $(seq 0 17) \
+	::: cpu $(seq 0 7) \
+	::: nr_threads $(seq 1 16) \
+>> ${fn}.txt
+
+fn=log/${HOST}/write-avx512
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+	./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t7 4096 \
+	::: ram_out $(seq 0 17) \
+	:::+ ram_in $(seq 0 17) \
+	::: cpu $(seq 0 7) \
+	::: nr_threads $(seq 1 16) \
+>> ${fn}.txt
diff --git a/mbw.c b/mbw.c
index 991c37a..931b133 100644
--- a/mbw.c
+++ b/mbw.c
@@ -3,6 +3,7 @@
  */
 #define _GNU_SOURCE
 
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -41,7 +42,9 @@
 #define TEST_AVX512 3
 #define TEST_READ_PLAIN 4
 #define TEST_WRITE_PLAIN 5
-#define MAX_TESTS 6
+#define TEST_READ_AVX512 6
+#define TEST_WRITE_AVX512 7
+#define MAX_TESTS 8
 
 /* version number */
 #define VERSION "1.5+smaug"
@@ -74,12 +77,17 @@ pthread_t *threads;
 sem_t start_sem, stop_sem, sync_sem;
 #endif
 
-long *arr_a, *arr_b; /* the two arrays to be copied from/to */
+long *arr_a = NULL;
+long *arr_b = NULL; /* the two arrays to be copied from/to */
 unsigned long long arr_size=0; /* array size (elements in array) */
 unsigned int test_type;
 /* fixed memcpy block size for -t2 */
 unsigned long long block_size=DEFAULT_BLOCK_SIZE;
 
+int sanity_check = 0;
+long arr_a_sum = 0;
+long *partial_sum;
+
 #ifdef NUMA
 void* mp_pages[1];
 int mp_status[1];
@@ -189,27 +197,27 @@ static inline void
 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
-
-	while (n >= 512) {
-		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
-		n -= 512;
-		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
-		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
-		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
-		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
-		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
-		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
-		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
-		src = src + 512;
-		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
-		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
-		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
-		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
-		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
-		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
-		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
-		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
-		dst = dst + 512;
+	const uint8_t *end = src + n;
+
+	while (src < end) {
+		zmm0 = _mm512_load_si512((const void *)(src + 0 * 64));
+		zmm1 = _mm512_load_si512((const void *)(src + 1 * 64));
+		zmm2 = _mm512_load_si512((const void *)(src + 2 * 64));
+		zmm3 = _mm512_load_si512((const void *)(src + 3 * 64));
+		zmm4 = _mm512_load_si512((const void *)(src + 4 * 64));
+		zmm5 = _mm512_load_si512((const void *)(src + 5 * 64));
+		zmm6 = _mm512_load_si512((const void *)(src + 6 * 64));
+		zmm7 = _mm512_load_si512((const void *)(src + 7 * 64));
+		_mm512_store_si512((void *)(dst + 0 * 64), zmm0);
+		_mm512_store_si512((void *)(dst + 1 * 64), zmm1);
+		_mm512_store_si512((void *)(dst + 2 * 64), zmm2);
+		_mm512_store_si512((void *)(dst + 3 * 64), zmm3);
+		_mm512_store_si512((void *)(dst + 4 * 64), zmm4);
+		_mm512_store_si512((void *)(dst + 5 * 64), zmm5);
+		_mm512_store_si512((void *)(dst + 6 * 64), zmm6);
+		_mm512_store_si512((void *)(dst + 7 * 64), zmm7);
+		src += 512;
+		dst += 512;
 	}
 }
 
@@ -339,14 +347,19 @@ void usage()
     printf("Options:\n");
     printf("	-n: number of runs per test (0 to run forever)\n");
     printf("	-a: Don't display average\n");
+    printf("	-C: enable sanity checks\n");
     printf("	-t%d: memcpy test\n", TEST_MEMCPY);
     printf("	-t%d: plain (b[i]=a[i] style) test\n", TEST_PLAIN);
     printf("	-t%d: memcpy test with fixed block size\n", TEST_MCBLOCK);
 #ifdef HAVE_AVX512
     printf("	-t%d: AVX512 copy test\n", TEST_AVX512);
 #endif
-    printf("	-t%d: plain read test\n", TEST_READ_PLAIN);
-    printf("	-t%d: plain write test\n", TEST_WRITE_PLAIN);
+    printf("	-t%d: plain read test (sum)\n", TEST_READ_PLAIN);
+    printf("	-t%d: plain write test (const fill)\n", TEST_WRITE_PLAIN);
+#ifdef HAVE_AVX512
+    printf("	-t%d: AVX512 read test (sum)\n", TEST_READ_AVX512);
+    printf("	-t%d: AVX512 write test (const fill)\n", TEST_WRITE_AVX512);
+#endif
     printf("	-b <size>: block size in bytes for -t2 (default: %d)\n", DEFAULT_BLOCK_SIZE);
     printf("	-q: quiet (print statistics only)\n");
 #ifdef NUMA
@@ -363,7 +376,7 @@ void usage()
 
 /* allocate a test array and fill it with data
  * so as to force Linux to _really_ allocate it */
-long *make_array()
+long *make_array(long *sum)
 {
     unsigned long long t;
     unsigned int long_size=sizeof(long);
@@ -384,6 +397,12 @@ long *make_array()
     for(t=0; t<arr_size; t++) {
         a[t]=0xaa;
     }
+    if (sum != NULL) {
+        *sum = 0;
+        for(t=0; t<arr_size; t++) {
+            *sum += 0xaa;
+        }
+    }
     return a;
 }
 
@@ -426,14 +445,42 @@ void *thread_worker(void *arg)
         } else if(test_type==TEST_READ_PLAIN) {
             long tmp = 0;
             for(t=plain_start; t<plain_stop; t++) {
-                tmp ^= arr_a[t];
+                tmp += arr_a[t];
+            }
+            if (sanity_check) {
+                partial_sum[thread_id] = tmp;
             }
-            arr_b[plain_stop-1] = tmp;
         } else if(test_type==TEST_WRITE_PLAIN) {
-            long tmp = 0;
+            long tmp = 1374181804651713298;
             for(t=plain_start; t<plain_stop; t++) {
-                arr_b[t] = ++tmp;
+                arr_b[t] = tmp;
+            }
+#ifdef HAVE_AVX512
+        } else if(test_type==TEST_READ_AVX512) {
+            __m512i zmm0 = _mm512_setzero_epi32();
+            __m512i zmm1;
+            uint8_t *src = (uint8_t*)(arr_a + (plain_start & ~0x0000000000000007));
+            const uint8_t *end = (uint8_t*)(arr_a + (plain_stop & ~0x0000000000000007));
+            long tmp = 0;
+            while (src < end) {
+                zmm1 = _mm512_load_si512((const void *)src);
+                zmm0 = _mm512_add_epi64(zmm0, zmm1);
+                src += 64;
             }
+            tmp += (long)_mm512_reduce_add_epi64(zmm0);
+            if (sanity_check) {
+                partial_sum[thread_id] = tmp;
+            }
+        } else if(test_type==TEST_WRITE_AVX512) {
+            const long src = 0x0707070707070707;
+            uint8_t *dst = (uint8_t*)(arr_b + (plain_start & ~0x0000000000000007));
+            const uint8_t *end = (uint8_t*)(arr_b + (plain_stop & ~0x0000000000000007));
+            __m512i zmm0 = _mm512_load_si512(&src);
+            while (dst < end) {
+                _mm512_store_si512((void*)(dst), zmm0);
+                dst += 64;
+            }
+#endif // HAVE_AVX512
         }
         if (sem_post(&stop_sem) != 0) {
             err(1, "sem_post(stop_sem)");
@@ -522,10 +569,12 @@ double worker()
         long tmp = 0;
         clock_gettime(CLOCK_MONOTONIC, &starttime);
         for(t=0; t<arr_size; t++) {
-            tmp ^= arr_a[t];
+            tmp += arr_a[t];
         }
         clock_gettime(CLOCK_MONOTONIC, &endtime);
-        arr_b[arr_size-1] = tmp;
+        if (sanity_check) {
+            assert(tmp == arr_a_sum);
+        }
     } else if(test_type==TEST_WRITE_PLAIN) {
         long tmp = 0;
         clock_gettime(CLOCK_MONOTONIC, &starttime);
@@ -533,8 +582,42 @@ double worker()
             arr_b[t] = ++tmp;
         }
         clock_gettime(CLOCK_MONOTONIC, &endtime);
+#ifdef HAVE_AVX512
+    } else if(test_type==TEST_READ_AVX512) {
+        __m512i zmm0 = _mm512_setzero_epi32();
+        __m512i zmm1;
+        long tmp = 0;
+        uint8_t *src = (uint8_t*)arr_a;
+        const uint8_t *end = src + arr_size * sizeof(long);
+        clock_gettime(CLOCK_MONOTONIC, &starttime);
+        while (src < end) {
+            zmm1 = _mm512_load_si512((const void *)src);
+            zmm0 = _mm512_add_epi64(zmm0, zmm1);
+            src += 64;
+        }
+        clock_gettime(CLOCK_MONOTONIC, &endtime);
+        tmp = (long)_mm512_reduce_add_epi64(zmm0);
+        if (sanity_check) {
+            if (tmp != arr_a_sum) {
+                printf("expected: arr_a_sum == %12ld (%016lx)\n", arr_a_sum, arr_a_sum);
+                printf("output:  reduce_add == %12ld (%016lx)\n", tmp, tmp);
+            }
+            assert(tmp == arr_a_sum);
+        }
+    } else if(test_type==TEST_WRITE_AVX512) {
+        const uint8_t *src = (uint8_t*)arr_b;
+        uint8_t *dst = (uint8_t*)arr_b;
+        const uint8_t *end = dst + arr_size * sizeof(long);
+        __m512i zmm0 = _mm512_load_si512(src);
+        clock_gettime(CLOCK_MONOTONIC, &starttime);
+        while (dst < end) {
+            _mm512_store_si512((void*)(dst), zmm0);
+            dst += 64;
+        }
+        clock_gettime(CLOCK_MONOTONIC, &endtime);
+#endif // HAVE_AVX512
     }
-#endif // MULTITHREADED
+#endif // !MULTITHREADED
 
     te=((double)(endtime.tv_sec*1000000000-starttime.tv_sec*1000000000+endtime.tv_nsec-starttime.tv_nsec))/1000000000;
 
@@ -590,8 +673,12 @@ int main(int argc, char **argv)
     tests[1]=0;
     tests[2]=0;
     tests[3]=0;
+    tests[4]=0;
+    tests[5]=0;
+    tests[6]=0;
+    tests[7]=0;
 
-    while((o=getopt(argc, argv, "ha:b:c:qn:N:t:B:")) != EOF) {
+    while((o=getopt(argc, argv, "ha:b:c:qn:N:t:B:C")) != EOF) {
         switch(o) {
             case 'h':
                 usage();
@@ -631,6 +718,9 @@ int main(int argc, char **argv)
                     exit(1);
                 }
                 break;
+            case 'C':
+                sanity_check = 1;
+                break;
             case 'q': /* quiet */
                 quiet=1;
                 break;
@@ -639,14 +729,36 @@ int main(int argc, char **argv)
         }
     }
 
-    /* default is to run all tests if no specific tests were requested */
-    if( (tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]) == 0) {
+#ifndef HAVE_AVX512
+    if (tests[TEST_AVX512]) {
+        printf("Error: AVX512 memcpy requested, but this mbw build has been compiled without AVX512 support\n");
+        exit(1);
+    }
+    if (tests[TEST_READ_AVX512]) {
+        printf("Error: AVX512 read requested, but this mbw build has been compiled without AVX512 support\n");
+        exit(1);
+    }
+    if (tests[TEST_WRITE_AVX512]) {
+        printf("Error: AVX512 write requested, but this mbw build has been compiled without AVX512 support\n");
+        exit(1);
+    }
+#endif
+
+    /* default is to run most tests if no specific tests were requested */
+    if( (tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]+tests[6]+tests[7]) == 0) {
         tests[0]=1;
         tests[1]=1;
         tests[2]=1;
+        tests[4]=1;
+        tests[5]=1;
+#if HAVE_AVX512
+        tests[3]=1;
+        tests[6]=1;
+        tests[7]=1;
+#endif
     }
 
-    if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]) != 1) ) {
+    if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]+tests[6]+tests[7]) != 1) ) {
         printf("Error: nr_loops can be zero if only one test selected!\n");
         exit(1);
     }
@@ -675,7 +787,6 @@ int main(int argc, char **argv)
 
     if(!quiet) {
         printf("Long uses %d bytes. ", long_size);
-        printf("Allocating 2*%lld elements = %lld bytes of memory.\n", arr_size, 2*arr_size*long_size);
         if(tests[2]) {
             printf("Using %lld bytes as blocks for memcpy block copy test.\n", block_size);
         }
@@ -689,7 +800,12 @@ int main(int argc, char **argv)
         numa_free_nodemask(bitmask_a);
     }
 #endif
-    arr_a=make_array();
+    if (tests[TEST_MEMCPY]+tests[TEST_PLAIN]+tests[TEST_MCBLOCK]+tests[TEST_AVX512]+tests[TEST_READ_PLAIN]+tests[TEST_READ_AVX512]) {
+        if (!quiet) {
+            printf("Allocating %lld elements = %lld MiB of input memory.\n", arr_size, arr_size*long_size / 1024 / 1024);
+        }
+        arr_a=make_array(&arr_a_sum);
+    }
 
 #ifdef NUMA
     if (bitmask_b) {
@@ -697,7 +813,12 @@ int main(int argc, char **argv)
         numa_free_nodemask(bitmask_b);
     }
 #endif
-    arr_b=make_array();
+    if (tests[TEST_MEMCPY]+tests[TEST_PLAIN]+tests[TEST_MCBLOCK]+tests[TEST_AVX512]+tests[TEST_WRITE_PLAIN]+tests[TEST_WRITE_AVX512]) {
+        if (!quiet) {
+            printf("Allocating %lld elements = %lld MiB of output memory.\n", arr_size, arr_size*long_size / 1024 / 1024);
+        }
+        arr_b=make_array(NULL);
+    }
 
 #ifdef NUMA
     numa_set_membind(bitmask_all);
@@ -705,26 +826,30 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef NUMA
-    mp_pages[0] = arr_a;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(arr_a)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_a = mp_status[0];
+    if (arr_a != NULL) {
+        mp_pages[0] = arr_a;
+        if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+            perror("move_pages(arr_a)");
+        }
+        else if (mp_status[0] < 0) {
+            printf("move_pages(arr_a) error: %d\n", mp_status[0]);
+        }
+        else {
+            numa_node_a = mp_status[0];
+        }
     }
 
-    mp_pages[0] = arr_b;
-    if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
-        perror("move_pages(arr_b)");
-    }
-    else if (mp_status[0] < 0) {
-        printf("move_pages error: %d", mp_status[0]);
-    }
-    else {
-        numa_node_b = mp_status[0];
+    if (arr_b != NULL) {
+        mp_pages[0] = arr_b;
+        if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+            perror("move_pages(arr_b)");
+        }
+        else if (mp_status[0] < 0) {
+            printf("move_pages(arr_b) error: %d\n", mp_status[0]);
+        }
+        else {
+            numa_node_b = mp_status[0];
+        }
     }
 
     if (numa_node_cpu != -1) {
@@ -751,7 +876,13 @@ int main(int argc, char **argv)
         err(1, "sem_init");
     }
     threads = calloc(num_threads, sizeof(pthread_t));
+    if (sanity_check) {
+        partial_sum = calloc(num_threads, sizeof(long));
+    }
     for (i=0; i < num_threads; i++) {
+        if (sanity_check) {
+            partial_sum[i] = 0;
+        }
         if (pthread_create(&threads[i], NULL, thread_worker, (void*)(unsigned long)i) != 0) {
             err(1, "pthread_create");
         }
@@ -777,6 +908,10 @@ int main(int argc, char **argv)
                     printf("[::] read");
                 } else if (test_type == TEST_WRITE_PLAIN) {
                     printf("[::] write");
+                } else if (test_type == TEST_READ_AVX512) {
+                    printf("[::] read-avx512");
+                } else if (test_type == TEST_WRITE_AVX512) {
+                    printf("[::] write-avx512");
                 }
                 printf(" | block_size_B=%llu array_size_B=%llu ", block_size, arr_size*long_size);
 #ifdef MULTITHREADED
@@ -802,6 +937,17 @@ int main(int argc, char **argv)
             err(1, "pthread_join");
         }
     }
+    if (sanity_check && (tests[TEST_READ_PLAIN] || tests[TEST_READ_AVX512])) {
+        long tmp = 0;
+        for (i=0; i < num_threads; i++) {
+            tmp += partial_sum[i];
+        }
+        if (tmp != arr_a_sum) {
+            printf("expected:  arr_a_sum == %12ld (%016lx)\n", arr_a_sum, arr_a_sum);
+            printf("output: sum(partial) == %12ld (%016lx)\n", tmp, tmp);
+        }
+        assert(tmp == arr_a_sum);
+    }
 #endif
 
     free(arr_a);
diff --git a/milos-roofline.sh b/milos-roofline.sh
deleted file mode 100755
index 092d147..0000000
--- a/milos-roofline.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/sh
-
-mkdir -p log
-fn=log/${HOST}-roofline
-
-make -B numa=1 pthread=1
-
-parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
-	./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t0 4096 \
-	::: ram_in $(seq 0 15) \
-	::: ram_out $(seq 0 15) \
-	::: cpu $(seq 0 7) \
-	::: nr_threads $(seq 1 16) \
->> ${fn}.txt