summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile23
-rwxr-xr-xbenchmark-scripts/milos-copy.sh27
-rwxr-xr-xbenchmark-scripts/milos-read.sh27
-rwxr-xr-xbenchmark-scripts/milos-write.sh27
-rw-r--r--mbw.c262
-rwxr-xr-xmilos-roofline.sh14
6 files changed, 304 insertions, 76 deletions
diff --git a/Makefile b/Makefile
index 83f3f7d..750d35f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,20 +1,35 @@
EXTRA_CFLAGS =
EXTRA_LIBS =
-ifdef pthread
+
+native ?= 1
+pthread ?= 0
+numa ?= 0
+avx512 ?= 0
+debug ?= 0
+
+ifeq (${native}, 1)
+ EXTRA_CFLAGS += -march=native
+endif
+
+ifeq (${pthread}, 1)
EXTRA_CFLAGS += -DMULTITHREADED -pthread
endif
-ifdef numa
+ifeq (${numa}, 1)
EXTRA_CFLAGS += -DNUMA
EXTRA_LIBS += -lnuma
endif
-ifdef avx512
+ifeq (${avx512}, 1)
EXTRA_CFLAGS += -DHAVE_AVX512
endif
+ifeq (${debug}, 1)
+ EXTRA_CFLAGS += -ggdb
+endif
+
mbw: mbw.c
- gcc -Wall -Wextra -pedantic -O3 -march=native ${EXTRA_CFLAGS} -o mbw mbw.c ${EXTRA_LIBS}
+ gcc -Wall -Wextra -pedantic -O3 ${EXTRA_CFLAGS} -o mbw mbw.c ${EXTRA_LIBS}
.PHONY: clean
clean:
diff --git a/benchmark-scripts/milos-copy.sh b/benchmark-scripts/milos-copy.sh
new file mode 100755
index 0000000..81c7764
--- /dev/null
+++ b/benchmark-scripts/milos-copy.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+mkdir -p log/${HOST}
+
+make -B numa=1 pthread=1 avx512=1
+
+fn=log/${HOST}/copy-memcpy
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t0 4096 \
+ ::: ram_in $(seq 0 17) \
+ ::: ram_out $(seq 0 17) \
+ ::: cpu $(seq 0 7) \
+ ::: nr_threads 1 2 4 6 8 10 12 14 16 \
+>> ${fn}.txt
+
+fn=log/${HOST}/copy-avx512
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t3 4096 \
+ ::: ram_in $(seq 0 17) \
+ ::: ram_out $(seq 0 17) \
+ ::: cpu $(seq 0 7) \
+ ::: nr_threads 1 2 4 6 8 10 12 14 16 \
+>> ${fn}.txt
diff --git a/benchmark-scripts/milos-read.sh b/benchmark-scripts/milos-read.sh
new file mode 100755
index 0000000..1f77cfe
--- /dev/null
+++ b/benchmark-scripts/milos-read.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+mkdir -p log/${HOST}
+
+make -B numa=1 pthread=1 avx512=1
+
+fn=log/${HOST}/read-64bit
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t4 4096 \
+ ::: ram_in $(seq 0 17) \
+ :::+ ram_out $(seq 0 17) \
+ ::: cpu $(seq 0 7) \
+ ::: nr_threads $(seq 1 16) \
+>> ${fn}.txt
+
+fn=log/${HOST}/read-avx512
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t6 4096 \
+ ::: ram_in $(seq 0 17) \
+ :::+ ram_out $(seq 0 17) \
+ ::: cpu $(seq 0 7) \
+ ::: nr_threads $(seq 1 16) \
+>> ${fn}.txt
diff --git a/benchmark-scripts/milos-write.sh b/benchmark-scripts/milos-write.sh
new file mode 100755
index 0000000..ba50133
--- /dev/null
+++ b/benchmark-scripts/milos-write.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+mkdir -p log/${HOST}
+
+make -B numa=1 pthread=1 avx512=1
+
+fn=log/${HOST}/write-64bit
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t5 4096 \
+ ::: ram_out $(seq 0 17) \
+ :::+ ram_in $(seq 0 17) \
+ ::: cpu $(seq 0 7) \
+ ::: nr_threads $(seq 1 16) \
+>> ${fn}.txt
+
+fn=log/${HOST}/write-avx512
+echo "\n${fn}\n"
+echo "mbw $(git describe --all --long) $(git rev-parse HEAD)" >> ${fn}.txt
+parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
+ ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t7 4096 \
+ ::: ram_out $(seq 0 17) \
+ :::+ ram_in $(seq 0 17) \
+ ::: cpu $(seq 0 7) \
+ ::: nr_threads $(seq 1 16) \
+>> ${fn}.txt
diff --git a/mbw.c b/mbw.c
index 991c37a..931b133 100644
--- a/mbw.c
+++ b/mbw.c
@@ -3,6 +3,7 @@
*/
#define _GNU_SOURCE
+#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
@@ -41,7 +42,9 @@
#define TEST_AVX512 3
#define TEST_READ_PLAIN 4
#define TEST_WRITE_PLAIN 5
-#define MAX_TESTS 6
+#define TEST_READ_AVX512 6
+#define TEST_WRITE_AVX512 7
+#define MAX_TESTS 8
/* version number */
#define VERSION "1.5+smaug"
@@ -74,12 +77,17 @@ pthread_t *threads;
sem_t start_sem, stop_sem, sync_sem;
#endif
-long *arr_a, *arr_b; /* the two arrays to be copied from/to */
+long *arr_a = NULL;
+long *arr_b = NULL; /* the two arrays to be copied from/to */
unsigned long long arr_size=0; /* array size (elements in array) */
unsigned int test_type;
/* fixed memcpy block size for -t2 */
unsigned long long block_size=DEFAULT_BLOCK_SIZE;
+int sanity_check = 0;
+long arr_a_sum = 0;
+long *partial_sum;
+
#ifdef NUMA
void* mp_pages[1];
int mp_status[1];
@@ -189,27 +197,27 @@ static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
-
- while (n >= 512) {
- zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
- n -= 512;
- zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
- zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
- zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
- zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
- zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
- zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
- zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
- src = src + 512;
- _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
- _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
- _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
- _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
- _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
- _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
- _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
- _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
- dst = dst + 512;
+ const uint8_t *end = src + n;
+
+ while (src < end) {
+ zmm0 = _mm512_load_si512((const void *)(src + 0 * 64));
+ zmm1 = _mm512_load_si512((const void *)(src + 1 * 64));
+ zmm2 = _mm512_load_si512((const void *)(src + 2 * 64));
+ zmm3 = _mm512_load_si512((const void *)(src + 3 * 64));
+ zmm4 = _mm512_load_si512((const void *)(src + 4 * 64));
+ zmm5 = _mm512_load_si512((const void *)(src + 5 * 64));
+ zmm6 = _mm512_load_si512((const void *)(src + 6 * 64));
+ zmm7 = _mm512_load_si512((const void *)(src + 7 * 64));
+ _mm512_store_si512((void *)(dst + 0 * 64), zmm0);
+ _mm512_store_si512((void *)(dst + 1 * 64), zmm1);
+ _mm512_store_si512((void *)(dst + 2 * 64), zmm2);
+ _mm512_store_si512((void *)(dst + 3 * 64), zmm3);
+ _mm512_store_si512((void *)(dst + 4 * 64), zmm4);
+ _mm512_store_si512((void *)(dst + 5 * 64), zmm5);
+ _mm512_store_si512((void *)(dst + 6 * 64), zmm6);
+ _mm512_store_si512((void *)(dst + 7 * 64), zmm7);
+ src += 512;
+ dst += 512;
}
}
@@ -339,14 +347,19 @@ void usage()
printf("Options:\n");
printf(" -n: number of runs per test (0 to run forever)\n");
printf(" -a: Don't display average\n");
+ printf(" -C: enable sanity checks\n");
printf(" -t%d: memcpy test\n", TEST_MEMCPY);
printf(" -t%d: plain (b[i]=a[i] style) test\n", TEST_PLAIN);
printf(" -t%d: memcpy test with fixed block size\n", TEST_MCBLOCK);
#ifdef HAVE_AVX512
printf(" -t%d: AVX512 copy test\n", TEST_AVX512);
#endif
- printf(" -t%d: plain read test\n", TEST_READ_PLAIN);
- printf(" -t%d: plain write test\n", TEST_WRITE_PLAIN);
+ printf(" -t%d: plain read test (sum)\n", TEST_READ_PLAIN);
+ printf(" -t%d: plain write test (const fill)\n", TEST_WRITE_PLAIN);
+#ifdef HAVE_AVX512
+ printf(" -t%d: AVX512 read test (sum)\n", TEST_READ_AVX512);
+ printf(" -t%d: AVX512 write test (const fill)\n", TEST_WRITE_AVX512);
+#endif
printf(" -b <size>: block size in bytes for -t2 (default: %d)\n", DEFAULT_BLOCK_SIZE);
printf(" -q: quiet (print statistics only)\n");
#ifdef NUMA
@@ -363,7 +376,7 @@ void usage()
/* allocate a test array and fill it with data
* so as to force Linux to _really_ allocate it */
-long *make_array()
+long *make_array(long *sum)
{
unsigned long long t;
unsigned int long_size=sizeof(long);
@@ -384,6 +397,12 @@ long *make_array()
for(t=0; t<arr_size; t++) {
a[t]=0xaa;
}
+ if (sum != NULL) {
+ *sum = 0;
+ for(t=0; t<arr_size; t++) {
+ *sum += 0xaa;
+ }
+ }
return a;
}
@@ -426,14 +445,42 @@ void *thread_worker(void *arg)
} else if(test_type==TEST_READ_PLAIN) {
long tmp = 0;
for(t=plain_start; t<plain_stop; t++) {
- tmp ^= arr_a[t];
+ tmp += arr_a[t];
+ }
+ if (sanity_check) {
+ partial_sum[thread_id] = tmp;
}
- arr_b[plain_stop-1] = tmp;
} else if(test_type==TEST_WRITE_PLAIN) {
- long tmp = 0;
+ long tmp = 1374181804651713298;
for(t=plain_start; t<plain_stop; t++) {
- arr_b[t] = ++tmp;
+ arr_b[t] = tmp;
+ }
+#ifdef HAVE_AVX512
+ } else if(test_type==TEST_READ_AVX512) {
+ __m512i zmm0 = _mm512_setzero_epi32();
+ __m512i zmm1;
+ uint8_t *src = (uint8_t*)(arr_a + (plain_start & ~0x0000000000000007));
+ const uint8_t *end = (uint8_t*)(arr_a + (plain_stop & ~0x0000000000000007));
+ long tmp = 0;
+ while (src < end) {
+ zmm1 = _mm512_load_si512((const void *)src);
+ zmm0 = _mm512_add_epi64(zmm0, zmm1);
+ src += 64;
}
+ tmp += (long)_mm512_reduce_add_epi64(zmm0);
+ if (sanity_check) {
+ partial_sum[thread_id] = tmp;
+ }
+ } else if(test_type==TEST_WRITE_AVX512) {
+ const long src = 0x0707070707070707;
+ uint8_t *dst = (uint8_t*)(arr_b + (plain_start & ~0x0000000000000007));
+ const uint8_t *end = (uint8_t*)(arr_b + (plain_stop & ~0x0000000000000007));
+ __m512i zmm0 = _mm512_load_si512(&src);
+ while (dst < end) {
+ _mm512_store_si512((void*)(dst), zmm0);
+ dst += 64;
+ }
+#endif // HAVE_AVX512
}
if (sem_post(&stop_sem) != 0) {
err(1, "sem_post(stop_sem)");
@@ -522,10 +569,12 @@ double worker()
long tmp = 0;
clock_gettime(CLOCK_MONOTONIC, &starttime);
for(t=0; t<arr_size; t++) {
- tmp ^= arr_a[t];
+ tmp += arr_a[t];
}
clock_gettime(CLOCK_MONOTONIC, &endtime);
- arr_b[arr_size-1] = tmp;
+ if (sanity_check) {
+ assert(tmp == arr_a_sum);
+ }
} else if(test_type==TEST_WRITE_PLAIN) {
long tmp = 0;
clock_gettime(CLOCK_MONOTONIC, &starttime);
@@ -533,8 +582,42 @@ double worker()
arr_b[t] = ++tmp;
}
clock_gettime(CLOCK_MONOTONIC, &endtime);
+#ifdef HAVE_AVX512
+ } else if(test_type==TEST_READ_AVX512) {
+ __m512i zmm0 = _mm512_setzero_epi32();
+ __m512i zmm1;
+ long tmp = 0;
+ uint8_t *src = (uint8_t*)arr_a;
+ const uint8_t *end = src + arr_size * sizeof(long);
+ clock_gettime(CLOCK_MONOTONIC, &starttime);
+ while (src < end) {
+ zmm1 = _mm512_load_si512((const void *)src);
+ zmm0 = _mm512_add_epi64(zmm0, zmm1);
+ src += 64;
+ }
+ clock_gettime(CLOCK_MONOTONIC, &endtime);
+ tmp = (long)_mm512_reduce_add_epi64(zmm0);
+ if (sanity_check) {
+ if (tmp != arr_a_sum) {
+ printf("expected: arr_a_sum == %12ld (%016lx)\n", arr_a_sum, arr_a_sum);
+ printf("output: reduce_add == %12ld (%016lx)\n", tmp, tmp);
+ }
+ assert(tmp == arr_a_sum);
+ }
+ } else if(test_type==TEST_WRITE_AVX512) {
+ const uint8_t *src = (uint8_t*)arr_b;
+ uint8_t *dst = (uint8_t*)arr_b;
+ const uint8_t *end = dst + arr_size * sizeof(long);
+ __m512i zmm0 = _mm512_load_si512(src);
+ clock_gettime(CLOCK_MONOTONIC, &starttime);
+ while (dst < end) {
+ _mm512_store_si512((void*)(dst), zmm0);
+ dst += 64;
+ }
+ clock_gettime(CLOCK_MONOTONIC, &endtime);
+#endif // HAVE_AVX512
}
-#endif // MULTITHREADED
+#endif // !MULTITHREADED
te=((double)(endtime.tv_sec*1000000000-starttime.tv_sec*1000000000+endtime.tv_nsec-starttime.tv_nsec))/1000000000;
@@ -590,8 +673,12 @@ int main(int argc, char **argv)
tests[1]=0;
tests[2]=0;
tests[3]=0;
+ tests[4]=0;
+ tests[5]=0;
+ tests[6]=0;
+ tests[7]=0;
- while((o=getopt(argc, argv, "ha:b:c:qn:N:t:B:")) != EOF) {
+ while((o=getopt(argc, argv, "ha:b:c:qn:N:t:B:C")) != EOF) {
switch(o) {
case 'h':
usage();
@@ -631,6 +718,9 @@ int main(int argc, char **argv)
exit(1);
}
break;
+ case 'C':
+ sanity_check = 1;
+ break;
case 'q': /* quiet */
quiet=1;
break;
@@ -639,14 +729,36 @@ int main(int argc, char **argv)
}
}
- /* default is to run all tests if no specific tests were requested */
- if( (tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]) == 0) {
+#ifndef HAVE_AVX512
+ if (tests[TEST_AVX512]) {
+ printf("Error: AVX512 memcpy requested, but this mbw build has been compiled without AVX512 support\n");
+ exit(1);
+ }
+ if (tests[TEST_READ_AVX512]) {
+ printf("Error: AVX512 read requested, but this mbw build has been compiled without AVX512 support\n");
+ exit(1);
+ }
+ if (tests[TEST_WRITE_AVX512]) {
+ printf("Error: AVX512 write requested, but this mbw build has been compiled without AVX512 support\n");
+ exit(1);
+ }
+#endif
+
+ /* default is to run most tests if no specific tests were requested */
+ if( (tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]+tests[6]+tests[7]) == 0) {
tests[0]=1;
tests[1]=1;
tests[2]=1;
+ tests[4]=1;
+ tests[5]=1;
+#if HAVE_AVX512
+ tests[3]=1;
+ tests[6]=1;
+ tests[7]=1;
+#endif
}
- if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]) != 1) ) {
+ if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]+tests[3]+tests[4]+tests[5]+tests[6]+tests[7]) != 1) ) {
printf("Error: nr_loops can be zero if only one test selected!\n");
exit(1);
}
@@ -675,7 +787,6 @@ int main(int argc, char **argv)
if(!quiet) {
printf("Long uses %d bytes. ", long_size);
- printf("Allocating 2*%lld elements = %lld bytes of memory.\n", arr_size, 2*arr_size*long_size);
if(tests[2]) {
printf("Using %lld bytes as blocks for memcpy block copy test.\n", block_size);
}
@@ -689,7 +800,12 @@ int main(int argc, char **argv)
numa_free_nodemask(bitmask_a);
}
#endif
- arr_a=make_array();
+ if (tests[TEST_MEMCPY]+tests[TEST_PLAIN]+tests[TEST_MCBLOCK]+tests[TEST_AVX512]+tests[TEST_READ_PLAIN]+tests[TEST_READ_AVX512]) {
+ if (!quiet) {
+ printf("Allocating %lld elements = %lld MiB of input memory.\n", arr_size, arr_size*long_size / 1024 / 1024);
+ }
+ arr_a=make_array(&arr_a_sum);
+ }
#ifdef NUMA
if (bitmask_b) {
@@ -697,7 +813,12 @@ int main(int argc, char **argv)
numa_free_nodemask(bitmask_b);
}
#endif
- arr_b=make_array();
+ if (tests[TEST_MEMCPY]+tests[TEST_PLAIN]+tests[TEST_MCBLOCK]+tests[TEST_AVX512]+tests[TEST_WRITE_PLAIN]+tests[TEST_WRITE_AVX512]) {
+ if (!quiet) {
+ printf("Allocating %lld elements = %lld MiB of output memory.\n", arr_size, arr_size*long_size / 1024 / 1024);
+ }
+ arr_b=make_array(NULL);
+ }
#ifdef NUMA
numa_set_membind(bitmask_all);
@@ -705,26 +826,30 @@ int main(int argc, char **argv)
#endif
#ifdef NUMA
- mp_pages[0] = arr_a;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(arr_a)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_a = mp_status[0];
+ if (arr_a != NULL) {
+ mp_pages[0] = arr_a;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(arr_a)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages(arr_a) error: %d\n", mp_status[0]);
+ }
+ else {
+ numa_node_a = mp_status[0];
+ }
}
- mp_pages[0] = arr_b;
- if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
- perror("move_pages(arr_b)");
- }
- else if (mp_status[0] < 0) {
- printf("move_pages error: %d", mp_status[0]);
- }
- else {
- numa_node_b = mp_status[0];
+ if (arr_b != NULL) {
+ mp_pages[0] = arr_b;
+ if (move_pages(0, 1, mp_pages, NULL, mp_status, 0) == -1) {
+ perror("move_pages(arr_b)");
+ }
+ else if (mp_status[0] < 0) {
+ printf("move_pages(arr_b) error: %d\n", mp_status[0]);
+ }
+ else {
+ numa_node_b = mp_status[0];
+ }
}
if (numa_node_cpu != -1) {
@@ -751,7 +876,13 @@ int main(int argc, char **argv)
err(1, "sem_init");
}
threads = calloc(num_threads, sizeof(pthread_t));
+ if (sanity_check) {
+ partial_sum = calloc(num_threads, sizeof(long));
+ }
for (i=0; i < num_threads; i++) {
+ if (sanity_check) {
+ partial_sum[i] = 0;
+ }
if (pthread_create(&threads[i], NULL, thread_worker, (void*)(unsigned long)i) != 0) {
err(1, "pthread_create");
}
@@ -777,6 +908,10 @@ int main(int argc, char **argv)
printf("[::] read");
} else if (test_type == TEST_WRITE_PLAIN) {
printf("[::] write");
+ } else if (test_type == TEST_READ_AVX512) {
+ printf("[::] read-avx512");
+ } else if (test_type == TEST_WRITE_AVX512) {
+ printf("[::] write-avx512");
}
printf(" | block_size_B=%llu array_size_B=%llu ", block_size, arr_size*long_size);
#ifdef MULTITHREADED
@@ -802,6 +937,17 @@ int main(int argc, char **argv)
err(1, "pthread_join");
}
}
+ if (sanity_check && (tests[TEST_READ_PLAIN] || tests[TEST_READ_AVX512])) {
+ long tmp = 0;
+ for (i=0; i < num_threads; i++) {
+ tmp += partial_sum[i];
+ }
+ if (tmp != arr_a_sum) {
+ printf("expected: arr_a_sum == %12ld (%016lx)\n", arr_a_sum, arr_a_sum);
+ printf("output: sum(partial) == %12ld (%016lx)\n", tmp, tmp);
+ }
+ assert(tmp == arr_a_sum);
+ }
#endif
free(arr_a);
diff --git a/milos-roofline.sh b/milos-roofline.sh
deleted file mode 100755
index 092d147..0000000
--- a/milos-roofline.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/sh
-
-mkdir -p log
-fn=log/${HOST}-roofline
-
-make -B numa=1 pthread=1
-
-parallel -j1 --eta --joblog ${fn}.joblog --resume --header : \
- ./mbw -a {ram_in} -b {ram_out} -c {cpu} -n 10 -N {nr_threads} -t0 4096 \
- ::: ram_in $(seq 0 15) \
- ::: ram_out $(seq 0 15) \
- ::: cpu $(seq 0 7) \
- ::: nr_threads $(seq 1 16) \
->> ${fn}.txt