summaryrefslogtreecommitdiff
path: root/src/run.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/run.cpp')
-rw-r--r--src/run.cpp792
1 files changed, 792 insertions, 0 deletions
diff --git a/src/run.cpp b/src/run.cpp
new file mode 100644
index 0000000..54ef7c1
--- /dev/null
+++ b/src/run.cpp
@@ -0,0 +1,792 @@
+/*******************************************************************************
+ * Copyright (c) 2006 International Business Machines Corporation. *
+ * All rights reserved. This program and the accompanying materials *
+ * are made available under the terms of the Common Public License v1.0 *
+ * which accompanies this distribution, and is available at *
+ * http://www.opensource.org/licenses/cpl1.0.php *
+ * *
+ * Contributors: *
+ * Douglas M. Pase - initial API and implementation *
+ *******************************************************************************/
+
+//
+// Configuration
+//
+
+// Implementation header
+#include "run.h"
+
+// System includes
+#include <cstdio>
+#include <cstdlib>
+#include <unistd.h>
+#include <cstddef>
+#include <vector>
+#if defined(NUMA)
+#include <numa.h>
+#endif
+
+// Local includes
+#include <AsmJit/AsmJit.h>
+#include "timer.h"
+
+
+//
+// Implementation
+//
+
+static double max(double v1, double v2);
+static double min(double v1, double v2);
+typedef void (*benchmark)(const Chain**);
+typedef benchmark (*generator)(int64 chains_per_thread,
+ int64 bytes_per_line, int64 bytes_per_chain,
+ int64 stride, int64 busy_cycles, bool prefetch);
+static benchmark chase_pointers(int64 chains_per_thread,
+ int64 bytes_per_line, int64 bytes_per_chain,
+ int64 stride, int64 busy_cycles, bool prefetch);
+static benchmark follow_streams(int64 chains_per_thread,
+ int64 bytes_per_line, int64 bytes_per_chain,
+ int64 stride, int64 busy_cycles, bool prefetch);
+
+Lock Run::global_mutex;
+int64 Run::_ops_per_chain = 0;
+double Run::_seconds = 1E9;
+
+Run::Run() :
+ exp(NULL), bp(NULL) {
+}
+
+Run::~Run() {
+}
+
+void Run::set(Experiment &e, SpinBarrier* sbp) {
+ this->exp = &e;
+ this->bp = sbp;
+}
+
+int Run::run() {
+ // first allocate all memory for the chains,
+ // making sure it is allocated within the
+ // intended numa domains
+ Chain** chain_memory = new Chain*[this->exp->chains_per_thread];
+ Chain** root = new Chain*[this->exp->chains_per_thread];
+
+#if defined(NUMA)
+ // establish the node id where this thread
+ // will run. threads are mapped to nodes
+ // by the set-up code for Experiment.
+ int run_node_id = this->exp->thread_domain[this->thread_id()];
+ numa_run_on_node(run_node_id);
+
+ // establish the node id where this thread's
+ // memory will be allocated.
+ for (int i=0; i < this->exp->chains_per_thread; i++) {
+ int alloc_node_id = this->exp->chain_domain[this->thread_id()][i];
+ nodemask_t alloc_mask;
+ nodemask_zero(&alloc_mask);
+ nodemask_set(&alloc_mask, alloc_node_id);
+ numa_set_membind(&alloc_mask);
+
+ chain_memory[i] = new Chain[ this->exp->links_per_chain ];
+ }
+#else
+ for (int i = 0; i < this->exp->chains_per_thread; i++) {
+ chain_memory[i] = new Chain[this->exp->links_per_chain];
+ }
+#endif
+
+ // initialize the chains and
+ // compile the function that
+ // will execute the tests
+ generator gen;
+ for (int i = 0; i < this->exp->chains_per_thread; i++) {
+ if (this->exp->access_pattern == Experiment::RANDOM) {
+ root[i] = random_mem_init(chain_memory[i]);
+ gen = chase_pointers;
+ } else if (this->exp->access_pattern == Experiment::STRIDED) {
+ if (0 < this->exp->stride) {
+ root[i] = forward_mem_init(chain_memory[i]);
+ } else {
+ root[i] = reverse_mem_init(chain_memory[i]);
+ }
+ gen = chase_pointers;
+ } else if (this->exp->access_pattern == Experiment::STREAM) {
+ root[i] = stream_mem_init(chain_memory[i]);
+ gen = follow_streams;
+ }
+ }
+
+ if (this->exp->iterations <= 0) {
+ // compile benchmark
+ benchmark bench = gen(this->exp->chains_per_thread,
+ this->exp->bytes_per_line, this->exp->bytes_per_chain,
+ this->exp->stride, this->exp->busy_cycles,
+ this->exp->prefetch);
+
+ volatile static double istart = 0;
+ volatile static double istop = 0;
+ volatile static double elapsed = 0;
+ volatile static int64 iters = 1;
+ volatile double bound = max(0.2, 10 * Timer::resolution());
+ for (iters = 1; elapsed <= bound; iters = iters << 1) {
+ // barrier
+ this->bp->barrier();
+
+ // start timer
+ if (this->thread_id() == 0) {
+ istart = Timer::seconds();
+ }
+ this->bp->barrier();
+
+ // chase pointers
+ for (int i = 0; i < iters; i++)
+ bench((const Chain**) root);
+
+ // barrier
+ this->bp->barrier();
+
+ // stop timer
+ if (this->thread_id() == 0) {
+ istop = Timer::seconds();
+ elapsed = istop - istart;
+ }
+ this->bp->barrier();
+ }
+
+ // calculate the number of iterations
+ if (this->thread_id() == 0) {
+ if (0 < this->exp->seconds) {
+ this->exp->iterations = max(1,
+ 0.9999 + 0.5 * this->exp->seconds * iters / elapsed);
+ } else {
+ this->exp->iterations = max(1, 0.9999 + iters / elapsed);
+ }
+ }
+ this->bp->barrier();
+ }
+#if defined(UNDEFINED)
+#endif
+
+ // compile benchmark
+ benchmark bench = gen(this->exp->chains_per_thread,
+ this->exp->bytes_per_line, this->exp->bytes_per_chain,
+ this->exp->stride, this->exp->busy_cycles,
+ this->exp->prefetch);
+
+ for (int e = 0; e < this->exp->experiments; e++) {
+ // barrier
+ this->bp->barrier();
+
+ // start timer
+ double start = 0;
+ if (this->thread_id() == 0)
+ start = Timer::seconds();
+ this->bp->barrier();
+
+ // chase pointers
+ for (int i = 0; i < this->exp->iterations; i++)
+ bench((const Chain**) root);
+
+ // barrier
+ this->bp->barrier();
+
+ // stop timer
+ double stop = 0;
+ if (this->thread_id() == 0)
+ stop = Timer::seconds();
+ this->bp->barrier();
+
+ if (0 <= e) {
+ if (this->thread_id() == 0) {
+ double delta = stop - start;
+ if (0 < delta) {
+ Run::_seconds = min(Run::_seconds, delta);
+ }
+ }
+ }
+ }
+
+ this->bp->barrier();
+
+ for (int i = 0; i < this->exp->chains_per_thread; i++) {
+ if (chain_memory[i] != NULL
+ ) delete[] chain_memory[i];
+ }
+ if (chain_memory != NULL
+ ) delete[] chain_memory;
+
+ return 0;
+}
+
+int dummy = 0;
+void Run::mem_check(Chain *m) {
+ if (m == NULL
+ ) dummy += 1;
+}
+
+static double max(double v1, double v2) {
+ if (v1 < v2)
+ return v2;
+ return v1;
+}
+
+static double min(double v1, double v2) {
+ if (v2 < v1)
+ return v2;
+ return v1;
+}
+
+// exclude 2 and Mersenne primes, i.e.,
+// primes of the form 2**n - 1, e.g.,
+// 3, 7, 31, 127
+static const int prime_table[] = { 5, 11, 13, 17, 19, 23, 37, 41, 43, 47, 53,
+ 61, 71, 73, 79, 83, 89, 97, 101, 103, 109, 113, 131, 137, 139, 149, 151,
+ 157, 163, };
+static const int prime_table_size = sizeof prime_table / sizeof prime_table[0];
+
+Chain*
+Run::random_mem_init(Chain *mem) {
+ // initialize pointers --
+ // choose a page at random, then use
+ // one pointer from each cache line
+ // within the page. all pages and
+ // cache lines are chosen at random.
+ Chain* root = 0;
+ Chain* prev = 0;
+ int link_within_line = 0;
+ int64 local_ops_per_chain = 0;
+
+ // we must set a lock because random()
+ // is not thread safe
+ Run::global_mutex.lock();
+ setstate(this->exp->random_state[this->thread_id()]);
+ int page_factor = prime_table[random() % prime_table_size];
+ int page_offset = random() % this->exp->pages_per_chain;
+ Run::global_mutex.unlock();
+
+ // loop through the pages
+ for (int i = 0; i < this->exp->pages_per_chain; i++) {
+ int page = (page_factor * i + page_offset) % this->exp->pages_per_chain;
+ Run::global_mutex.lock();
+ setstate(this->exp->random_state[this->thread_id()]);
+ int line_factor = prime_table[random() % prime_table_size];
+ int line_offset = random() % this->exp->lines_per_page;
+ Run::global_mutex.unlock();
+
+ // loop through the lines within a page
+ for (int j = 0; j < this->exp->lines_per_page; j++) {
+ int line_within_page = (line_factor * j + line_offset)
+ % this->exp->lines_per_page;
+ int link = page * this->exp->links_per_page
+ + line_within_page * this->exp->links_per_line
+ + link_within_line;
+
+ if (root == 0) {
+// printf("root = %d(%d)[0x%x].\n", page, line_within_page, mem+link);
+ prev = root = mem + link;
+ local_ops_per_chain += 1;
+ } else {
+// printf("0x%x = %d(%d)[0x%x].\n", prev, page, line_within_page, mem+link);
+ prev->next = mem + link;
+ prev = prev->next;
+ local_ops_per_chain += 1;
+ }
+ }
+ }
+
+ prev->next = root;
+
+ Run::global_mutex.lock();
+ Run::_ops_per_chain = local_ops_per_chain;
+ Run::global_mutex.unlock();
+
+ return root;
+}
+
+Chain*
+Run::forward_mem_init(Chain *mem) {
+ Chain* root = 0;
+ Chain* prev = 0;
+ int link_within_line = 0;
+ int64 local_ops_per_chain = 0;
+
+ for (int i = 0; i < this->exp->lines_per_chain; i += this->exp->stride) {
+ int link = i * this->exp->links_per_line + link_within_line;
+ if (root == NULL) {
+// printf("root = %d(%d)[0x%x].\n", page, line_within_page, mem+link);
+ prev = root = mem + link;
+ local_ops_per_chain += 1;
+ } else {
+// printf("0x%x = %d(%d)[0x%x].\n", prev, page, line_within_page, mem+link);
+ prev->next = mem + link;
+ prev = prev->next;
+ local_ops_per_chain += 1;
+ }
+ }
+
+ prev->next = root;
+
+ Run::global_mutex.lock();
+ Run::_ops_per_chain = local_ops_per_chain;
+ Run::global_mutex.unlock();
+
+ return root;
+}
+
+Chain*
+Run::reverse_mem_init(Chain *mem) {
+ Chain* root = 0;
+ Chain* prev = 0;
+ int link_within_line = 0;
+ int64 local_ops_per_chain = 0;
+
+ int stride = -this->exp->stride;
+ int last;
+ for (int i = 0; i < this->exp->lines_per_chain; i += stride) {
+ last = i;
+ }
+
+ for (int i = last; 0 <= i; i -= stride) {
+ int link = i * this->exp->links_per_line + link_within_line;
+ if (root == 0) {
+// printf("root = %d(%d)[0x%x].\n", page, line_within_page, mem+link);
+ prev = root = mem + link;
+ local_ops_per_chain += 1;
+ } else {
+// printf("0x%x = %d(%d)[0x%x].\n", prev, page, line_within_page, mem+link);
+ prev->next = mem + link;
+ prev = prev->next;
+ local_ops_per_chain += 1;
+ }
+ }
+
+ prev->next = root;
+
+ Run::global_mutex.lock();
+ Run::_ops_per_chain = local_ops_per_chain;
+ Run::global_mutex.unlock();
+
+ return root;
+}
+
+static benchmark chase_pointers(int64 chains_per_thread, // memory loading per thread
+ int64 bytes_per_line, // ignored
+ int64 bytes_per_chain, // ignored
+ int64 stride, // ignored
+ int64 busy_cycles, // processing cycles
+ bool prefetch // prefetch?
+ ) {
+ // Create Compiler.
+ AsmJit::Compiler c;
+
+ // Tell compiler the function prototype we want. It allocates variables representing
+ // function arguments that can be accessed through Compiler or Function instance.
+ c.newFunction(AsmJit::CALL_CONV_DEFAULT, AsmJit::FunctionBuilder1<AsmJit::Void, const Chain**>());
+
+ // Try to generate function without prolog/epilog code:
+ c.getFunction()->setHint(AsmJit::FUNCTION_HINT_NAKED, true);
+
+ // Create labels.
+ AsmJit::Label L_Loop = c.newLabel();
+
+ // Function arguments.
+ AsmJit::GPVar chain(c.argGP(0));
+
+ // Save the head
+ std::vector<AsmJit::GPVar> heads(chains_per_thread);
+ for (int i = 0; i < chains_per_thread; i++) {
+ AsmJit::GPVar head = c.newGP();
+ c.mov(head, ptr(chain));
+ heads[i] = head;
+ }
+
+ // Current position
+ std::vector<AsmJit::GPVar> positions(chains_per_thread);
+ for (int i = 0; i < chains_per_thread; i++) {
+ AsmJit::GPVar position = c.newGP();
+ c.mov(position, heads[0]);
+ positions[i] = position;
+ }
+
+ // Loop.
+ c.bind(L_Loop);
+
+ // Process all links
+ for (int i = 0; i < chains_per_thread; i++) {
+ // Chase pointer
+ c.mov(positions[i], ptr(positions[i], offsetof(Chain, next)));
+
+ // Prefetch next
+ if (prefetch)
+ c.prefetch(ptr(positions[i]), AsmJit::PREFETCH_T0);
+ }
+
+ // Wait
+ for (int i = 0; i < busy_cycles; i++)
+ c.nop();
+
+ // Test if end reached
+ c.cmp(heads[0], positions[0]);
+ c.jne(L_Loop);
+
+ // Finish.
+ c.endFunction();
+
+ // Make JIT function.
+ benchmark fn = AsmJit::function_cast<benchmark>(c.make());
+
+ // Ensure that everything is ok.
+ if (!fn) {
+ printf("Error making jit function (%u).\n", c.getError());
+ return 0;
+ }
+
+ return fn;
+}
+
+// NOT WRITTEN YET -- DMP
+// JUST A PLACE HOLDER!
+Chain* Run::stream_mem_init(Chain *mem) {
+// fprintf(stderr, "made it into stream_mem_init.\n");
+// fprintf(stderr, "chains_per_thread = %ld\n", this->exp->chains_per_thread);
+// fprintf(stderr, "iterations = %ld\n", this->exp->iterations);
+// fprintf(stderr, "bytes_per_chain = %ld\n", this->exp->bytes_per_chain);
+// fprintf(stderr, "stride = %ld\n", this->exp->stride);
+ int64 local_ops_per_chain = 0;
+ double* tmp = (double *) mem;
+ int64 refs_per_line = this->exp->bytes_per_line / sizeof(double);
+ int64 refs_per_chain = this->exp->bytes_per_chain / sizeof(double);
+// fprintf(stderr, "refs_per_chain = %ld\n", refs_per_chain);
+
+ for (int64 i = 0; i < refs_per_chain;
+ i += this->exp->stride * refs_per_line) {
+ tmp[i] = 0;
+ local_ops_per_chain += 1;
+ }
+
+ Run::global_mutex.lock();
+ Run::_ops_per_chain = local_ops_per_chain;
+ Run::global_mutex.unlock();
+
+// fprintf(stderr, "made it out of stream_mem_init.\n");
+ return mem;
+}
+
+static int64 summ_ck = 0;
+void sum_chk(double t) {
+ if (t != 0)
+ summ_ck += 1;
+}
+
+// NOT WRITTEN YET -- DMP
+// JUST A PLACE HOLDER!
+static benchmark follow_streams(int64 chains_per_thread, // memory loading per thread
+ int64 bytes_per_line, // ignored
+ int64 bytes_per_chain, // ignored
+ int64 stride, // ignored
+ int64 busy_cycles, // ignored
+ bool prefetch // ignored
+ ) {
+ return 0;
+ /*
+ int64 refs_per_line = bytes_per_line / sizeof(double);
+ int64 refs_per_chain = bytes_per_chain / sizeof(double);
+
+ // chase pointers
+ switch (chains_per_thread) {
+ default:
+ case 1:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 2:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 3:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 4:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 5:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 6:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 7:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 8:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ double* a7 = (double *) root[7];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]
+ + a7[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 9:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ double* a7 = (double *) root[7];
+ double* a8 = (double *) root[8];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]
+ + a7[j] + a8[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 10:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ double* a7 = (double *) root[7];
+ double* a8 = (double *) root[8];
+ double* a9 = (double *) root[9];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]
+ + a7[j] + a8[j] + a9[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 11:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ double* a7 = (double *) root[7];
+ double* a8 = (double *) root[8];
+ double* a9 = (double *) root[9];
+ double* a10 = (double *) root[10];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]
+ + a7[j] + a8[j] + a9[j] + a10[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 12:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ double* a7 = (double *) root[7];
+ double* a8 = (double *) root[8];
+ double* a9 = (double *) root[9];
+ double* a10 = (double *) root[10];
+ double* a11 = (double *) root[11];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]
+ + a7[j] + a8[j] + a9[j] + a10[j] + a11[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 13:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ double* a7 = (double *) root[7];
+ double* a8 = (double *) root[8];
+ double* a9 = (double *) root[9];
+ double* a10 = (double *) root[10];
+ double* a11 = (double *) root[11];
+ double* a12 = (double *) root[12];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]
+ + a7[j] + a8[j] + a9[j] + a10[j] + a11[j] + a12[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 14:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ double* a7 = (double *) root[7];
+ double* a8 = (double *) root[8];
+ double* a9 = (double *) root[9];
+ double* a10 = (double *) root[10];
+ double* a11 = (double *) root[11];
+ double* a12 = (double *) root[12];
+ double* a13 = (double *) root[13];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]
+ + a7[j] + a8[j] + a9[j] + a10[j] + a11[j] + a12[j]
+ + a13[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 15:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ double* a7 = (double *) root[7];
+ double* a8 = (double *) root[8];
+ double* a9 = (double *) root[9];
+ double* a10 = (double *) root[10];
+ double* a11 = (double *) root[11];
+ double* a12 = (double *) root[12];
+ double* a13 = (double *) root[13];
+ double* a14 = (double *) root[14];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]
+ + a7[j] + a8[j] + a9[j] + a10[j] + a11[j] + a12[j]
+ + a13[j] + a14[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ case 16:
+ for (int64 i = 0; i < iterations; i++) {
+ double t = 0;
+ double* a0 = (double *) root[0];
+ double* a1 = (double *) root[1];
+ double* a2 = (double *) root[2];
+ double* a3 = (double *) root[3];
+ double* a4 = (double *) root[4];
+ double* a5 = (double *) root[5];
+ double* a6 = (double *) root[6];
+ double* a7 = (double *) root[7];
+ double* a8 = (double *) root[8];
+ double* a9 = (double *) root[9];
+ double* a10 = (double *) root[10];
+ double* a11 = (double *) root[11];
+ double* a12 = (double *) root[12];
+ double* a13 = (double *) root[13];
+ double* a14 = (double *) root[14];
+ double* a15 = (double *) root[15];
+ for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) {
+ t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]
+ + a7[j] + a8[j] + a9[j] + a10[j] + a11[j] + a12[j]
+ + a13[j] + a14[j] + a15[j];
+ }
+ sum_chk(t);
+ }
+ break;
+ }
+ */
+}