diff options
author | Tim Besard <tim.besard@gmail.com> | 2011-11-02 09:13:38 +0100 |
---|---|---|
committer | Tim Besard <tim.besard@gmail.com> | 2011-11-02 09:13:38 +0100 |
commit | c108197c20aa7b93849e383a3aaaf7b2bba30405 (patch) | |
tree | f05aa2ff019892dfc936ad2242d43373191f16cd /src | |
parent | eb6995fb5a0f4382cb4a01d301423e74ea8babe6 (diff) |
Formatting the source.
Diffstat (limited to 'src')
-rw-r--r-- | src/Chain.cpp | 23 | ||||
-rw-r--r-- | src/Chain.h | 11 | ||||
-rw-r--r-- | src/Experiment.cpp | 1068 | ||||
-rw-r--r-- | src/Experiment.h | 37 | ||||
-rw-r--r-- | src/Lock.cpp | 31 | ||||
-rw-r--r-- | src/Lock.h | 13 | ||||
-rw-r--r-- | src/Main.c | 114 | ||||
-rw-r--r-- | src/Main.cpp | 107 | ||||
-rw-r--r-- | src/Output.cpp | 74 | ||||
-rw-r--r-- | src/Output.h | 9 | ||||
-rw-r--r-- | src/Run.cpp | 2259 | ||||
-rw-r--r-- | src/Run.h | 44 | ||||
-rw-r--r-- | src/SpinBarrier.cpp | 25 | ||||
-rw-r--r-- | src/SpinBarrier.h | 11 | ||||
-rw-r--r-- | src/Thread.cpp | 64 | ||||
-rw-r--r-- | src/Thread.h | 43 | ||||
-rw-r--r-- | src/Timer.cpp | 192 | ||||
-rw-r--r-- | src/Timer.h | 11 | ||||
-rw-r--r-- | src/Types.cpp | 1 | ||||
-rw-r--r-- | src/Types.h | 3 |
20 files changed, 2085 insertions, 2055 deletions
diff --git a/src/Chain.cpp b/src/Chain.cpp index 1eda774..ceb1b31 100644 --- a/src/Chain.cpp +++ b/src/Chain.cpp @@ -7,29 +7,24 @@ * * * Contributors: * * Douglas M. Pase - initial API and implementation * - *******************************************************************************/ - + *******************************************************************************/ #include <stdio.h> #include "Chain.h" -Chain::Chain() -: next(END_OF_CHAIN) -{ +Chain::Chain() : + next(END_OF_CHAIN) { } -Chain::Chain(Chain *end) -: next(end) -{ +Chain::Chain(Chain *end) : + next(end) { } -Chain::~Chain() -{ +Chain::~Chain() { } -Chain* Chain::END() -{ - static Chain chain((Chain*) 0xDEADBEEF); - return &chain; +Chain* Chain::END() { + static Chain chain((Chain*) 0xDEADBEEF); + return &chain; } diff --git a/src/Chain.h b/src/Chain.h index 5a55865..8413a43 100644 --- a/src/Chain.h +++ b/src/Chain.h @@ -9,18 +9,17 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #if !defined(Chain_h) #define Chain_h class Chain { public: - Chain(); - Chain(Chain* end); - ~Chain(); - Chain* next; + Chain(); + Chain(Chain* end); + ~Chain(); + Chain* next; - static Chain* END(); + static Chain* END(); private: }; diff --git a/src/Experiment.cpp b/src/Experiment.cpp index 27e1a25..e58be0a 100644 --- a/src/Experiment.cpp +++ b/src/Experiment.cpp @@ -35,7 +35,7 @@ Experiment::Experiment() : pages_per_chain (DEFAULT_PAGES_PER_CHAIN), chains_per_thread(DEFAULT_CHAINS_PER_THREAD), bytes_per_thread (DEFAULT_BYTES_PER_THREAD), - num_threads (DEFAULT_THREADS), + num_threads (DEFAULT_THREADS), bytes_per_test (DEFAULT_BYTES_PER_TEST), busy_cycles (DEFAULT_BUSY_CYCLES), seconds (DEFAULT_SECONDS), @@ -55,233 +55,334 @@ Experiment::Experiment() : { } -Experiment::~Experiment() -{ +Experiment::~Experiment() { } - // interface: - // - // -l or --line bytes per cache line (line size) - // -p or --page bytes per page (page size) - // -c or --chain bytes per chain (used to compute pages per chain) - // -r or --references chains per thread (memory loading) - // -t or --threads number of threads (concurrency and contention) - // -i or --iters iterations - // -e or --experiments experiments - // -b or --busy amount of cycles processor should remain busy - // -f or --prefetch prefetch data - // -a or --access memory access pattern - // random random access pattern - // forward <stride> exclusive OR and mask - // reverse <stride> addition and offset - // -o or --output output mode - // hdr header only - // csv csv only - // both header + csv - // table human-readable table of values - // -n or --numa numa placement - // local local allocation of all chains - // xor <mask> exclusive OR and mask - // add <offset> addition and offset - // map <map> explicit mapping of threads and chains to domains - -int -Experiment::parse_args(int argc, char* argv[]) -{ - int error = 0; - for (int i=1; i < argc; i++) { - if (strcasecmp(argv[i], "-x") == 0 || strcasecmp(argv[i], "--strict") == 0) { - this->strict = 1; - } else if (strcasecmp(argv[i], "-s") == 0 || strcasecmp(argv[i], "--seconds") == 0) { - i++; - if (i == argc) { error = 1; break; } - this->seconds = Experiment::parse_real(argv[i]); - this->iterations = 0; - if (this->seconds == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "-l") == 0 || strcasecmp(argv[i], "--line") == 0) { - i++; - if (i == argc) { error = 1; break; } - this->bytes_per_line = Experiment::parse_number(argv[i]); - if (this->bytes_per_line == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "-p") == 0 || strcasecmp(argv[i], "--page") == 0) { - i++; - if (i == argc) { error = 1; break; } - this->bytes_per_page = Experiment::parse_number(argv[i]); - if (this->bytes_per_page == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "-c") == 0 || strcasecmp(argv[i], "--chain") == 0) { - i++; - if (i == argc) { error = 1; break; } - this->bytes_per_chain = Experiment::parse_number(argv[i]); - if (this->bytes_per_chain == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "-r") == 0 || strcasecmp(argv[i], "--references") == 0) { - i++; - if (i == argc) { error = 1; break; } - this->chains_per_thread = Experiment::parse_number(argv[i]); - if (this->chains_per_thread == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "-t") == 0 || strcasecmp(argv[i], "--threads") == 0) { - i++; - if (i == argc) { error = 1; break; } - this->num_threads = Experiment::parse_number(argv[i]); - if (this->num_threads == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "-i") == 0 || strcasecmp(argv[i], "--iterations") == 0) { - i++; - if (i == argc) { error = 1; break; } - this->iterations = Experiment::parse_number(argv[i]); - this->seconds = 0; - if (this->iterations == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "-e") == 0 || strcasecmp(argv[i], "--experiments") == 0) { - i++; - if (i == argc) { error = 1; break; } - this->experiments = Experiment::parse_number(argv[i]); - if (this->experiments == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "-b") == 0 || strcasecmp(argv[i], "--busy") == 0) { - i++; - if (i == argc) { error = 1; break; } - this->busy_cycles = Experiment::parse_number(argv[i]); - if (this->experiments == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "-f") == 0 || strcasecmp(argv[i], "--prefetch") == 0) { - this->prefetch = true; - } else if (strcasecmp(argv[i], "-a") == 0 || strcasecmp(argv[i], "--access") == 0) { - i++; - if (i == argc) { error = 1; break; } - if (strcasecmp(argv[i], "random") == 0) { - this->access_pattern = RANDOM; - } else if (strcasecmp(argv[i], "forward") == 0) { - this->access_pattern = STRIDED; - i++; - if (i == argc) { error = 1; break; } - this->stride = Experiment::parse_number(argv[i]); - if (this->stride == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "reverse") == 0) { - this->access_pattern = STRIDED; - i++; - if (i == argc) { error = 1; break; } - this->stride = - Experiment::parse_number(argv[i]); - if (this->stride == 0) { error = 1; break; } - } else if (strcasecmp(argv[i], "stream") == 0) { - this->access_pattern = STREAM; - i++; - if (i == argc) { error = 1; break; } - this->stride = Experiment::parse_number(argv[i]); - if (this->stride == 0) { error = 1; break; } - } else { - error = 1; - break; - } - } else if (strcasecmp(argv[i], "-o") == 0 || strcasecmp(argv[i], "--output") == 0) { - i++; - if (i == argc) { error = 1; break; } - if (strcasecmp(argv[i], "table") == 0) { - this->output_mode = TABLE; - } else if (strcasecmp(argv[i], "csv") == 0) { - this->output_mode = CSV; - } else if (strcasecmp(argv[i], "both") == 0) { - this->output_mode = BOTH; - } else if (strcasecmp(argv[i], "hdr") == 0) { - this->output_mode = HEADER; - } else if (strcasecmp(argv[i], "header") == 0) { - this->output_mode = HEADER; - } else { - error = 1; - break; - } - } else if (strcasecmp(argv[i], "-n") == 0 || strcasecmp(argv[i], "--numa") == 0) { - i++; - if (i == argc) { error = 1; break; } - if (strcasecmp(argv[i], "local") == 0) { - this->numa_placement = LOCAL; - } else if (strcasecmp(argv[i], "xor") == 0) { - this->numa_placement = XOR; - i++; - if (i == argc) { error = 1; break; } - this->offset_or_mask = Experiment::parse_number(argv[i]); - } else if (strcasecmp(argv[i], "add") == 0) { - this->numa_placement = ADD; - i++; - if (i == argc) { error = 1; break; } - this->offset_or_mask = Experiment::parse_number(argv[i]); - } else if (strcasecmp(argv[i], "map") == 0) { - this->numa_placement = MAP; - i++; - if (i == argc) { error = 1; break; } - this->placement_map = argv[i]; - } else { - error = 1; - break; - } - } else { - error = 1; - break; +// interface: +// +// -l or --line bytes per cache line (line size) +// -p or --page bytes per page (page size) +// -c or --chain bytes per chain (used to compute pages per chain) +// -r or --references chains per thread (memory loading) +// -t or --threads number of threads (concurrency and contention) +// -i or --iters iterations +// -e or --experiments experiments +// -b or --busy amount of cycles processor should remain busy +// -f or --prefetch prefetch data +// -a or --access memory access pattern +// random random access pattern +// forward <stride> exclusive OR and mask +// reverse <stride> addition and offset +// -o or --output output mode +// hdr header only +// csv csv only +// both header + csv +// table human-readable table of values +// -n or --numa numa placement +// local local allocation of all chains +// xor <mask> exclusive OR and mask +// add <offset> addition and offset +// map <map> explicit mapping of threads and chains to domains + +int Experiment::parse_args(int argc, char* argv[]) { + int error = 0; + for (int i = 1; i < argc; i++) { + if (strcasecmp(argv[i], "-x") == 0 + || strcasecmp(argv[i], "--strict") == 0) { + this->strict = 1; + } else if (strcasecmp(argv[i], "-s") == 0 + || strcasecmp(argv[i], "--seconds") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + this->seconds = Experiment::parse_real(argv[i]); + this->iterations = 0; + if (this->seconds == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-l") == 0 + || strcasecmp(argv[i], "--line") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + this->bytes_per_line = Experiment::parse_number(argv[i]); + if (this->bytes_per_line == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-p") == 0 + || strcasecmp(argv[i], "--page") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + this->bytes_per_page = Experiment::parse_number(argv[i]); + if (this->bytes_per_page == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-c") == 0 + || strcasecmp(argv[i], "--chain") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + this->bytes_per_chain = Experiment::parse_number(argv[i]); + if (this->bytes_per_chain == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-r") == 0 + || strcasecmp(argv[i], "--references") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + this->chains_per_thread = Experiment::parse_number(argv[i]); + if (this->chains_per_thread == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-t") == 0 + || strcasecmp(argv[i], "--threads") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + this->num_threads = Experiment::parse_number(argv[i]); + if (this->num_threads == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-i") == 0 + || strcasecmp(argv[i], "--iterations") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + this->iterations = Experiment::parse_number(argv[i]); + this->seconds = 0; + if (this->iterations == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-e") == 0 + || strcasecmp(argv[i], "--experiments") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + this->experiments = Experiment::parse_number(argv[i]); + if (this->experiments == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-b") == 0 + || strcasecmp(argv[i], "--busy") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + this->busy_cycles = Experiment::parse_number(argv[i]); + if (this->experiments == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-f") == 0 + || strcasecmp(argv[i], "--prefetch") == 0) { + this->prefetch = true; + } else if (strcasecmp(argv[i], "-a") == 0 + || strcasecmp(argv[i], "--access") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + if (strcasecmp(argv[i], "random") == 0) { + this->access_pattern = RANDOM; + } else if (strcasecmp(argv[i], "forward") == 0) { + this->access_pattern = STRIDED; + i++; + if (i == argc) { + error = 1; + break; + } + this->stride = Experiment::parse_number(argv[i]); + if (this->stride == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "reverse") == 0) { + this->access_pattern = STRIDED; + i++; + if (i == argc) { + error = 1; + break; + } + this->stride = -Experiment::parse_number(argv[i]); + if (this->stride == 0) { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "stream") == 0) { + this->access_pattern = STREAM; + i++; + if (i == argc) { + error = 1; + break; + } + this->stride = Experiment::parse_number(argv[i]); + if (this->stride == 0) { + error = 1; + break; + } + } else { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-o") == 0 + || strcasecmp(argv[i], "--output") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + if (strcasecmp(argv[i], "table") == 0) { + this->output_mode = TABLE; + } else if (strcasecmp(argv[i], "csv") == 0) { + this->output_mode = CSV; + } else if (strcasecmp(argv[i], "both") == 0) { + this->output_mode = BOTH; + } else if (strcasecmp(argv[i], "hdr") == 0) { + this->output_mode = HEADER; + } else if (strcasecmp(argv[i], "header") == 0) { + this->output_mode = HEADER; + } else { + error = 1; + break; + } + } else if (strcasecmp(argv[i], "-n") == 0 + || strcasecmp(argv[i], "--numa") == 0) { + i++; + if (i == argc) { + error = 1; + break; + } + if (strcasecmp(argv[i], "local") == 0) { + this->numa_placement = LOCAL; + } else if (strcasecmp(argv[i], "xor") == 0) { + this->numa_placement = XOR; + i++; + if (i == argc) { + error = 1; + break; + } + this->offset_or_mask = Experiment::parse_number(argv[i]); + } else if (strcasecmp(argv[i], "add") == 0) { + this->numa_placement = ADD; + i++; + if (i == argc) { + error = 1; + break; + } + this->offset_or_mask = Experiment::parse_number(argv[i]); + } else if (strcasecmp(argv[i], "map") == 0) { + this->numa_placement = MAP; + i++; + if (i == argc) { + error = 1; + break; + } + this->placement_map = argv[i]; + } else { + error = 1; + break; + } + } else { + error = 1; + break; + } + } + + + // if we've hit an error, print a message and quit + if (error) { + printf("usage: %s <options>\n", argv[0]); + printf("where <options> are selected from the following:\n"); + printf(" [-h|--help] # this message\n"); + printf(" [-l|--line] <number> # bytes per cache line (cache line size)\n"); + printf(" [-p|--page] <number> # bytes per page (page size)\n"); + printf(" [-c|--chain] <number> # bytes per chain (used to compute pages per chain)\n"); + printf(" [-r|--references] <number> # chains per thread (memory loading)\n"); + printf(" [-t|--threads] <number> # number of threads (concurrency and contention)\n"); + printf(" [-i|--iterations] <number> # iterations per experiment\n"); + printf(" [-e|--experiments] <number> # experiments\n"); + printf(" [-a|--access] <pattern> # memory access pattern\n"); + printf(" [-o|--output] <format> # output format\n"); + printf(" [-n|--numa] <placement> # numa placement\n"); + printf(" [-s|--seconds] <number> # run each experiment for <number> seconds\n"); + printf(" [-b|--busy] <number> # how much processing cycles each loop should count\n"); + printf(" [-f|--prefetch] # prefetch data\n"); + printf(" [-x|--strict] # fail rather than adjust options to sensible values\n"); + printf("\n"); + printf("<pattern> is selected from the following:\n"); + printf(" random # all chains are accessed randomly\n"); + printf(" forward <stride> # chains are in forward order with constant stride\n"); + printf(" reverse <stride> # chains are in reverse order with constant stride\n"); + printf(" stream <stride> # references are calculated rather than read from memory\n"); + printf("\n"); + printf("Note: <stride> is always a small positive integer.\n"); + printf("\n"); + printf("<format> is selected from the following:\n"); + printf(" hdr # csv header only\n"); + printf(" csv # results in csv format only\n"); + printf(" both # header and results in csv format\n"); + printf(" table # human-readable table of values\n"); + printf("\n"); + printf("<placement> is selected from the following:\n"); + printf(" local # all chains are allocated locally\n"); + printf(" xor <mask> # exclusive OR and mask\n"); + printf(" add <offset> # addition and offset\n"); + printf(" map <map> # explicit mapping of threads and chains to domains\n"); + printf("\n"); + printf("<map> has the form \"t1:c11,c12,...,c1m;t2:c21,...,c2m;...;tn:cn1,...,cnm\"\n"); + printf("where t[i] is the NUMA domain where the ith thread is run,\n"); + printf("and c[i][j] is the NUMA domain where the jth chain in the ith thread is allocated.\n"); + printf("(The values t[i] and c[i][j] must all be zero or small positive integers.)\n"); + printf("\n"); + printf("Note: for maps, each thread must have the same number of chains,\n"); + printf("maps override the -t or --threads specification,\n"); + printf("NUMA domains are whole numbers in the range of 0..N, and\n"); + printf("thread or chain domains that exceed the maximum NUMA domain\n"); + printf("are wrapped around using a MOD function.\n"); + printf("\n"); + printf("To determine the number of NUMA domains currently available\n"); + printf("on your system, use a command such as \"numastat\".\n"); + printf("\n"); + printf("Final note: strict is not yet fully implemented, and\n"); + printf("maps do not gracefully handle ill-formed map specifications.\n"); + + return 1; } - } - - - // if we've hit an error, print a message and quit - if (error) { - printf("usage: %s <options>\n", argv[0]); - printf("where <options> are selected from the following:\n"); - printf(" [-h|--help] # this message\n"); - printf(" [-l|--line] <number> # bytes per cache line (cache line size)\n"); - printf(" [-p|--page] <number> # bytes per page (page size)\n"); - printf(" [-c|--chain] <number> # bytes per chain (used to compute pages per chain)\n"); - printf(" [-r|--references] <number> # chains per thread (memory loading)\n"); - printf(" [-t|--threads] <number> # number of threads (concurrency and contention)\n"); - printf(" [-i|--iterations] <number> # iterations per experiment\n"); - printf(" [-e|--experiments] <number> # experiments\n"); - printf(" [-a|--access] <pattern> # memory access pattern\n"); - printf(" [-o|--output] <format> # output format\n"); - printf(" [-n|--numa] <placement> # numa placement\n"); - printf(" [-s|--seconds] <number> # run each experiment for <number> seconds\n"); - printf(" [-b|--busy] <number> # how much processing cycles each loop should count\n"); - printf(" [-f|--prefetch] # prefetch data\n"); - printf(" [-x|--strict] # fail rather than adjust options to sensible values\n"); - printf("\n"); - printf("<pattern> is selected from the following:\n"); - printf(" random # all chains are accessed randomly\n"); - printf(" forward <stride> # chains are in forward order with constant stride\n"); - printf(" reverse <stride> # chains are in reverse order with constant stride\n"); - printf(" stream <stride> # references are calculated rather than read from memory\n"); - printf("\n"); - printf("Note: <stride> is always a small positive integer.\n"); - printf("\n"); - printf("<format> is selected from the following:\n"); - printf(" hdr # csv header only\n"); - printf(" csv # results in csv format only\n"); - printf(" both # header and results in csv format\n"); - printf(" table # human-readable table of values\n"); - printf("\n"); - printf("<placement> is selected from the following:\n"); - printf(" local # all chains are allocated locally\n"); - printf(" xor <mask> # exclusive OR and mask\n"); - printf(" add <offset> # addition and offset\n"); - printf(" map <map> # explicit mapping of threads and chains to domains\n"); - printf("\n"); - printf("<map> has the form \"t1:c11,c12,...,c1m;t2:c21,...,c2m;...;tn:cn1,...,cnm\"\n"); - printf("where t[i] is the NUMA domain where the ith thread is run,\n"); - printf("and c[i][j] is the NUMA domain where the jth chain in the ith thread is allocated.\n"); - printf("(The values t[i] and c[i][j] must all be zero or small positive integers.)\n"); - printf("\n"); - printf("Note: for maps, each thread must have the same number of chains,\n"); - printf("maps override the -t or --threads specification,\n"); - printf("NUMA domains are whole numbers in the range of 0..N, and\n"); - printf("thread or chain domains that exceed the maximum NUMA domain\n"); - printf("are wrapped around using a MOD function.\n"); - printf("\n"); - printf("To determine the number of NUMA domains currently available\n"); - printf("on your system, use a command such as \"numastat\".\n"); - printf("\n"); - printf("Final note: strict is not yet fully implemented, and\n"); - printf("maps do not gracefully handle ill-formed map specifications.\n"); - - return 1; - } - - - // STRICT -- fail if specifications are inconsistent - - // compute lines per page and lines per chain - // based on input and defaults. - // we round up page and chain sizes when needed. + + + // STRICT -- fail if specifications are inconsistent + + // compute lines per page and lines per chain + // based on input and defaults. + // we round up page and chain sizes when needed. this->lines_per_page = (this->bytes_per_page+this->bytes_per_line-1) / this->bytes_per_line; this->bytes_per_page = this->bytes_per_line * this->lines_per_page; this->pages_per_chain = (this->bytes_per_chain+this->bytes_per_page-1) / this->bytes_per_page; @@ -294,314 +395,307 @@ Experiment::parse_args(int argc, char* argv[]) this->links_per_chain = this->lines_per_chain * this->links_per_line; - // allocate the chain roots for all threads - // and compute the chain locations - // (the chains themselves are initialized by the threads) - switch (this->numa_placement) { - case LOCAL : - case XOR : - case ADD : - this->thread_domain = new int32 [ this->num_threads ]; - this->chain_domain = new int32*[ this->num_threads ]; - this->random_state = new char* [ this->num_threads ]; - - for (int i=0; i < this->num_threads; i++) { - this->chain_domain[i] = new int32 [ this->chains_per_thread ]; - - const int state_size = 256; - this->random_state[i] = new char[state_size]; - initstate((unsigned int) i, (char *) this->random_state[i], (size_t) state_size); + // allocate the chain roots for all threads + // and compute the chain locations + // (the chains themselves are initialized by the threads) + switch (this->numa_placement) { + case LOCAL: + case XOR: + case ADD: + this->thread_domain = new int32[this->num_threads]; + this->chain_domain = new int32*[this->num_threads]; + this->random_state = new char*[this->num_threads]; + + for (int i = 0; i < this->num_threads; i++) { + this->chain_domain[i] = new int32[this->chains_per_thread]; + + const int state_size = 256; + this->random_state[i] = new char[state_size]; + initstate((unsigned int) i, (char *) this->random_state[i], + (size_t) state_size); + } + break; } - break; - } - #if defined(NUMA) - this->numa_max_domain = numa_max_node(); - this->num_numa_domains = this->numa_max_domain + 1; + this->numa_max_domain = numa_max_node(); + this->num_numa_domains = this->numa_max_domain + 1; #endif + switch (this->numa_placement) { + case LOCAL: + default: + this->alloc_local(); + break; + case XOR: + this->alloc_xor(); + break; + case ADD: + this->alloc_add(); + break; + case MAP: + this->alloc_map(); + break; + } - switch (this->numa_placement) { - case LOCAL : - default: - this->alloc_local(); - break; - case XOR : - this->alloc_xor(); - break; - case ADD : - this->alloc_add(); - break; - case MAP : - this->alloc_map(); - break; - } - - return 0; + return 0; } - -int64 -Experiment::parse_number( const char* s ) -{ - int64 result = 0; - - int len = strlen( s ); - for (int i=0; i < len; i++) { - if ( '0' <= s[i] && s[i] <= '9' ) { - result = result * 10 + s[i] - '0'; - } else if (s[i] == 'k' || s[i] == 'K') { - result = result << 10; - break; - } else if (s[i] == 'm' || s[i] == 'M') { - result = result << 20; - break; - } else if (s[i] == 'g' || s[i] == 'G') { - result = result << 30; - break; - } else if (s[i] == 't' || s[i] == 'T') { - result = result << 40; - break; - } else { - break; +int64 Experiment::parse_number(const char* s) { + int64 result = 0; + + int len = strlen(s); + for (int i = 0; i < len; i++) { + if ('0' <= s[i] && s[i] <= '9') { + result = result * 10 + s[i] - '0'; + } else if (s[i] == 'k' || s[i] == 'K') { + result = result << 10; + break; + } else if (s[i] == 'm' || s[i] == 'M') { + result = result << 20; + break; + } else if (s[i] == 'g' || s[i] == 'G') { + result = result << 30; + break; + } else if (s[i] == 't' || s[i] == 'T') { + result = result << 40; + break; + } else { + break; + } } - } - return result; + return result; } - -float -Experiment::parse_real( const char* s ) -{ - float result = 0; - bool decimal = false; - float power = 1; - - int len = strlen( s ); - for (int i=0; i < len; i++) { - if ( '0' <= s[i] && s[i] <= '9' ) { - if (! decimal) { - result = result * 10 + s[i] - '0'; - } else { - power = power / 10; - result = result + (s[i] - '0') * power; - } - } else if ( '.' == s[i] ) { - decimal = true; - } else { - break; +float Experiment::parse_real(const char* s) { + float result = 0; + bool decimal = false; + float power = 1; + + int len = strlen(s); + for (int i = 0; i < len; i++) { + if ('0' <= s[i] && s[i] <= '9') { + if (!decimal) { + result = result * 10 + s[i] - '0'; + } else { + power = power / 10; + result = result + (s[i] - '0') * power; + } + } else if ('.' == s[i]) { + decimal = true; + } else { + break; + } } - } - return result; + return result; } -void -Experiment::alloc_local() -{ - for (int i=0; i < this->num_threads; i++) { - this->thread_domain[i] = i % this->num_numa_domains; - for (int j=0; j < this->chains_per_thread; j++) { - this->chain_domain[i][j] = this->thread_domain[i]; +void Experiment::alloc_local() { + for (int i = 0; i < this->num_threads; i++) { + this->thread_domain[i] = i % this->num_numa_domains; + for (int j = 0; j < this->chains_per_thread; j++) { + this->chain_domain[i][j] = this->thread_domain[i]; + } } - } } -void -Experiment::alloc_xor() -{ - for (int i=0; i < this->num_threads; i++) { - this->thread_domain[i] = i % this->num_numa_domains; - for (int j=0; j < this->chains_per_thread; j++) { - this->chain_domain[i][j] = (this->thread_domain[i] ^ this->offset_or_mask) % this->num_numa_domains; +void Experiment::alloc_xor() { + for (int i = 0; i < this->num_threads; i++) { + this->thread_domain[i] = i % this->num_numa_domains; + for (int j = 0; j < this->chains_per_thread; j++) { + this->chain_domain[i][j] = (this->thread_domain[i] + ^ this->offset_or_mask) % this->num_numa_domains; + } } - } } -void -Experiment::alloc_add() -{ - for (int i=0; i < this->num_threads; i++) { - this->thread_domain[i] = i % this->num_numa_domains; - for (int j=0; j < this->chains_per_thread; j++) { - this->chain_domain[i][j] = (this->thread_domain[i] + this->offset_or_mask) % this->num_numa_domains; +void Experiment::alloc_add() { + for (int i = 0; i < this->num_threads; i++) { + this->thread_domain[i] = i % this->num_numa_domains; + for (int j = 0; j < this->chains_per_thread; j++) { + this->chain_domain[i][j] = (this->thread_domain[i] + + this->offset_or_mask) % this->num_numa_domains; + } } - } } - // DOES NOT HANDLE ILL-FORMED SPECIFICATIONS -void -Experiment::alloc_map() -{ - // STRICT -- fail if specifications are inconsistent - - // maps look like "t1:c11,c12,...,c1m;t2:c21,...,c2m;...;tn:cn1,...,cnm" - // where t[i] is the thread domain of the ith thread, - // and c[i][j] is the chain domain of the jth chain in the ith thread - - // count the thread descriptors by counting ";" up to EOS - int threads = 1; - char *p = this->placement_map; - while (*p != '\0') { - if (*p == ';') threads += 1; - p++; - } - int thread_domain[ threads ]; - - // count the chain descriptors by counting "," up to ";" or EOS - int chains = 1; - p = this->placement_map; - while (*p != '\0') { - if (*p == ';') break; - if (*p == ',') chains += 1; - p++; - } - int chain_domain [ threads ][ chains ]; - - int t=0, c=0; - p = this->placement_map; - while (*p != '\0') { - // everything up to ":" is the thread domain - int i = 0; - char buf[64]; +// DOES NOT HANDLE ILL-FORMED SPECIFICATIONS +void Experiment::alloc_map() { + // STRICT -- fail if specifications are inconsistent + + // maps look like "t1:c11,c12,...,c1m;t2:c21,...,c2m;...;tn:cn1,...,cnm" + // where t[i] is the thread domain of the ith thread, + // and c[i][j] is the chain domain of the jth chain in the ith thread + + // count the thread descriptors by counting ";" up to EOS + int threads = 1; + char *p = this->placement_map; while (*p != '\0') { - if (*p == ':') { p++; break; } - buf[i] = *p; - i++; - p++; + if (*p == ';') + threads += 1; + p++; } - buf[i] = '\0'; - thread_domain[t] = Experiment::parse_number(buf); + int thread_domain[threads]; - // search for one or several ',' - c = 0; - while (*p != '\0' && *p != ';') { - if (chains <= c || threads <= t) { - // error in the thread/chain specification - fprintf(stderr, "Malformed map.\n"); - exit(1); - } - int i = 0; - while (*p != '\0' && *p != ';') { - if (*p == ',') { p++; break; } - buf[i] = *p; - i++; + // count the chain descriptors by counting "," up to ";" or EOS + int chains = 1; + p = this->placement_map; + while (*p != '\0') { + if (*p == ';') + break; + if (*p == ',') + chains += 1; p++; - } - buf[i] = '\0'; - chain_domain[t][c] = Experiment::parse_number(buf); - c++; } + int chain_domain[threads][chains]; - if (*p == '\0') break; - if (*p == ';') p++; - t++; - } - + int t = 0, c = 0; + p = this->placement_map; + while (*p != '\0') { + // everything up to ":" is the thread domain + int i = 0; + char buf[64]; + while (*p != '\0') { + if (*p == ':') { + p++; + break; + } + buf[i] = *p; + i++; + p++; + } + buf[i] = '\0'; + thread_domain[t] = Experiment::parse_number(buf); + + // search for one or several ',' + c = 0; + while (*p != '\0' && *p != ';') { + if (chains <= c || threads <= t) { + // error in the thread/chain specification + fprintf(stderr, "Malformed map.\n"); + exit(1); + } + int i = 0; + while (*p != '\0' && *p != ';') { + if (*p == ',') { + p++; + break; + } + buf[i] = *p; + i++; + p++; + } + buf[i] = '\0'; + chain_domain[t][c] = Experiment::parse_number(buf); + c++; + } + + if (*p == '\0') + break; + if (*p == ';') + p++; + t++; + } - this->num_threads = threads; - this->chains_per_thread = chains; + this->num_threads = threads; + this->chains_per_thread = chains; - this->thread_domain = new int32 [ this->num_threads ]; - this->chain_domain = new int32*[ this->num_threads ]; - this->random_state = new char* [ this->num_threads ]; + this->thread_domain = new int32[this->num_threads]; + this->chain_domain = new int32*[this->num_threads]; + this->random_state = new char*[this->num_threads]; - for (int i=0; i < this->num_threads; i++) { - this->thread_domain[i] = thread_domain[i] % this->num_numa_domains; + for (int i = 0; i < this->num_threads; i++) { + this->thread_domain[i] = thread_domain[i] % this->num_numa_domains; - const int state_size = 256; - this->random_state[i] = new char[state_size]; - initstate((unsigned int) i, (char *) this->random_state[i], (size_t) state_size); + const int state_size = 256; + this->random_state[i] = new char[state_size]; + initstate((unsigned int) i, (char *) this->random_state[i], + (size_t) state_size); - this->chain_domain[i] = new int32 [ this->chains_per_thread ]; - for (int j=0; j < this->chains_per_thread; j++) { - this->chain_domain[i][j] = chain_domain[i][j] % this->num_numa_domains; + this->chain_domain[i] = new int32[this->chains_per_thread]; + for (int j = 0; j < this->chains_per_thread; j++) { + this->chain_domain[i][j] = chain_domain[i][j] + % this->num_numa_domains; + } } - } - this->bytes_per_thread = this->bytes_per_chain * this->chains_per_thread; - this->bytes_per_test = this->bytes_per_thread * this->num_threads; + this->bytes_per_thread = this->bytes_per_chain * this->chains_per_thread; + this->bytes_per_test = this->bytes_per_thread * this->num_threads; } #include "Chain.h" -void -Experiment::print() -{ - printf("strict = %d\n", strict); - printf("pointer_size = %d\n", pointer_size); - printf("sizeof(Chain) = %d\n", sizeof(Chain)); - printf("sizeof(Chain *) = %d\n", sizeof(Chain *)); - printf("bytes_per_line = %d\n", bytes_per_line); - printf("links_per_line = %d\n", links_per_line); - printf("bytes_per_page = %d\n", bytes_per_page); - printf("lines_per_page = %d\n", lines_per_page); - printf("links_per_page = %d\n", links_per_page); - printf("bytes_per_chain = %d\n", bytes_per_chain); - printf("lines_per_chain = %d\n", lines_per_chain); - printf("links_per_chain = %d\n", links_per_chain); - printf("pages_per_chain = %d\n", pages_per_chain); - printf("chains_per_thread = %d\n", chains_per_thread); - printf("bytes_per_thread = %d\n", bytes_per_thread); - printf("num_threads = %d\n", num_threads); - printf("bytes_per_test = %d\n", bytes_per_test); - printf("busy cycles = %d\n", busy_cycles); - printf("prefetch = %d\n", prefetch); - printf("iterations = %d\n", iterations); - printf("experiments = %d\n", experiments); - printf("access_pattern = %d\n", access_pattern); - printf("stride = %d\n", stride); - printf("output_mode = %d\n", output_mode); - printf("numa_placement = %d\n", numa_placement); - printf("offset_or_mask = %d\n", offset_or_mask); - printf("numa_max_domain = %d\n", numa_max_domain); - printf("num_numa_domains = %d\n", num_numa_domains); - - for (int i=0; i < this->num_threads; i++) { - printf("%d: ", this->thread_domain[i]); - for (int j=0; j < this->chains_per_thread; j++) { - printf("%d,", this->chain_domain[i][j]); +void Experiment::print() { + printf("strict = %d\n", strict); + printf("pointer_size = %d\n", pointer_size); + printf("sizeof(Chain) = %d\n", sizeof(Chain)); + printf("sizeof(Chain *) = %d\n", sizeof(Chain *)); + printf("bytes_per_line = %d\n", bytes_per_line); + printf("links_per_line = %d\n", links_per_line); + printf("bytes_per_page = %d\n", bytes_per_page); + printf("lines_per_page = %d\n", lines_per_page); + printf("links_per_page = %d\n", links_per_page); + printf("bytes_per_chain = %d\n", bytes_per_chain); + printf("lines_per_chain = %d\n", lines_per_chain); + printf("links_per_chain = %d\n", links_per_chain); + printf("pages_per_chain = %d\n", pages_per_chain); + printf("chains_per_thread = %d\n", chains_per_thread); + printf("bytes_per_thread = %d\n", bytes_per_thread); + printf("num_threads = %d\n", num_threads); + printf("bytes_per_test = %d\n", bytes_per_test); + printf("busy cycles = %d\n", busy_cycles); + printf("prefetch = %d\n", prefetch); + printf("iterations = %d\n", iterations); + printf("experiments = %d\n", experiments); + printf("access_pattern = %d\n", access_pattern); + printf("stride = %d\n", stride); + printf("output_mode = %d\n", output_mode); + printf("numa_placement = %d\n", numa_placement); + printf("offset_or_mask = %d\n", offset_or_mask); + printf("numa_max_domain = %d\n", numa_max_domain); + printf("num_numa_domains = %d\n", num_numa_domains); + + for (int i = 0; i < this->num_threads; i++) { + printf("%d: ", this->thread_domain[i]); + for (int j = 0; j < this->chains_per_thread; j++) { + printf("%d,", this->chain_domain[i][j]); + } + printf("\n"); } - printf("\n"); - } - fflush(stdout); + fflush(stdout); } -const char* -Experiment::access() -{ - const char* result = NULL; - - if (this->access_pattern == RANDOM) { - result = "random"; - } else if (this->access_pattern == STRIDED && 0 < this->stride) { - result = "forward"; - } else if (this->access_pattern == STRIDED && this->stride < 0) { - result = "reverse"; - } else if (this->access_pattern == STREAM) { - result = "stream"; - } - - return result; +const char* Experiment::access() { + const char* result = NULL; + + if (this->access_pattern == RANDOM) { + result = "random"; + } else if (this->access_pattern == STRIDED && 0 < this->stride) { + result = "forward"; + } else if (this->access_pattern == STRIDED && this->stride < 0) { + result = "reverse"; + } else if (this->access_pattern == STREAM) { + result = "stream"; + } + + return result; } -const char* -Experiment::placement() -{ - const char* result = NULL; - - if (this->numa_placement == LOCAL) { - result = "local"; - } else if (this->numa_placement == XOR) { - result = "xor"; - } else if (this->numa_placement == ADD) { - result = "add"; - } else if (this->numa_placement == MAP) { - result = "map"; - } - - return result; +const char* Experiment::placement() { + const char* result = NULL; + + if (this->numa_placement == LOCAL) { + result = "local"; + } else if (this->numa_placement == XOR) { + result = "xor"; + } else if (this->numa_placement == ADD) { + result = "add"; + } else if (this->numa_placement == MAP) { + result = "map"; + } + + return result; } diff --git a/src/Experiment.h b/src/Experiment.h index 38756f0..0089c2f 100644 --- a/src/Experiment.h +++ b/src/Experiment.h @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #if !defined(Experiment_h) #define Experiment_h @@ -18,17 +17,17 @@ class Experiment { public: - Experiment(); - ~Experiment(); + Experiment(); + ~Experiment(); - int parse_args(int argc, char* argv[]); - int64 parse_number( const char* s ); - float parse_real( const char* s ); + int parse_args(int argc, char* argv[]); + int64 parse_number(const char* s); + float parse_real(const char* s); - const char* placement(); - const char* access(); + const char* placement(); + const char* access(); - // fundamental parameters + // fundamental parameters int64 pointer_size; // number of bytes in a pointer int64 bytes_per_line; // working set cache line size (bytes) int64 links_per_line; // working set cache line size (links) @@ -46,23 +45,23 @@ public: int64 busy_cycles; // processing cycles bool prefetch; // use of prefetching - float seconds; // number of seconds per experiment + float seconds; // number of seconds per experiment int64 iterations; // number of iterations per experiment int64 experiments; // number of experiments per test enum { CSV, BOTH, HEADER, TABLE } - output_mode; // results output mode + output_mode; // results output mode enum { RANDOM, STRIDED, STREAM } - access_pattern; // memory access pattern + access_pattern; // memory access pattern int64 stride; enum { LOCAL, XOR, ADD, MAP } - numa_placement; // memory allocation mode + numa_placement; // memory allocation mode int64 offset_or_mask; char* placement_map; - // maps threads and chains to numa domains + // maps threads and chains to numa domains int32* thread_domain; // thread_domain[thread] int32** chain_domain; // chain_domain[thread][chain] int32 numa_max_domain; // highest numa domain id @@ -70,7 +69,7 @@ public: char** random_state; // random state for each thread - int strict; // strictly adhere to user input, or fail + int strict; // strictly adhere to user input, or fail const static int32 DEFAULT_POINTER_SIZE = sizeof(Chain); const static int32 DEFAULT_BYTES_PER_LINE = 64; @@ -95,11 +94,11 @@ public: const static bool DEFAULT_PREFETCH = false; void alloc_local(); - void alloc_xor(); - void alloc_add(); - void alloc_map(); + void alloc_xor(); + void alloc_add(); + void alloc_map(); - void print(); + void print(); private: }; diff --git a/src/Lock.cpp b/src/Lock.cpp index 104dc81..517843d 100644 --- a/src/Lock.cpp +++ b/src/Lock.cpp @@ -9,37 +9,28 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #include <stdio.h> #include <pthread.h> #include "Lock.h" -Lock::Lock() -{ - pthread_mutex_init( &(this->mutex), NULL ); +Lock::Lock() { + pthread_mutex_init(&(this->mutex), NULL); } -Lock::~Lock() -{ - pthread_mutex_destroy( &(this->mutex) ); +Lock::~Lock() { + pthread_mutex_destroy(&(this->mutex)); } -void -Lock::lock() -{ - pthread_mutex_lock( &(this->mutex) ); +void Lock::lock() { + pthread_mutex_lock(&(this->mutex)); } -int -Lock::test() -{ - pthread_mutex_trylock( &(this->mutex) ); +int Lock::test() { + pthread_mutex_trylock(&(this->mutex)); } -void -Lock::unlock() -{ - pthread_mutex_unlock( &(this->mutex) ); -} +void Lock::unlock() { + pthread_mutex_unlock(&(this->mutex)); +} @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #if !defined(Lock_h) #define Lock_h @@ -17,14 +16,14 @@ class Lock { public: - Lock(); - ~Lock(); - void lock(); - int test(); - void unlock(); + Lock(); + ~Lock(); + void lock(); + int test(); + void unlock(); private: - pthread_mutex_t mutex; + pthread_mutex_t mutex; }; #endif @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #include <stdio.h> #include "Main.h" @@ -21,70 +20,75 @@ #include "Experiment.h" #include "SpinBarrier.h" - // This program allocates and accesses - // a number of blocks of memory, one or more - // for each thread that executes. Blocks - // are divided into sub-blocks called - // pages, and pages are divided into - // sub-blocks called cache lines. - // - // All pages are collected into a list. - // Pages are selected for the list in - // a particular order. Each cache line - // within the page is similarly gathered - // into a list in a particular order. - // In both cases the order may be random - // or linear. - // - // A root pointer points to the first - // cache line. A pointer in the cache - // line points to the next cache line, - // which contains a pointer to the cache - // line after that, and so on. This - // forms a pointer chain that touches all - // cache lines within the first page, - // then all cache lines within the second - // page, and so on until all pages are - // covered. The last pointer contains - // NULL, terminating the chain. - // - // Depending on compile-time options, - // pointers may be 32-bit or 64-bit - // pointers. +// This program allocates and accesses +// a number of blocks of memory, one or more +// for each thread that executes. Blocks +// are divided into sub-blocks called +// pages, and pages are divided into +// sub-blocks called cache lines. +// +// All pages are collected into a list. +// Pages are selected for the list in +// a particular order. Each cache line +// within the page is similarly gathered +// into a list in a particular order. +// In both cases the order may be random +// or linear. +// +// A root pointer points to the first +// cache line. A pointer in the cache +// line points to the next cache line, +// which contains a pointer to the cache +// line after that, and so on. This +// forms a pointer chain that touches all +// cache lines within the first page, +// then all cache lines within the second +// page, and so on until all pages are +// covered. The last pointer contains +// NULL, terminating the chain. +// +// Depending on compile-time options, +// pointers may be 32-bit or 64-bit +// pointers. int verbose = 0; -int -main( int argc, char* argv[] ) -{ - Timer::calibrate(10000); - double clk_res = Timer::resolution(); +int main(int argc, char* argv[]) { + Timer + ::calibrate(10000); + double clk_res = Timer + ::resolution(); - Experiment e; - if (e.parse_args(argc, argv)) { - return 0; - } + Experiment e; + if (e.parse_args(argc, argv)) { + return 0; + } #if defined(UNDEFINED) - e.print(); - if (argv != NULL) return 0; + e.print(); + if (argv != NULL) return 0; #endif - SpinBarrier sb( e.num_threads ); - Run r[ e.num_threads ]; - for (int i=0; i < e.num_threads; i++) { - r[i].set( e, &sb ); - r[i].start(); - } + SpinBarrier + sb(e.num_threads); + Run r[e.num_threads]; + for (int i = 0; i < e.num_threads; i++) { + r[i].set(e, &sb); + r[i].start(); + } - for (int i=0; i < e.num_threads; i++) { - r[i].wait(); - } + for (int i = 0; i < e.num_threads; i++) { + r[i].wait(); + } - int64 ops = Run::ops_per_chain(); - double secs = Run::seconds(); + int64 + ops = Run + ::ops_per_chain(); + double secs = Run + ::seconds(); - Output::print(e, ops, secs, clk_res); + Output + ::print(e, ops, secs, clk_res); - return 0; + return 0; } diff --git a/src/Main.cpp b/src/Main.cpp index ebd276a..a4d68e4 100644 --- a/src/Main.cpp +++ b/src/Main.cpp @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #include <stdio.h> #include "Main.h" @@ -20,70 +19,68 @@ #include "Output.h" #include "Experiment.h" - // This program allocates and accesses - // a number of blocks of memory, one or more - // for each thread that executes. Blocks - // are divided into sub-blocks called - // pages, and pages are divided into - // sub-blocks called cache lines. - // - // All pages are collected into a list. - // Pages are selected for the list in - // a particular order. Each cache line - // within the page is similarly gathered - // into a list in a particular order. - // In both cases the order may be random - // or linear. - // - // A root pointer points to the first - // cache line. A pointer in the cache - // line points to the next cache line, - // which contains a pointer to the cache - // line after that, and so on. This - // forms a pointer chain that touches all - // cache lines within the first page, - // then all cache lines within the second - // page, and so on until all pages are - // covered. The last pointer contains - // NULL, terminating the chain. - // - // Depending on compile-time options, - // pointers may be 32-bit or 64-bit - // pointers. +// This program allocates and accesses +// a number of blocks of memory, one or more +// for each thread that executes. Blocks +// are divided into sub-blocks called +// pages, and pages are divided into +// sub-blocks called cache lines. +// +// All pages are collected into a list. +// Pages are selected for the list in +// a particular order. Each cache line +// within the page is similarly gathered +// into a list in a particular order. +// In both cases the order may be random +// or linear. +// +// A root pointer points to the first +// cache line. A pointer in the cache +// line points to the next cache line, +// which contains a pointer to the cache +// line after that, and so on. This +// forms a pointer chain that touches all +// cache lines within the first page, +// then all cache lines within the second +// page, and so on until all pages are +// covered. The last pointer contains +// NULL, terminating the chain. +// +// Depending on compile-time options, +// pointers may be 32-bit or 64-bit +// pointers. int verbose = 0; -int -main( int argc, char* argv[] ) -{ - Timer::calibrate(10000); - double clk_res = Timer::resolution(); +int main(int argc, char* argv[]) { + Timer::calibrate(10000); + double clk_res = Timer::resolution(); - Experiment e; - if (e.parse_args(argc, argv)) { - return 0; - } + Experiment e; + if (e.parse_args(argc, argv)) { + return 0; + } #if defined(UNDEFINED) - e.print(); - if (argv != NULL) return 0; + e.print(); + if (argv != NULL) return 0; #endif - SpinBarrier sb( e.num_threads ); - Run r[ e.num_threads ]; - for (int i=0; i < e.num_threads; i++) { - r[i].set( e, &sb ); - r[i].start(); - } + SpinBarrier sb(e.num_threads); + Run r[e.num_threads]; + for (int i = 0; i < e.num_threads; i++) { + r[i].set(e, &sb); + r[i].start(); + } - for (int i=0; i < e.num_threads; i++) { - r[i].wait(); - } + for (int i = 0; i < e.num_threads; i++) { + r[i].wait(); + } - int64 ops = Run::ops_per_chain(); - double secs = Run::seconds(); + int64 ops = Run::ops_per_chain(); + double secs = Run::seconds(); - Output::print(e, ops, secs, clk_res); + Output::print(e, ops, secs, clk_res); - return 0; + return 0; } diff --git a/src/Output.cpp b/src/Output.cpp index 9f9c09a..84eb0df 100644 --- a/src/Output.cpp +++ b/src/Output.cpp @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -19,25 +18,20 @@ #include "Types.h" #include "Experiment.h" - -void -Output::print( Experiment &e, int64 ops, double secs, double ck_res ) -{ - if (e.output_mode == Experiment::CSV) { - Output::csv(e, ops, secs, ck_res); - } else if (e.output_mode == Experiment::BOTH) { - Output::header(e, ops, secs, ck_res); - Output::csv(e, ops, secs, ck_res); - } else if (e.output_mode == Experiment::HEADER) { - Output::header(e, ops, secs, ck_res); - } else { - Output::table(e, ops, secs, ck_res); - } +void Output::print(Experiment &e, int64 ops, double secs, double ck_res) { + if (e.output_mode == Experiment::CSV) { + Output::csv(e, ops, secs, ck_res); + } else if (e.output_mode == Experiment::BOTH) { + Output::header(e, ops, secs, ck_res); + Output::csv(e, ops, secs, ck_res); + } else if (e.output_mode == Experiment::HEADER) { + Output::header(e, ops, secs, ck_res); + } else { + Output::table(e, ops, secs, ck_res); + } } -void -Output::header( Experiment &e, int64 ops, double secs, double ck_res ) -{ +void Output::header(Experiment &e, int64 ops, double secs, double ck_res) { printf("pointer size (bytes),"); printf("cache line size (bytes),"); printf("page size (bytes),"); @@ -65,9 +59,7 @@ Output::header( Experiment &e, int64 ops, double secs, double ck_res ) fflush(stdout); } -void -Output::csv( Experiment &e, int64 ops, double secs, double ck_res ) -{ +void Output::csv(Experiment &e, int64 ops, double secs, double ck_res) { printf("%ld,", e.pointer_size); printf("%ld,", e.bytes_per_line); printf("%ld,", e.bytes_per_page); @@ -86,16 +78,16 @@ Output::csv( Experiment &e, int64 ops, double secs, double ck_res ) printf("\""); printf("%d:", e.thread_domain[0]); printf("%d", e.chain_domain[0][0]); - for (int j=1; j < e.chains_per_thread; j++) { - printf(",%d", e.chain_domain[0][j]); - } - for (int i=1; i < e.num_threads; i++) { - printf(";%d:", e.thread_domain[i]); - printf("%d", e.chain_domain[i][0]); - for (int j=1; j < e.chains_per_thread; j++) { - printf(",%d", e.chain_domain[i][j]); + for (int j = 1; j < e.chains_per_thread; j++) { + printf(",%d", e.chain_domain[0][j]); + } + for (int i = 1; i < e.num_threads; i++) { + printf(";%d:", e.thread_domain[i]); + printf("%d", e.chain_domain[i][0]); + for (int j = 1; j < e.chains_per_thread; j++) { + printf(",%d", e.chain_domain[i][j]); + } } - } printf("\","); printf("%ld,", ops); printf("%ld,", ops * e.chains_per_thread * e.num_threads); @@ -108,9 +100,7 @@ Output::csv( Experiment &e, int64 ops, double secs, double ck_res ) fflush(stdout); } -void -Output::table( Experiment &e, int64 ops, double secs, double ck_res ) -{ +void Output::table(Experiment &e, int64 ops, double secs, double ck_res) { printf("pointer size = %ld (bytes)\n", e.pointer_size); printf("cache line size = %ld (bytes)\n", e.bytes_per_line); printf("page size = %ld (bytes)\n", e.bytes_per_page); @@ -130,16 +120,16 @@ Output::table( Experiment &e, int64 ops, double secs, double ck_res ) printf("\""); printf("%d:", e.thread_domain[0]); printf("%d", e.chain_domain[0][0]); - for (int j=1; j < e.chains_per_thread; j++) { - printf(",%d", e.chain_domain[0][j]); - } - for (int i=1; i < e.num_threads; i++) { - printf(";%d:", e.thread_domain[i]); - printf("%d", e.chain_domain[i][0]); - for (int j=1; j < e.chains_per_thread; j++) { - printf(",%d", e.chain_domain[i][j]); + for (int j = 1; j < e.chains_per_thread; j++) { + printf(",%d", e.chain_domain[0][j]); + } + for (int i = 1; i < e.num_threads; i++) { + printf(";%d:", e.thread_domain[i]); + printf("%d", e.chain_domain[i][0]); + for (int j = 1; j < e.chains_per_thread; j++) { + printf(",%d", e.chain_domain[i][j]); + } } - } printf("\"\n"); printf("operations per chain = %ld\n", ops); printf("total operations = %ld\n", ops * e.chains_per_thread * e.num_threads); diff --git a/src/Output.h b/src/Output.h index 9ee2c80..65d3926 100644 --- a/src/Output.h +++ b/src/Output.h @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #if !defined(Output_h) #define Output_h @@ -18,10 +17,10 @@ class Output { public: - static void print ( Experiment &e, int64 ops, double secs, double ck_res ); - static void header( Experiment &e, int64 ops, double secs, double ck_res ); - static void csv ( Experiment &e, int64 ops, double secs, double ck_res ); - static void table ( Experiment &e, int64 ops, double secs, double ck_res ); + static void print(Experiment &e, int64 ops, double secs, double ck_res); + static void header(Experiment &e, int64 ops, double secs, double ck_res); + static void csv(Experiment &e, int64 ops, double secs, double ck_res); + static void table(Experiment &e, int64 ops, double secs, double ck_res); private: }; diff --git a/src/Run.cpp b/src/Run.cpp index c774b99..24435b4 100644 --- a/src/Run.cpp +++ b/src/Run.cpp @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #include <stdio.h> #include <stdlib.h> #include <unistd.h> @@ -24,1253 +23,1255 @@ #include "Timer.h" #include "SpinBarrier.h" +static double max(double v1, double v2); +static double min(double v1, double v2); +static void chase_pointers(int64 chains_per_thread, int64 iterations, + Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, + int64 busy_cycles, bool prefetch); +static void follow_streams(int64 chains_per_thread, int64 iterations, + Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, + int64 busy_cycles, bool prefetch); +static void (*run_benchmark)(int64 chains_per_thread, int64 iterations, + Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, + int64 busy_cycles, bool prefetch) = chase_pointers; -static double max( double v1, double v2 ); -static double min( double v1, double v2 ); -static void chase_pointers(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles, bool prefetch); -static void follow_streams(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles, bool prefetch); -static void (*run_benchmark)(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles, bool prefetch) = chase_pointers; - -Lock Run::global_mutex; -int64 Run::_ops_per_chain = 0; -double Run::_seconds = 1E9; +Lock Run::global_mutex; +int64 Run::_ops_per_chain = 0; +double Run::_seconds = 1E9; #define prefetch(x) __builtin_prefetch(x) -Run::Run() -: exp(NULL), bp(NULL) -{ +Run::Run() : + exp(NULL), bp(NULL) { } -Run::~Run() -{ +Run::~Run() { } -void -Run::set( Experiment &e, SpinBarrier* sbp ) -{ - this->exp = &e; - this->bp = sbp; +void Run::set(Experiment &e, SpinBarrier* sbp) { + this->exp = &e; + this->bp = sbp; } -int -Run::run() -{ - // first allocate all memory for the chains, - // making sure it is allocated within the - // intended numa domains - Chain** chain_memory = new Chain* [ this->exp->chains_per_thread ]; - Chain** root = new Chain* [ this->exp->chains_per_thread ]; +int Run::run() { + // first allocate all memory for the chains, + // making sure it is allocated within the + // intended numa domains + Chain** chain_memory = new Chain*[this->exp->chains_per_thread]; + Chain** root = new Chain*[this->exp->chains_per_thread]; #if defined(NUMA) - // establish the node id where this thread - // will run. threads are mapped to nodes - // by the set-up code for Experiment. - int run_node_id = this->exp->thread_domain[this->thread_id()]; - numa_run_on_node(run_node_id); + // establish the node id where this thread + // will run. threads are mapped to nodes + // by the set-up code for Experiment. + int run_node_id = this->exp->thread_domain[this->thread_id()]; + numa_run_on_node(run_node_id); - // establish the node id where this thread's - // memory will be allocated. - for (int i=0; i < this->exp->chains_per_thread; i++) { - int alloc_node_id = this->exp->chain_domain[this->thread_id()][i]; - nodemask_t alloc_mask; - nodemask_zero(&alloc_mask); - nodemask_set(&alloc_mask, alloc_node_id); - numa_set_membind(&alloc_mask); + // establish the node id where this thread's + // memory will be allocated. + for (int i=0; i < this->exp->chains_per_thread; i++) { + int alloc_node_id = this->exp->chain_domain[this->thread_id()][i]; + nodemask_t alloc_mask; + nodemask_zero(&alloc_mask); + nodemask_set(&alloc_mask, alloc_node_id); + numa_set_membind(&alloc_mask); - chain_memory[i] = new Chain[ this->exp->links_per_chain ]; - } + chain_memory[i] = new Chain[ this->exp->links_per_chain ]; + } #else - for (int i=0; i < this->exp->chains_per_thread; i++) { - chain_memory[i] = new Chain[ this->exp->links_per_chain ]; - } + for (int i = 0; i < this->exp->chains_per_thread; i++) { + chain_memory[i] = new Chain[this->exp->links_per_chain]; + } #endif - // initialize the chains and - // select the function that - // will execute the tests - for (int i=0; i < this->exp->chains_per_thread; i++) { - if (this->exp->access_pattern == Experiment::RANDOM) { - root[i] = random_mem_init( chain_memory[i] ); - run_benchmark = chase_pointers; - } else if (this->exp->access_pattern == Experiment::STRIDED) { - if (0 < this->exp->stride) { - root[i] = forward_mem_init( chain_memory[i] ); - } else { - root[i] = reverse_mem_init( chain_memory[i] ); - } - run_benchmark = chase_pointers; - } else if (this->exp->access_pattern == Experiment::STREAM) { - root[i] = stream_mem_init( chain_memory[i] ); - run_benchmark = follow_streams; + // initialize the chains and + // select the function that + // will execute the tests + for (int i = 0; i < this->exp->chains_per_thread; i++) { + if (this->exp->access_pattern == Experiment::RANDOM) { + root[i] = random_mem_init(chain_memory[i]); + run_benchmark = chase_pointers; + } else if (this->exp->access_pattern == Experiment::STRIDED) { + if (0 < this->exp->stride) { + root[i] = forward_mem_init(chain_memory[i]); + } else { + root[i] = reverse_mem_init(chain_memory[i]); + } + run_benchmark = chase_pointers; + } else if (this->exp->access_pattern == Experiment::STREAM) { + root[i] = stream_mem_init(chain_memory[i]); + run_benchmark = follow_streams; + } } - } - if (this->exp->iterations <= 0) { - volatile static double istart = 0; - volatile static double istop = 0; - volatile static double elapsed = 0; - volatile static int64 iters = 1; - volatile double bound = max(0.2, 10 * Timer::resolution()); - for (iters=1; elapsed <= bound; iters=iters<<1) { - this->bp->barrier(); + if (this->exp->iterations <= 0) { + volatile static double istart = 0; + volatile static double istop = 0; + volatile static double elapsed = 0; + volatile static int64 iters = 1; + volatile double bound = max(0.2, 10 * Timer::resolution()); + for (iters = 1; elapsed <= bound; iters = iters << 1) { + this->bp->barrier(); - // start timer - if (this->thread_id() == 0) { - istart = Timer::seconds(); - } - this->bp->barrier(); + // start timer + if (this->thread_id() == 0) { + istart = Timer::seconds(); + } + this->bp->barrier(); - // chase pointers - run_benchmark(this->exp->chains_per_thread, iters, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, this->exp->busy_cycles, this->exp->prefetch); + // chase pointers + run_benchmark(this->exp->chains_per_thread, iters, root, + this->exp->bytes_per_line, this->exp->bytes_per_chain, + this->exp->stride, this->exp->busy_cycles, + this->exp->prefetch); - // barrier - this->bp->barrier(); + // barrier + this->bp->barrier(); - // stop timer - if (this->thread_id() == 0) { - istop = Timer::seconds(); - elapsed = istop - istart; - } - this->bp->barrier(); - } + // stop timer + if (this->thread_id() == 0) { + istop = Timer::seconds(); + elapsed = istop - istart; + } + this->bp->barrier(); + } - // calculate the number of iterations - if (this->thread_id() == 0) { - if (0 < this->exp->seconds) { - this->exp->iterations = max(1, 0.9999 + 0.5 * this->exp->seconds * iters / elapsed); - } else { - this->exp->iterations = max(1, 0.9999 + iters / elapsed); - } + // calculate the number of iterations + if (this->thread_id() == 0) { + if (0 < this->exp->seconds) { + this->exp->iterations = max(1, + 0.9999 + 0.5 * this->exp->seconds * iters / elapsed); + } else { + this->exp->iterations = max(1, 0.9999 + iters / elapsed); + } + } + this->bp->barrier(); } - this->bp->barrier(); - } #if defined(UNDEFINED) #endif - // barrier - for (int e=0; e < this->exp->experiments; e++) { - this->bp->barrier(); + // barrier + for (int e = 0; e < this->exp->experiments; e++) { + this->bp->barrier(); - // start timer - double start = 0; - if (this->thread_id() == 0) start = Timer::seconds(); - this->bp->barrier(); + // start timer + double start = 0; + if (this->thread_id() == 0) + start = Timer::seconds(); + this->bp->barrier(); - // chase pointers - run_benchmark(this->exp->chains_per_thread, this->exp->iterations, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, this->exp->busy_cycles, this->exp->prefetch); + // chase pointers + run_benchmark(this->exp->chains_per_thread, this->exp->iterations, root, + this->exp->bytes_per_line, this->exp->bytes_per_chain, + this->exp->stride, this->exp->busy_cycles, this->exp->prefetch); - // barrier - this->bp->barrier(); + // barrier + this->bp->barrier(); - // stop timer - double stop = 0; - if (this->thread_id() == 0) stop = Timer::seconds(); - this->bp->barrier(); + // stop timer + double stop = 0; + if (this->thread_id() == 0) + stop = Timer::seconds(); + this->bp->barrier(); - if (0 <= e) { - if (this->thread_id() == 0) { - double delta = stop - start; - if (0 < delta) { - Run::_seconds = min( Run::_seconds, delta ); + if (0 <= e) { + if (this->thread_id() == 0) { + double delta = stop - start; + if (0 < delta) { + Run::_seconds = min(Run::_seconds, delta); + } + } } - } } - } - this->bp->barrier(); + this->bp->barrier(); - for (int i=0; i < this->exp->chains_per_thread; i++) { - if (chain_memory[i] != NULL) delete [] chain_memory[i]; - } - if (chain_memory != NULL) delete [] chain_memory; + for (int i = 0; i < this->exp->chains_per_thread; i++) { + if (chain_memory[i] != NULL + ) delete[] chain_memory[i]; + } + if (chain_memory != NULL + ) delete[] chain_memory; - return 0; + return 0; } int dummy = 0; -void -Run::mem_check( Chain *m ) -{ - if (m == NULL) dummy += 1; +void Run::mem_check(Chain *m) { + if (m == NULL + ) dummy += 1; } -static double -max( double v1, double v2 ) -{ - if (v1 < v2) return v2; - return v1; +static double max(double v1, double v2) { + if (v1 < v2) + return v2; + return v1; } -static double -min( double v1, double v2 ) -{ - if (v2 < v1) return v2; - return v1; +static double min(double v1, double v2) { + if (v2 < v1) + return v2; + return v1; } - // exclude 2 and mersienne primes, i.e., - // primes of the form 2**n - 1, e.g., - // 3, 7, 31, 127 -static const int prime_table[] = { 5, 11, 13, 17, 19, 23, 37, 41, 43, 47, - 53, 61, 71, 73, 79, 83, 89, 97, 101, 103, 109, 113, 131, 137, 139, 149, - 151, 157, 163, }; +// exclude 2 and mersienne primes, i.e., +// primes of the form 2**n - 1, e.g., +// 3, 7, 31, 127 +static const int prime_table[] = { 5, 11, 13, 17, 19, 23, 37, 41, 43, 47, 53, + 61, 71, 73, 79, 83, 89, 97, 101, 103, 109, 113, 131, 137, 139, 149, 151, + 157, 163, }; static const int prime_table_size = sizeof prime_table / sizeof prime_table[0]; Chain* -Run::random_mem_init( Chain *mem ) -{ - // initialize pointers -- - // choose a page at random, then use - // one pointer from each cache line - // within the page. all pages and - // cache lines are chosen at random. - Chain* root = END_OF_CHAIN; - Chain* prev = END_OF_CHAIN; - int link_within_line = 0; - int64 local_ops_per_chain = 0; +Run::random_mem_init(Chain *mem) { + // initialize pointers -- + // choose a page at random, then use + // one pointer from each cache line + // within the page. all pages and + // cache lines are chosen at random. + Chain* root = END_OF_CHAIN; + Chain* prev = END_OF_CHAIN; + int link_within_line = 0; + int64 local_ops_per_chain = 0; - // we must set a lock because random() - // is not thread safe - Run::global_mutex.lock(); - setstate(this->exp->random_state[this->thread_id()]); - int page_factor = prime_table[ random() % prime_table_size ]; - int page_offset = random() % this->exp->pages_per_chain; - Run::global_mutex.unlock(); - - // loop through the pages - for (int i=0; i < this->exp->pages_per_chain; i++) { - int page = (page_factor * i + page_offset) % this->exp->pages_per_chain; + // we must set a lock because random() + // is not thread safe Run::global_mutex.lock(); setstate(this->exp->random_state[this->thread_id()]); - int line_factor = prime_table[ random() % prime_table_size ]; - int line_offset = random() % this->exp->lines_per_page; + int page_factor = prime_table[random() % prime_table_size]; + int page_offset = random() % this->exp->pages_per_chain; Run::global_mutex.unlock(); - // loop through the lines within a page - for (int j=0; j < this->exp->lines_per_page; j++) { - int line_within_page = (line_factor * j + line_offset) % this->exp->lines_per_page; - int link = page * this->exp->links_per_page + line_within_page * this->exp->links_per_line + link_within_line; + // loop through the pages + for (int i = 0; i < this->exp->pages_per_chain; i++) { + int page = (page_factor * i + page_offset) % this->exp->pages_per_chain; + Run::global_mutex.lock(); + setstate(this->exp->random_state[this->thread_id()]); + int line_factor = prime_table[random() % prime_table_size]; + int line_offset = random() % this->exp->lines_per_page; + Run::global_mutex.unlock(); + + // loop through the lines within a page + for (int j = 0; j < this->exp->lines_per_page; j++) { + int line_within_page = (line_factor * j + line_offset) + % this->exp->lines_per_page; + int link = page * this->exp->links_per_page + + line_within_page * this->exp->links_per_line + + link_within_line; - if (root == END_OF_CHAIN) { + if (root == END_OF_CHAIN) { // printf("root = %d(%d)[0x%x].\n", page, line_within_page, mem+link); - prev = root = mem + link; - local_ops_per_chain += 1; - } else { + prev = root = mem + link; + local_ops_per_chain += 1; + } else { // printf("0x%x = %d(%d)[0x%x].\n", prev, page, line_within_page, mem+link); - prev->next = mem + link; - prev = prev->next; - local_ops_per_chain += 1; - } + prev->next = mem + link; + prev = prev->next; + local_ops_per_chain += 1; + } + } } - } - Run::global_mutex.lock(); - Run::_ops_per_chain = local_ops_per_chain; - Run::global_mutex.unlock(); + Run::global_mutex.lock(); + Run::_ops_per_chain = local_ops_per_chain; + Run::global_mutex.unlock(); - return root; + return root; } Chain* -Run::forward_mem_init( Chain *mem ) -{ - Chain* root = END_OF_CHAIN; - Chain* prev = END_OF_CHAIN; - int link_within_line = 0; - int64 local_ops_per_chain = 0; +Run::forward_mem_init(Chain *mem) { + Chain* root = END_OF_CHAIN; + Chain* prev = END_OF_CHAIN; + int link_within_line = 0; + int64 local_ops_per_chain = 0; - for (int i=0; i < this->exp->lines_per_chain; i += this->exp->stride) { - int link = i * this->exp->links_per_line + link_within_line; - if (root == NULL) { + for (int i = 0; i < this->exp->lines_per_chain; i += this->exp->stride) { + int link = i * this->exp->links_per_line + link_within_line; + if (root == NULL) { // printf("root = %d(%d)[0x%x].\n", page, line_within_page, mem+link); - prev = root = mem + link; - local_ops_per_chain += 1; - } else { + prev = root = mem + link; + local_ops_per_chain += 1; + } else { // printf("0x%x = %d(%d)[0x%x].\n", prev, page, line_within_page, mem+link); - prev->next = mem + link; - prev = prev->next; - local_ops_per_chain += 1; + prev->next = mem + link; + prev = prev->next; + local_ops_per_chain += 1; + } } - } - Run::global_mutex.lock(); - Run::_ops_per_chain = local_ops_per_chain; - Run::global_mutex.unlock(); + Run::global_mutex.lock(); + Run::_ops_per_chain = local_ops_per_chain; + Run::global_mutex.unlock(); - return root; + return root; } Chain* -Run::reverse_mem_init( Chain *mem ) -{ - Chain* root = END_OF_CHAIN; - Chain* prev = END_OF_CHAIN; - int link_within_line = 0; - int64 local_ops_per_chain = 0; +Run::reverse_mem_init(Chain *mem) { + Chain* root = END_OF_CHAIN; + Chain* prev = END_OF_CHAIN; + int link_within_line = 0; + int64 local_ops_per_chain = 0; - int stride = -this->exp->stride; - int last; - for (int i=0; i < this->exp->lines_per_chain; i += stride) { - last = i; - } + int stride = -this->exp->stride; + int last; + for (int i = 0; i < this->exp->lines_per_chain; i += stride) { + last = i; + } - for (int i=last; 0 <= i; i -= stride) { - int link = i * this->exp->links_per_line + link_within_line; - if (root == END_OF_CHAIN) { + for (int i = last; 0 <= i; i -= stride) { + int link = i * this->exp->links_per_line + link_within_line; + if (root == END_OF_CHAIN) { // printf("root = %d(%d)[0x%x].\n", page, line_within_page, mem+link); - prev = root = mem + link; - local_ops_per_chain += 1; - } else { + prev = root = mem + link; + local_ops_per_chain += 1; + } else { // printf("0x%x = %d(%d)[0x%x].\n", prev, page, line_within_page, mem+link); - prev->next = mem + link; - prev = prev->next; - local_ops_per_chain += 1; + prev->next = mem + link; + prev = prev->next; + local_ops_per_chain += 1; + } } - } - Run::global_mutex.lock(); - Run::_ops_per_chain = local_ops_per_chain; - Run::global_mutex.unlock(); + Run::global_mutex.lock(); + Run::_ops_per_chain = local_ops_per_chain; + Run::global_mutex.unlock(); - return root; + return root; } static int64 dumb_ck = 0; -void -mem_chk( Chain *m ) -{ - if (m == END_OF_CHAIN) dumb_ck += 1; +void mem_chk(Chain *m) { + if (m == END_OF_CHAIN) + dumb_ck += 1; } -static void -chase_pointers( - int64 chains_per_thread, // memory loading per thread - int64 iterations, // number of iterations per experiment - Chain** root, // root(s) of the chain(s) to follow - int64 bytes_per_line, // ignored - int64 bytes_per_chain, // ignored - int64 stride, // ignored - int64 busy_cycles, // processing cycles - bool prefetch // prefetch? -) -{ - // chase pointers - switch (chains_per_thread) { - default: - case 1: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - while (a != END_OF_CHAIN) { - a = a->next; - if (prefetch) - prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - } - break; - case 2: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - if (prefetch) +static void chase_pointers(int64 chains_per_thread, // memory loading per thread + int64 iterations, // number of iterations per experiment + Chain** root, // root(s) of the chain(s) to follow + int64 bytes_per_line, // ignored + int64 bytes_per_chain, // ignored + int64 stride, // ignored + int64 busy_cycles, // processing cycles + bool prefetch // prefetch? + ) { + // chase pointers + switch (chains_per_thread) { + default: + case 1: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + while (a != END_OF_CHAIN) { + a = a->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - } - break; - case 3: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + } + break; + case 2: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - } - break; - case 4: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + } + break; + case 3: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - } - break; - case 5: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + } + break; + case 4: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - } - break; - case 6: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + } + break; + case 5: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - } - break; - case 7: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + } + break; + case 6: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - } - break; - case 8: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - Chain* h = root[7]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - h = h->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + } + break; + case 7: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - mem_chk( h ); - } - break; - case 9: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - Chain* h = root[7]; - Chain* j = root[8]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - h = h->next; - j = j->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + } + break; + case 8: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + Chain* h = root[7]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + h = h->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - mem_chk( h ); - mem_chk( j ); - } - break; - case 10: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - Chain* h = root[7]; - Chain* j = root[8]; - Chain* k = root[9]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - h = h->next; - j = j->next; - k = k->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + mem_chk(h); + } + break; + case 9: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + Chain* h = root[7]; + Chain* j = root[8]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + h = h->next; + j = j->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - mem_chk( h ); - mem_chk( j ); - mem_chk( k ); - } - break; - case 11: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - Chain* h = root[7]; - Chain* j = root[8]; - Chain* k = root[9]; - Chain* l = root[10]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - h = h->next; - j = j->next; - k = k->next; - l = l->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + mem_chk(h); + mem_chk(j); + } + break; + case 10: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + Chain* h = root[7]; + Chain* j = root[8]; + Chain* k = root[9]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + h = h->next; + j = j->next; + k = k->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - mem_chk( h ); - mem_chk( j ); - mem_chk( k ); - mem_chk( l ); - } - break; - case 12: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - Chain* h = root[7]; - Chain* j = root[8]; - Chain* k = root[9]; - Chain* l = root[10]; - Chain* m = root[11]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - h = h->next; - j = j->next; - k = k->next; - l = l->next; - m = m->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + mem_chk(h); + mem_chk(j); + mem_chk(k); + } + break; + case 11: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + Chain* h = root[7]; + Chain* j = root[8]; + Chain* k = root[9]; + Chain* l = root[10]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + h = h->next; + j = j->next; + k = k->next; + l = l->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - mem_chk( h ); - mem_chk( j ); - mem_chk( k ); - mem_chk( l ); - mem_chk( m ); - } - break; - case 13: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - Chain* h = root[7]; - Chain* j = root[8]; - Chain* k = root[9]; - Chain* l = root[10]; - Chain* m = root[11]; - Chain* n = root[12]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - h = h->next; - j = j->next; - k = k->next; - l = l->next; - m = m->next; - n = n->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + mem_chk(h); + mem_chk(j); + mem_chk(k); + mem_chk(l); + } + break; + case 12: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + Chain* h = root[7]; + Chain* j = root[8]; + Chain* k = root[9]; + Chain* l = root[10]; + Chain* m = root[11]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + h = h->next; + j = j->next; + k = k->next; + l = l->next; + m = m->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - mem_chk( h ); - mem_chk( j ); - mem_chk( k ); - mem_chk( l ); - mem_chk( m ); - mem_chk( n ); - } - break; - case 14: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - Chain* h = root[7]; - Chain* j = root[8]; - Chain* k = root[9]; - Chain* l = root[10]; - Chain* m = root[11]; - Chain* n = root[12]; - Chain* o = root[13]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - h = h->next; - j = j->next; - k = k->next; - l = l->next; - m = m->next; - n = n->next; - o = o->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + mem_chk(h); + mem_chk(j); + mem_chk(k); + mem_chk(l); + mem_chk(m); + } + break; + case 13: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + Chain* h = root[7]; + Chain* j = root[8]; + Chain* k = root[9]; + Chain* l = root[10]; + Chain* m = root[11]; + Chain* n = root[12]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + h = h->next; + j = j->next; + k = k->next; + l = l->next; + m = m->next; + n = n->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - mem_chk( h ); - mem_chk( j ); - mem_chk( k ); - mem_chk( l ); - mem_chk( m ); - mem_chk( n ); - mem_chk( o ); - } - break; - case 15: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - Chain* h = root[7]; - Chain* j = root[8]; - Chain* k = root[9]; - Chain* l = root[10]; - Chain* m = root[11]; - Chain* n = root[12]; - Chain* o = root[13]; - Chain* p = root[14]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - h = h->next; - j = j->next; - k = k->next; - l = l->next; - m = m->next; - n = n->next; - o = o->next; - p = p->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + mem_chk(h); + mem_chk(j); + mem_chk(k); + mem_chk(l); + mem_chk(m); + mem_chk(n); + } + break; + case 14: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + Chain* h = root[7]; + Chain* j = root[8]; + Chain* k = root[9]; + Chain* l = root[10]; + Chain* m = root[11]; + Chain* n = root[12]; + Chain* o = root[13]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + h = h->next; + j = j->next; + k = k->next; + l = l->next; + m = m->next; + n = n->next; + o = o->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - mem_chk( h ); - mem_chk( j ); - mem_chk( k ); - mem_chk( l ); - mem_chk( m ); - mem_chk( n ); - mem_chk( o ); - mem_chk( p ); - } - break; - case 16: - for (int64 i=0; i < iterations; i++) { - Chain* a = root[0]; - Chain* b = root[1]; - Chain* c = root[2]; - Chain* d = root[3]; - Chain* e = root[4]; - Chain* f = root[5]; - Chain* g = root[6]; - Chain* h = root[7]; - Chain* j = root[8]; - Chain* k = root[9]; - Chain* l = root[10]; - Chain* m = root[11]; - Chain* n = root[12]; - Chain* o = root[13]; - Chain* p = root[14]; - Chain* q = root[15]; - while (a != END_OF_CHAIN) { - a = a->next; - b = b->next; - c = c->next; - d = d->next; - e = e->next; - f = f->next; - g = g->next; - h = h->next; - j = j->next; - k = k->next; - l = l->next; - m = m->next; - n = n->next; - o = o->next; - p = p->next; - q = q->next; - if (prefetch) + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + mem_chk(h); + mem_chk(j); + mem_chk(k); + mem_chk(l); + mem_chk(m); + mem_chk(n); + mem_chk(o); + } + break; + case 15: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + Chain* h = root[7]; + Chain* j = root[8]; + Chain* k = root[9]; + Chain* l = root[10]; + Chain* m = root[11]; + Chain* n = root[12]; + Chain* o = root[13]; + Chain* p = root[14]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + h = h->next; + j = j->next; + k = k->next; + l = l->next; + m = m->next; + n = n->next; + o = o->next; + p = p->next; + if (prefetch) + prefetch(a->next); + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + mem_chk(h); + mem_chk(j); + mem_chk(k); + mem_chk(l); + mem_chk(m); + mem_chk(n); + mem_chk(o); + mem_chk(p); + } + break; + case 16: + for (int64 i = 0; i < iterations; i++) { + Chain* a = root[0]; + Chain* b = root[1]; + Chain* c = root[2]; + Chain* d = root[3]; + Chain* e = root[4]; + Chain* f = root[5]; + Chain* g = root[6]; + Chain* h = root[7]; + Chain* j = root[8]; + Chain* k = root[9]; + Chain* l = root[10]; + Chain* m = root[11]; + Chain* n = root[12]; + Chain* o = root[13]; + Chain* p = root[14]; + Chain* q = root[15]; + while (a != END_OF_CHAIN) { + a = a->next; + b = b->next; + c = c->next; + d = d->next; + e = e->next; + f = f->next; + g = g->next; + h = h->next; + j = j->next; + k = k->next; + l = l->next; + m = m->next; + n = n->next; + o = o->next; + p = p->next; + q = q->next; + if (prefetch) prefetch(a->next); - for (int64 j=0; j < busy_cycles; j++) - asm("nop"); - } - mem_chk( a ); - mem_chk( b ); - mem_chk( c ); - mem_chk( d ); - mem_chk( e ); - mem_chk( f ); - mem_chk( g ); - mem_chk( h ); - mem_chk( j ); - mem_chk( k ); - mem_chk( l ); - mem_chk( m ); - mem_chk( n ); - mem_chk( o ); - mem_chk( p ); - mem_chk( q ); + for (int64 j = 0; j < busy_cycles; j++) + asm("nop"); + } + mem_chk(a); + mem_chk(b); + mem_chk(c); + mem_chk(d); + mem_chk(e); + mem_chk(f); + mem_chk(g); + mem_chk(h); + mem_chk(j); + mem_chk(k); + mem_chk(l); + mem_chk(m); + mem_chk(n); + mem_chk(o); + mem_chk(p); + mem_chk(q); + } } - } } - // NOT WRITTEN YET -- DMP - // JUST A PLACE HOLDER! +// NOT WRITTEN YET -- DMP +// JUST A PLACE HOLDER! Chain* -Run::stream_mem_init( Chain *mem ) -{ +Run::stream_mem_init(Chain *mem) { // fprintf(stderr, "made it into stream_mem_init.\n"); // fprintf(stderr, "chains_per_thread = %ld\n", this->exp->chains_per_thread); // fprintf(stderr, "iterations = %ld\n", this->exp->iterations); // fprintf(stderr, "bytes_per_chain = %ld\n", this->exp->bytes_per_chain); // fprintf(stderr, "stride = %ld\n", this->exp->stride); - int64 local_ops_per_chain = 0; - double* tmp = (double *) mem; - int64 refs_per_line = this->exp->bytes_per_line / sizeof(double); - int64 refs_per_chain = this->exp->bytes_per_chain / sizeof(double); + int64 local_ops_per_chain = 0; + double* tmp = (double *) mem; + int64 refs_per_line = this->exp->bytes_per_line / sizeof(double); + int64 refs_per_chain = this->exp->bytes_per_chain / sizeof(double); // fprintf(stderr, "refs_per_chain = %ld\n", refs_per_chain); - for (int64 i=0; i < refs_per_chain; i += this->exp->stride*refs_per_line) { - tmp[i] = 0; - local_ops_per_chain += 1; - } + for (int64 i = 0; i < refs_per_chain; + i += this->exp->stride * refs_per_line) { + tmp[i] = 0; + local_ops_per_chain += 1; + } - Run::global_mutex.lock(); - Run::_ops_per_chain = local_ops_per_chain; - Run::global_mutex.unlock(); + Run::global_mutex.lock(); + Run::_ops_per_chain = local_ops_per_chain; + Run::global_mutex.unlock(); // fprintf(stderr, "made it out of stream_mem_init.\n"); - return mem; + return mem; } static int64 summ_ck = 0; -void -sum_chk( double t ) -{ - if (t != 0) summ_ck += 1; +void sum_chk(double t) { + if (t != 0) + summ_ck += 1; } - // NOT WRITTEN YET -- DMP - // JUST A PLACE HOLDER! -static void -follow_streams( - int64 chains_per_thread, // memory loading per thread - int64 iterations, // number of iterations per experiment - Chain** root, // root(s) of the chain(s) to follow - int64 bytes_per_line, // ignored - int64 bytes_per_chain, // ignored - int64 stride, // ignored - int64 busy_cycles, // ignored - bool prefetch // ignored -) -{ - int64 refs_per_line = bytes_per_line / sizeof(double); - int64 refs_per_chain = bytes_per_chain / sizeof(double); +// NOT WRITTEN YET -- DMP +// JUST A PLACE HOLDER! +static void follow_streams(int64 chains_per_thread, // memory loading per thread + int64 iterations, // number of iterations per experiment + Chain** root, // root(s) of the chain(s) to follow + int64 bytes_per_line, // ignored + int64 bytes_per_chain, // ignored + int64 stride, // ignored + int64 busy_cycles, // ignored + bool prefetch // ignored + ) { + int64 refs_per_line = bytes_per_line / sizeof(double); + int64 refs_per_chain = bytes_per_chain / sizeof(double); - // chase pointers - switch (chains_per_thread) { - default: - case 1: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j]; - } - sum_chk( t ); - } - break; - case 2: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - double* a1 = (double *) root[1]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j]; - } - sum_chk( t ); - } - break; - case 3: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - double* a1 = (double *) root[1]; - double* a2 = (double *) root[2]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2[j]; - } - sum_chk( t ); - } - break; - case 4: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - double* a1 = (double *) root[1]; - double* a2 = (double *) root[2]; - double* a3 = (double *) root[3]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2[j] + a3[j]; - } - sum_chk( t ); - } - break; - case 5: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - double* a1 = (double *) root[1]; - double* a2 = (double *) root[2]; - double* a3 = (double *) root[3]; - double* a4 = (double *) root[4]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j]; - } - sum_chk( t ); - } - break; - case 6: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - double* a1 = (double *) root[1]; - double* a2 = (double *) root[2]; - double* a3 = (double *) root[3]; - double* a4 = (double *) root[4]; - double* a5 = (double *) root[5]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j]; - } - sum_chk( t ); - } - break; - case 7: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - double* a1 = (double *) root[1]; - double* a2 = (double *) root[2]; - double* a3 = (double *) root[3]; - double* a4 = (double *) root[4]; - double* a5 = (double *) root[5]; - double* a6 = (double *) root[6]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]; - } - sum_chk( t ); - } - break; - case 8: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - double* a1 = (double *) root[1]; - double* a2 = (double *) root[2]; - double* a3 = (double *) root[3]; - double* a4 = (double *) root[4]; - double* a5 = (double *) root[5]; - double* a6 = (double *) root[6]; - double* a7 = (double *) root[7]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + a7[j]; - } - sum_chk( t ); - } - break; - case 9: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - double* a1 = (double *) root[1]; - double* a2 = (double *) root[2]; - double* a3 = (double *) root[3]; - double* a4 = (double *) root[4]; - double* a5 = (double *) root[5]; - double* a6 = (double *) root[6]; - double* a7 = (double *) root[7]; - double* a8 = (double *) root[8]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + a7[j] + - a8[j]; - } - sum_chk( t ); - } - break; - case 10: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[0]; - double* a1 = (double *) root[1]; - double* a2 = (double *) root[2]; - double* a3 = (double *) root[3]; - double* a4 = (double *) root[4]; - double* a5 = (double *) root[5]; - double* a6 = (double *) root[6]; - double* a7 = (double *) root[7]; - double* a8 = (double *) root[8]; - double* a9 = (double *) root[9]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + a7[j] + - a8[j] + a9[j]; - } - sum_chk( t ); - } - break; - case 11: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[ 0]; - double* a1 = (double *) root[ 1]; - double* a2 = (double *) root[ 2]; - double* a3 = (double *) root[ 3]; - double* a4 = (double *) root[ 4]; - double* a5 = (double *) root[ 5]; - double* a6 = (double *) root[ 6]; - double* a7 = (double *) root[ 7]; - double* a8 = (double *) root[ 8]; - double* a9 = (double *) root[ 9]; - double* a10 = (double *) root[10]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2 [j] + a3[j] + a4[j] + a5[j] + a6[j] + a7[j] + - a8[j] + a9[j] + a10[j]; - } - sum_chk( t ); - } - break; - case 12: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[ 0]; - double* a1 = (double *) root[ 1]; - double* a2 = (double *) root[ 2]; - double* a3 = (double *) root[ 3]; - double* a4 = (double *) root[ 4]; - double* a5 = (double *) root[ 5]; - double* a6 = (double *) root[ 6]; - double* a7 = (double *) root[ 7]; - double* a8 = (double *) root[ 8]; - double* a9 = (double *) root[ 9]; - double* a10 = (double *) root[10]; - double* a11 = (double *) root[11]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2 [j] + a3 [j] + a4[j] + a5[j] + a6[j] + a7[j] + - a8[j] + a9[j] + a10[j] + a11[j]; - } - sum_chk( t ); - } - break; - case 13: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[ 0]; - double* a1 = (double *) root[ 1]; - double* a2 = (double *) root[ 2]; - double* a3 = (double *) root[ 3]; - double* a4 = (double *) root[ 4]; - double* a5 = (double *) root[ 5]; - double* a6 = (double *) root[ 6]; - double* a7 = (double *) root[ 7]; - double* a8 = (double *) root[ 8]; - double* a9 = (double *) root[ 9]; - double* a10 = (double *) root[10]; - double* a11 = (double *) root[11]; - double* a12 = (double *) root[12]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2 [j] + a3 [j] + a4 [j] + a5[j] + a6[j] + a7[j] + - a8[j] + a9[j] + a10[j] + a11[j] + a12[j]; - } - sum_chk( t ); - } - break; - case 14: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[ 0]; - double* a1 = (double *) root[ 1]; - double* a2 = (double *) root[ 2]; - double* a3 = (double *) root[ 3]; - double* a4 = (double *) root[ 4]; - double* a5 = (double *) root[ 5]; - double* a6 = (double *) root[ 6]; - double* a7 = (double *) root[ 7]; - double* a8 = (double *) root[ 8]; - double* a9 = (double *) root[ 9]; - double* a10 = (double *) root[10]; - double* a11 = (double *) root[11]; - double* a12 = (double *) root[12]; - double* a13 = (double *) root[13]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2 [j] + a3 [j] + a4 [j] + a5 [j] + a6[j] + a7[j] + - a8[j] + a9[j] + a10[j] + a11[j] + a12[j] + a13[j]; - } - sum_chk( t ); - } - break; - case 15: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[ 0]; - double* a1 = (double *) root[ 1]; - double* a2 = (double *) root[ 2]; - double* a3 = (double *) root[ 3]; - double* a4 = (double *) root[ 4]; - double* a5 = (double *) root[ 5]; - double* a6 = (double *) root[ 6]; - double* a7 = (double *) root[ 7]; - double* a8 = (double *) root[ 8]; - double* a9 = (double *) root[ 9]; - double* a10 = (double *) root[10]; - double* a11 = (double *) root[11]; - double* a12 = (double *) root[12]; - double* a13 = (double *) root[13]; - double* a14 = (double *) root[14]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2 [j] + a3 [j] + a4 [j] + a5 [j] + a6 [j] + a7[j] + - a8[j] + a9[j] + a10[j] + a11[j] + a12[j] + a13[j] + a14[j]; - } - sum_chk( t ); - } - break; - case 16: - for (int64 i=0; i < iterations; i++) { - double t = 0; - double* a0 = (double *) root[ 0]; - double* a1 = (double *) root[ 1]; - double* a2 = (double *) root[ 2]; - double* a3 = (double *) root[ 3]; - double* a4 = (double *) root[ 4]; - double* a5 = (double *) root[ 5]; - double* a6 = (double *) root[ 6]; - double* a7 = (double *) root[ 7]; - double* a8 = (double *) root[ 8]; - double* a9 = (double *) root[ 9]; - double* a10 = (double *) root[10]; - double* a11 = (double *) root[11]; - double* a12 = (double *) root[12]; - double* a13 = (double *) root[13]; - double* a14 = (double *) root[14]; - double* a15 = (double *) root[15]; - for (int64 j=0; j < refs_per_chain; j+=stride*refs_per_line) { - t += a0[j] + a1[j] + a2 [j] + a3 [j] + a4 [j] + a5 [j] + a6 [j] + a7 [j] + - a8[j] + a9[j] + a10[j] + a11[j] + a12[j] + a13[j] + a14[j] + a15[j]; - } - sum_chk( t ); + // chase pointers + switch (chains_per_thread) { + default: + case 1: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j]; + } + sum_chk(t); + } + break; + case 2: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j]; + } + sum_chk(t); + } + break; + case 3: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j]; + } + sum_chk(t); + } + break; + case 4: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j]; + } + sum_chk(t); + } + break; + case 5: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j]; + } + sum_chk(t); + } + break; + case 6: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j]; + } + sum_chk(t); + } + break; + case 7: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j]; + } + sum_chk(t); + } + break; + case 8: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + double* a7 = (double *) root[7]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + + a7[j]; + } + sum_chk(t); + } + break; + case 9: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + double* a7 = (double *) root[7]; + double* a8 = (double *) root[8]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + + a7[j] + a8[j]; + } + sum_chk(t); + } + break; + case 10: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + double* a7 = (double *) root[7]; + double* a8 = (double *) root[8]; + double* a9 = (double *) root[9]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + + a7[j] + a8[j] + a9[j]; + } + sum_chk(t); + } + break; + case 11: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + double* a7 = (double *) root[7]; + double* a8 = (double *) root[8]; + double* a9 = (double *) root[9]; + double* a10 = (double *) root[10]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + + a7[j] + a8[j] + a9[j] + a10[j]; + } + sum_chk(t); + } + break; + case 12: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + double* a7 = (double *) root[7]; + double* a8 = (double *) root[8]; + double* a9 = (double *) root[9]; + double* a10 = (double *) root[10]; + double* a11 = (double *) root[11]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + + a7[j] + a8[j] + a9[j] + a10[j] + a11[j]; + } + sum_chk(t); + } + break; + case 13: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + double* a7 = (double *) root[7]; + double* a8 = (double *) root[8]; + double* a9 = (double *) root[9]; + double* a10 = (double *) root[10]; + double* a11 = (double *) root[11]; + double* a12 = (double *) root[12]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + + a7[j] + a8[j] + a9[j] + a10[j] + a11[j] + a12[j]; + } + sum_chk(t); + } + break; + case 14: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + double* a7 = (double *) root[7]; + double* a8 = (double *) root[8]; + double* a9 = (double *) root[9]; + double* a10 = (double *) root[10]; + double* a11 = (double *) root[11]; + double* a12 = (double *) root[12]; + double* a13 = (double *) root[13]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + + a7[j] + a8[j] + a9[j] + a10[j] + a11[j] + a12[j] + + a13[j]; + } + sum_chk(t); + } + break; + case 15: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + double* a7 = (double *) root[7]; + double* a8 = (double *) root[8]; + double* a9 = (double *) root[9]; + double* a10 = (double *) root[10]; + double* a11 = (double *) root[11]; + double* a12 = (double *) root[12]; + double* a13 = (double *) root[13]; + double* a14 = (double *) root[14]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + + a7[j] + a8[j] + a9[j] + a10[j] + a11[j] + a12[j] + + a13[j] + a14[j]; + } + sum_chk(t); + } + break; + case 16: + for (int64 i = 0; i < iterations; i++) { + double t = 0; + double* a0 = (double *) root[0]; + double* a1 = (double *) root[1]; + double* a2 = (double *) root[2]; + double* a3 = (double *) root[3]; + double* a4 = (double *) root[4]; + double* a5 = (double *) root[5]; + double* a6 = (double *) root[6]; + double* a7 = (double *) root[7]; + double* a8 = (double *) root[8]; + double* a9 = (double *) root[9]; + double* a10 = (double *) root[10]; + double* a11 = (double *) root[11]; + double* a12 = (double *) root[12]; + double* a13 = (double *) root[13]; + double* a14 = (double *) root[14]; + double* a15 = (double *) root[15]; + for (int64 j = 0; j < refs_per_chain; j += stride * refs_per_line) { + t += a0[j] + a1[j] + a2[j] + a3[j] + a4[j] + a5[j] + a6[j] + + a7[j] + a8[j] + a9[j] + a10[j] + a11[j] + a12[j] + + a13[j] + a14[j] + a15[j]; + } + sum_chk(t); + } + break; } - break; - } } @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #if !defined(Run_h) #define Run_h @@ -23,28 +22,31 @@ class Run: public Thread { public: - Run(); - ~Run(); - int run(); - void set( Experiment &e, SpinBarrier* sbp ); - - static int64 ops_per_chain() { return _ops_per_chain; } - static double seconds() { return _seconds; } + Run(); + ~Run(); + int run(); + void set(Experiment &e, SpinBarrier* sbp); + + static int64 ops_per_chain() { + return _ops_per_chain; + } + static double seconds() { + return _seconds; + } private: - Experiment* exp; // experiment data - SpinBarrier* bp; // spin barrier used by all threads - - void mem_check( Chain *m ); - Chain* random_mem_init( Chain *m ); - Chain* forward_mem_init( Chain *m ); - Chain* reverse_mem_init( Chain *m ); - Chain* stream_mem_init( Chain *m ); - - static Lock global_mutex; // global lock - static int64 _ops_per_chain; // total number of operations per chain - static double _seconds; // total number of seconds + Experiment* exp; // experiment data + SpinBarrier* bp; // spin barrier used by all threads + + void mem_check(Chain *m); + Chain* random_mem_init(Chain *m); + Chain* forward_mem_init(Chain *m); + Chain* reverse_mem_init(Chain *m); + Chain* stream_mem_init(Chain *m); + + static Lock global_mutex; // global lock + static int64 _ops_per_chain; // total number of operations per chain + static double _seconds; // total number of seconds }; - #endif diff --git a/src/SpinBarrier.cpp b/src/SpinBarrier.cpp index d3d2d7b..d89e7c3 100644 --- a/src/SpinBarrier.cpp +++ b/src/SpinBarrier.cpp @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation *
*******************************************************************************/
-
/******************************************************************************
* *
* SpinBarrier *
@@ -27,22 +26,18 @@ #include "SpinBarrier.h"
- // create a new barrier
-SpinBarrier::SpinBarrier(int participants)
-: limit( participants )
-{
- pthread_barrier_init( &barrier_obj, NULL, this->limit );
+// create a new barrier
+SpinBarrier::SpinBarrier(int participants) :
+ limit(participants) {
+ pthread_barrier_init(&barrier_obj, NULL, this->limit);
}
- // destroy an old barrier
-SpinBarrier::~SpinBarrier()
-{
+// destroy an old barrier
+SpinBarrier::~SpinBarrier() {
}
- // enter the barrier and wait. everyone leaves
- // when the last participant enters the barrier.
-void
-SpinBarrier::barrier()
-{
- pthread_barrier_wait( &this->barrier_obj );
+// enter the barrier and wait. everyone leaves
+// when the last participant enters the barrier.
+void SpinBarrier::barrier() {
+ pthread_barrier_wait(&this->barrier_obj);
}
diff --git a/src/SpinBarrier.h b/src/SpinBarrier.h index f0b76d3..4ab3242 100644 --- a/src/SpinBarrier.h +++ b/src/SpinBarrier.h @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation *
*******************************************************************************/
-
/******************************************************************************
* *
* SpinBarrier *
@@ -31,14 +30,14 @@ class SpinBarrier {
public:
- SpinBarrier(int participants);
- ~SpinBarrier();
+ SpinBarrier(int participants);
+ ~SpinBarrier();
- void barrier();
+ void barrier();
private:
- int limit; // number of barrier participants
- pthread_barrier_t barrier_obj;
+ int limit; // number of barrier participants
+ pthread_barrier_t barrier_obj;
};
#endif
diff --git a/src/Thread.cpp b/src/Thread.cpp index 8908cfe..0dfb91c 100644 --- a/src/Thread.cpp +++ b/src/Thread.cpp @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #include <stdio.h> #include <pthread.h> #include <unistd.h> @@ -19,68 +18,51 @@ #include "Lock.h" Lock Thread::_global_lock; -int Thread::count = 0; +int Thread::count = 0; -Thread::Thread() -{ - Thread::global_lock(); +Thread::Thread() { + Thread::global_lock(); this->id = Thread::count; Thread::count += 1; - Thread::global_unlock(); + Thread::global_unlock(); } -Thread::~Thread() -{ +Thread::~Thread() { } -int -Thread::start() -{ - return pthread_create(&this->thread, NULL, Thread::start_routine, this); +int Thread::start() { + return pthread_create(&this->thread, NULL, Thread::start_routine, this); } void* -Thread::start_routine(void* p) -{ - ((Thread*)p)->run(); +Thread::start_routine(void* p) { + ((Thread*) p)->run(); - return NULL; + return NULL; } -void -Thread::exit() -{ - pthread_exit(NULL); +void Thread::exit() { + pthread_exit(NULL); } -int -Thread::wait() -{ - pthread_join(this->thread, NULL); +int Thread::wait() { + pthread_join(this->thread, NULL); - return 0; + return 0; } -void -Thread::lock() -{ - this->object_lock.lock(); +void Thread::lock() { + this->object_lock.lock(); } -void -Thread::unlock() -{ - this->object_lock.unlock(); +void Thread::unlock() { + this->object_lock.unlock(); } -void -Thread::global_lock() -{ - Thread::_global_lock.lock(); +void Thread::global_lock() { + Thread::_global_lock.lock(); } -void -Thread::global_unlock() -{ - Thread::_global_lock.unlock(); +void Thread::global_unlock() { + Thread::_global_lock.unlock(); } diff --git a/src/Thread.h b/src/Thread.h index 3948f56..55ebf1c 100644 --- a/src/Thread.h +++ b/src/Thread.h @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #if !defined(Thread_h) #define Thread_h @@ -19,35 +18,39 @@ class Thread { public: - Thread(); - ~Thread(); + Thread(); + ~Thread(); - virtual int run() = 0; + virtual int run() = 0; - int start(); - int wait(); - int thread_count() { return Thread::count; } - int thread_id() { return id; } + int start(); + int wait(); + int thread_count() { + return Thread::count; + } + int thread_id() { + return id; + } - static void exit(); + static void exit(); protected: - void lock(); - void unlock(); - static void global_lock(); - static void global_unlock(); + void lock(); + void unlock(); + static void global_lock(); + static void global_unlock(); private: - static void* start_routine(void *); - static Lock _global_lock; + static void* start_routine(void *); + static Lock _global_lock; - Lock object_lock; + Lock object_lock; - pthread_t thread; + pthread_t thread; - static int count; - int id; - int lock_obj; + static int count; + int id; + int lock_obj; }; #endif diff --git a/src/Timer.cpp b/src/Timer.cpp index b326048..8331b9a 100644 --- a/src/Timer.cpp +++ b/src/Timer.cpp @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #include <stdio.h> #include <sys/time.h> @@ -17,15 +16,15 @@ #include "Types.h" -static int64 read_rtc(); -static void calibrate_rtc(int n); +static int64 read_rtc(); +static void calibrate_rtc(int n); static double wall_seconds(); -static int wall_ticks = -1; -static int rtc_ticks = -1; +static int wall_ticks = -1; +static int rtc_ticks = -1; static double wall_elapsed = -1; -static int64 rtc_elapsed = -1; -static double time_factor = -1; +static int64 rtc_elapsed = -1; +static double time_factor = -1; #if !defined(RTC) && !defined(GTOD) #define RTC @@ -33,109 +32,97 @@ static double time_factor = -1; #if defined(RTC) -double -Timer::seconds() -{ - return (double) read_rtc() * time_factor; +double Timer::seconds() { + return (double) read_rtc() * time_factor; } -int64 -Timer::ticks() -{ - // See pg. 406 of the AMD x86-64 Architecture - // Programmer's Manual, Volume 2, System Programming - unsigned int eax=0, edx=0; - - __asm__ __volatile__( - "rdtsc ;" - "movl %%eax,%0;" - "movl %%edx,%1;" - "" - : "=r"(eax), "=r"(edx) - : - : "%eax", "%edx" - ); - - return ((int64) edx << 32) | (int64) eax; +int64 Timer::ticks() { + // See pg. 406 of the AMD x86-64 Architecture + // Programmer's Manual, Volume 2, System Programming + unsigned int eax = 0, edx = 0; + + __asm__ __volatile__( + "rdtsc ;" + "movl %%eax,%0;" + "movl %%edx,%1;" + "" + : "=r"(eax), "=r"(edx) + : + : "%eax", "%edx" + ); + + return ((int64) edx << 32) | (int64) eax; } -static int64 -read_rtc() -{ - // See pg. 406 of the AMD x86-64 Architecture - // Programmer's Manual, Volume 2, System Programming - unsigned int eax=0, edx=0; - - __asm__ __volatile__( - "rdtsc ;" - "movl %%eax,%0;" - "movl %%edx,%1;" - "" - : "=r"(eax), "=r"(edx) - : - : "%eax", "%edx" - ); - - return ((int64) edx << 32) | (int64) eax; +static int64 read_rtc() { + // See pg. 406 of the AMD x86-64 Architecture + // Programmer's Manual, Volume 2, System Programming + unsigned int eax = 0, edx = 0; + + __asm__ __volatile__( + "rdtsc ;" + "movl %%eax,%0;" + "movl %%edx,%1;" + "" + : "=r"(eax), "=r"(edx) + : + : "%eax", "%edx" + ); + + return ((int64) edx << 32) | (int64) eax; } -void -Timer::calibrate() -{ - Timer::calibrate(1000); +void Timer::calibrate() { + Timer::calibrate(1000); } -void -Timer::calibrate(int n) -{ - wall_ticks = n; - - double wall_start,wall_finish,t; - t = wall_seconds(); - while (t == (wall_start=wall_seconds())) { - ; - } - int64 rtc_start = read_rtc(); - for (int i=0; i < wall_ticks; i++) { +void Timer::calibrate(int n) { + wall_ticks = n; + + double wall_start, wall_finish, t; t = wall_seconds(); - while (t == (wall_finish=wall_seconds())) { - ; + while (t == (wall_start = wall_seconds())) { + ; } - } - int64 rtc_finish = read_rtc(); + int64 rtc_start = read_rtc(); + for (int i = 0; i < wall_ticks; i++) { + t = wall_seconds(); + while (t == (wall_finish = wall_seconds())) { + ; + } + } + int64 rtc_finish = read_rtc(); - wall_elapsed = wall_finish - wall_start; - rtc_elapsed = rtc_finish - rtc_start; - time_factor = wall_elapsed / (double) rtc_elapsed; + wall_elapsed = wall_finish - wall_start; + rtc_elapsed = rtc_finish - rtc_start; + time_factor = wall_elapsed / (double) rtc_elapsed; } -static double -wall_seconds() -{ - struct timeval t; - gettimeofday(&t, NULL); +static double wall_seconds() { + struct timeval t; + gettimeofday(&t, NULL); - return (double) t.tv_sec + (double) t.tv_usec * 1E-6; + return (double) t.tv_sec + (double) t.tv_usec * 1E-6; } #else double Timer::seconds() -{ - struct timeval t; - gettimeofday(&t, NULL); +{ + struct timeval t; + gettimeofday(&t, NULL); - return (double) t.tv_sec + (double) t.tv_usec * 1E-6; + return (double) t.tv_sec + (double) t.tv_usec * 1E-6; } int64 Timer::ticks() -{ - struct timeval t; - gettimeofday(&t, NULL); +{ + struct timeval t; + gettimeofday(&t, NULL); - return 1000000 * (int64) t.tv_sec + (int64) t.tv_usec; + return 1000000 * (int64) t.tv_sec + (int64) t.tv_usec; } void @@ -150,26 +137,23 @@ Timer::calibrate(int n) #endif -static double -min( double v1, double v2 ) -{ - if (v2 < v1) return v2; - return v1; +static double min(double v1, double v2) { + if (v2 < v1) + return v2; + return v1; } -double -Timer::resolution() -{ - double a,b,c=1E9; - for (int i=0; i < 10; i++) { - a = Timer::seconds(); - while (a == (b=Timer::seconds())) - ; - a = Timer::seconds(); - while (a == (b=Timer::seconds())) - ; - c = min(b - a, c); - } - - return c; +double Timer::resolution() { + double a, b, c = 1E9; + for (int i = 0; i < 10; i++) { + a = Timer::seconds(); + while (a == (b = Timer::seconds())) + ; + a = Timer::seconds(); + while (a == (b = Timer::seconds())) + ; + c = min(b - a, c); + } + + return c; } diff --git a/src/Timer.h b/src/Timer.h index ba2c503..abc52af 100644 --- a/src/Timer.h +++ b/src/Timer.h @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #if !defined(Timer_h) #define Timer_h @@ -17,11 +16,11 @@ class Timer { public: - static double seconds(); - static double resolution(); - static int64 ticks(); - static void calibrate(); - static void calibrate(int n); + static double seconds(); + static double resolution(); + static int64 ticks(); + static void calibrate(); + static void calibrate(int n); private: }; diff --git a/src/Types.cpp b/src/Types.cpp index da5ecd0..409f727 100644 --- a/src/Types.cpp +++ b/src/Types.cpp @@ -9,5 +9,4 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #include "Types.h" diff --git a/src/Types.h b/src/Types.h index 9e2eeb0..9bb6038 100644 --- a/src/Types.h +++ b/src/Types.h @@ -9,7 +9,6 @@ * Douglas M. Pase - initial API and implementation * *******************************************************************************/ - #if !defined(Types_h) #define Types_h @@ -24,6 +23,6 @@ typedef unsigned short uint16; typedef unsigned char uint8; typedef double float64; -typedef float float32; +typedef float float32; #endif |