diff options
author | Tim Besard <tim.besard@gmail.com> | 2011-10-31 20:20:14 +0100 |
---|---|---|
committer | Tim Besard <tim.besard@gmail.com> | 2011-10-31 20:20:14 +0100 |
commit | e91666fd65362039e5cf3d7f3cb400ec3633c448 (patch) | |
tree | c5b1ed966155f8b42cfcd96543d0e08c81cc5cf5 | |
parent | f8ecbea68082f8f5cdfb088a783b2e46b63e9f23 (diff) |
Adding prefetching.
-rw-r--r-- | src/Experiment.cpp | 7 |
-rw-r--r-- | src/Experiment.h | 2 |
-rw-r--r-- | src/Run.cpp | 50 |
3 files changed, 51 insertions, 8 deletions
diff --git a/src/Experiment.cpp b/src/Experiment.cpp index 04fb9ef..e9fbdfa 100644 --- a/src/Experiment.cpp +++ b/src/Experiment.cpp @@ -40,7 +40,8 @@ Experiment::Experiment() : busy_cycles (DEFAULT_BUSY_CYCLES), seconds (DEFAULT_SECONDS), iterations (DEFAULT_ITERATIONS), - experiments (DEFAULT_EXPERIMENTS), + experiments (DEFAULT_EXPERIMENTS), + prefetch (DEFAULT_PREFETCH), output_mode (TABLE), access_pattern (RANDOM), stride (1), @@ -68,6 +69,7 @@ Experiment::~Experiment() // -i or --iters iterations // -e or --experiments experiments // -b or --busy amount of cycles processor should remain busy + // -f or --prefetch prefetch data // -a or --access memory access pattern // random random access pattern // forward <stride> exclusive OR and mask @@ -137,6 +139,8 @@ Experiment::parse_args(int argc, char* argv[]) if (i == argc) { error = 1; break; } this->busy_cycles = Experiment::parse_number(argv[i]); if (this->experiments == 0) { error = 1; break; } + } else if (strcasecmp(argv[i], "-f") == 0 || strcasecmp(argv[i], "--prefetch") == 0) { + this->prefetch = true; } else if (strcasecmp(argv[i], "-a") == 0 || strcasecmp(argv[i], "--access") == 0) { i++; if (i == argc) { error = 1; break; } @@ -229,6 +233,7 @@ Experiment::parse_args(int argc, char* argv[]) printf(" [-n|--numa] <placement> # numa placement\n"); printf(" [-s|--seconds] <number> # run each experiment for <number> seconds\n"); printf(" [-b|--busy] <number> # how much processing cycles each loop should count\n"); + printf(" [-f|--prefetch] # prefetch data\n"); printf(" [-x|--strict] # fail rather than adjust options to sensible values\n"); printf("\n"); printf("<pattern> is selected from the following:\n"); diff --git a/src/Experiment.h b/src/Experiment.h index 4c0ae90..38756f0 100644 --- a/src/Experiment.h +++ b/src/Experiment.h @@ -44,6 +44,7 @@ public: int64 num_threads; // number of threads in the experiment int64 bytes_per_test; // test working set size (bytes) int64 busy_cycles; // processing cycles + 
bool prefetch; // use of prefetching float seconds; // number of seconds per experiment int64 iterations; // number of iterations per experiment @@ -91,6 +92,7 @@ public: const static int32 DEFAULT_EXPERIMENTS = 1; const static int32 DEFAULT_OUTPUT_MODE = 1; + const static bool DEFAULT_PREFETCH = false; void alloc_local(); void alloc_xor(); diff --git a/src/Run.cpp b/src/Run.cpp index 1ea4db3..049a115 100644 --- a/src/Run.cpp +++ b/src/Run.cpp @@ -27,14 +27,16 @@ static double max( double v1, double v2 ); static double min( double v1, double v2 ); -static void chase_pointers(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles); -static void follow_streams(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles); -static void (*run_benchmark)(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles) = chase_pointers; +static void chase_pointers(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles, bool prefetch); +static void follow_streams(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles, bool prefetch); +static void (*run_benchmark)(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles, bool prefetch) = chase_pointers; Lock Run::global_mutex; int64 Run::_ops_per_chain = 0; double Run::_seconds = 1E9; +#define prefetch(x) __builtin_prefetch(x) + Run::Run() : exp(NULL), bp(NULL) { @@ -120,7 +122,7 @@ Run::run() this->bp->barrier(); // chase pointers - run_benchmark(this->exp->chains_per_thread, iters, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, 
this->exp->busy_cycles); + run_benchmark(this->exp->chains_per_thread, iters, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, this->exp->busy_cycles, this->exp->prefetch); // barrier this->bp->barrier(); @@ -156,7 +158,7 @@ Run::run() this->bp->barrier(); // chase pointers - run_benchmark(this->exp->chains_per_thread, this->exp->iterations, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, this->exp->busy_cycles); + run_benchmark(this->exp->chains_per_thread, this->exp->iterations, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, this->exp->busy_cycles, this->exp->prefetch); // barrier this->bp->barrier(); @@ -349,7 +351,8 @@ chase_pointers( int64 bytes_per_line, // ignored int64 bytes_per_chain, // ignored int64 stride, // ignored - int64 busy_cycles // processing cycles + int64 busy_cycles, // processing cycles + bool prefetch // prefetch? ) { // chase pointers @@ -360,6 +363,8 @@ chase_pointers( Chain* a = root[0]; while (a != NULL) { a = a->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -373,6 +378,8 @@ chase_pointers( while (a != NULL) { a = a->next; b = b->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -389,6 +396,8 @@ chase_pointers( a = a->next; b = b->next; c = c->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -408,6 +417,8 @@ chase_pointers( b = b->next; c = c->next; d = d->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -430,6 +441,8 @@ chase_pointers( c = c->next; d = d->next; e = e->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -455,6 +468,8 @@ chase_pointers( d = d->next; e = e->next; f = f->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -483,6 +498,8 @@ 
chase_pointers( e = e->next; f = f->next; g = g->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -514,6 +531,8 @@ chase_pointers( f = f->next; g = g->next; h = h->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -548,6 +567,8 @@ chase_pointers( g = g->next; h = h->next; j = j->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -585,6 +606,8 @@ chase_pointers( h = h->next; j = j->next; k = k->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -625,6 +648,8 @@ chase_pointers( j = j->next; k = k->next; l = l->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -668,6 +693,8 @@ chase_pointers( k = k->next; l = l->next; m = m->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -714,6 +741,8 @@ chase_pointers( l = l->next; m = m->next; n = n->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -763,6 +792,8 @@ chase_pointers( m = m->next; n = n->next; o = o->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -815,6 +846,8 @@ chase_pointers( n = n->next; o = o->next; p = p->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -870,6 +903,8 @@ chase_pointers( o = o->next; p = p->next; q = q->next; + if (prefetch) + prefetch(a->next); for (int64 j=0; j < busy_cycles; j++) asm("nop"); } @@ -939,7 +974,8 @@ follow_streams( int64 bytes_per_line, // ignored int64 bytes_per_chain, // ignored int64 stride, // ignored - int64 busy_cycles // ignored + int64 busy_cycles, // ignored + bool prefetch // ignored ) { int64 refs_per_line = bytes_per_line / sizeof(double); |