summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Tim Besard <tim.besard@gmail.com> 2011-10-31 20:20:14 +0100
committer: Tim Besard <tim.besard@gmail.com> 2011-10-31 20:20:14 +0100
commit: e91666fd65362039e5cf3d7f3cb400ec3633c448 (patch)
tree: c5b1ed966155f8b42cfcd96543d0e08c81cc5cf5
parent: f8ecbea68082f8f5cdfb088a783b2e46b63e9f23 (diff)
Adding prefetching.
-rw-r--r-- src/Experiment.cpp 7
-rw-r--r-- src/Experiment.h 2
-rw-r--r-- src/Run.cpp 50
3 files changed, 51 insertions, 8 deletions
diff --git a/src/Experiment.cpp b/src/Experiment.cpp
index 04fb9ef..e9fbdfa 100644
--- a/src/Experiment.cpp
+++ b/src/Experiment.cpp
@@ -40,7 +40,8 @@ Experiment::Experiment() :
busy_cycles (DEFAULT_BUSY_CYCLES),
seconds (DEFAULT_SECONDS),
iterations (DEFAULT_ITERATIONS),
- experiments (DEFAULT_EXPERIMENTS),
+ experiments (DEFAULT_EXPERIMENTS),
+ prefetch (DEFAULT_PREFETCH),
output_mode (TABLE),
access_pattern (RANDOM),
stride (1),
@@ -68,6 +69,7 @@ Experiment::~Experiment()
// -i or --iters iterations
// -e or --experiments experiments
// -b or --busy amount of cycles processor should remain busy
+ // -f or --prefetch prefetch data
// -a or --access memory access pattern
// random random access pattern
// forward <stride> exclusive OR and mask
@@ -137,6 +139,8 @@ Experiment::parse_args(int argc, char* argv[])
if (i == argc) { error = 1; break; }
this->busy_cycles = Experiment::parse_number(argv[i]);
if (this->experiments == 0) { error = 1; break; }
+ } else if (strcasecmp(argv[i], "-f") == 0 || strcasecmp(argv[i], "--prefetch") == 0) {
+ this->prefetch = true;
} else if (strcasecmp(argv[i], "-a") == 0 || strcasecmp(argv[i], "--access") == 0) {
i++;
if (i == argc) { error = 1; break; }
@@ -229,6 +233,7 @@ Experiment::parse_args(int argc, char* argv[])
printf(" [-n|--numa] <placement> # numa placement\n");
printf(" [-s|--seconds] <number> # run each experiment for <number> seconds\n");
printf(" [-b|--busy] <number> # how much processing cycles each loop should count\n");
+ printf(" [-f|--prefetch] # prefetch data\n");
printf(" [-x|--strict] # fail rather than adjust options to sensible values\n");
printf("\n");
printf("<pattern> is selected from the following:\n");
diff --git a/src/Experiment.h b/src/Experiment.h
index 4c0ae90..38756f0 100644
--- a/src/Experiment.h
+++ b/src/Experiment.h
@@ -44,6 +44,7 @@ public:
int64 num_threads; // number of threads in the experiment
int64 bytes_per_test; // test working set size (bytes)
int64 busy_cycles; // processing cycles
+ bool prefetch; // use of prefetching
float seconds; // number of seconds per experiment
int64 iterations; // number of iterations per experiment
@@ -91,6 +92,7 @@ public:
const static int32 DEFAULT_EXPERIMENTS = 1;
const static int32 DEFAULT_OUTPUT_MODE = 1;
+ const static bool DEFAULT_PREFETCH = false;
void alloc_local();
void alloc_xor();
diff --git a/src/Run.cpp b/src/Run.cpp
index 1ea4db3..049a115 100644
--- a/src/Run.cpp
+++ b/src/Run.cpp
@@ -27,14 +27,16 @@
static double max( double v1, double v2 );
static double min( double v1, double v2 );
-static void chase_pointers(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles);
-static void follow_streams(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles);
-static void (*run_benchmark)(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles) = chase_pointers;
+static void chase_pointers(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles, bool prefetch);
+static void follow_streams(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles, bool prefetch);
+static void (*run_benchmark)(int64 chains_per_thread, int64 iterations, Chain** root, int64 bytes_per_line, int64 bytes_per_chain, int64 stride, int64 busy_cycles, bool prefetch) = chase_pointers;
Lock Run::global_mutex;
int64 Run::_ops_per_chain = 0;
double Run::_seconds = 1E9;
+#define prefetch(x) __builtin_prefetch(x)
+
Run::Run()
: exp(NULL), bp(NULL)
{
@@ -120,7 +122,7 @@ Run::run()
this->bp->barrier();
// chase pointers
- run_benchmark(this->exp->chains_per_thread, iters, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, this->exp->busy_cycles);
+ run_benchmark(this->exp->chains_per_thread, iters, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, this->exp->busy_cycles, this->exp->prefetch);
// barrier
this->bp->barrier();
@@ -156,7 +158,7 @@ Run::run()
this->bp->barrier();
// chase pointers
- run_benchmark(this->exp->chains_per_thread, this->exp->iterations, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, this->exp->busy_cycles);
+ run_benchmark(this->exp->chains_per_thread, this->exp->iterations, root, this->exp->bytes_per_line, this->exp->bytes_per_chain, this->exp->stride, this->exp->busy_cycles, this->exp->prefetch);
// barrier
this->bp->barrier();
@@ -349,7 +351,8 @@ chase_pointers(
int64 bytes_per_line, // ignored
int64 bytes_per_chain, // ignored
int64 stride, // ignored
- int64 busy_cycles // processing cycles
+ int64 busy_cycles, // processing cycles
+ bool prefetch // prefetch?
)
{
// chase pointers
@@ -360,6 +363,8 @@ chase_pointers(
Chain* a = root[0];
while (a != NULL) {
a = a->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -373,6 +378,8 @@ chase_pointers(
while (a != NULL) {
a = a->next;
b = b->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -389,6 +396,8 @@ chase_pointers(
a = a->next;
b = b->next;
c = c->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -408,6 +417,8 @@ chase_pointers(
b = b->next;
c = c->next;
d = d->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -430,6 +441,8 @@ chase_pointers(
c = c->next;
d = d->next;
e = e->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -455,6 +468,8 @@ chase_pointers(
d = d->next;
e = e->next;
f = f->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -483,6 +498,8 @@ chase_pointers(
e = e->next;
f = f->next;
g = g->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -514,6 +531,8 @@ chase_pointers(
f = f->next;
g = g->next;
h = h->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -548,6 +567,8 @@ chase_pointers(
g = g->next;
h = h->next;
j = j->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -585,6 +606,8 @@ chase_pointers(
h = h->next;
j = j->next;
k = k->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -625,6 +648,8 @@ chase_pointers(
j = j->next;
k = k->next;
l = l->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -668,6 +693,8 @@ chase_pointers(
k = k->next;
l = l->next;
m = m->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -714,6 +741,8 @@ chase_pointers(
l = l->next;
m = m->next;
n = n->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -763,6 +792,8 @@ chase_pointers(
m = m->next;
n = n->next;
o = o->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -815,6 +846,8 @@ chase_pointers(
n = n->next;
o = o->next;
p = p->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -870,6 +903,8 @@ chase_pointers(
o = o->next;
p = p->next;
q = q->next;
+ if (prefetch)
+ prefetch(a->next);
for (int64 j=0; j < busy_cycles; j++)
asm("nop");
}
@@ -939,7 +974,8 @@ follow_streams(
int64 bytes_per_line, // ignored
int64 bytes_per_chain, // ignored
int64 stride, // ignored
- int64 busy_cycles // ignored
+ int64 busy_cycles, // ignored
+ bool prefetch // ignored
)
{
int64 refs_per_line = bytes_per_line / sizeof(double);