summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Friesel <derf@finalrewind.org>2018-03-12 16:02:01 +0100
committerDaniel Friesel <derf@finalrewind.org>2018-03-12 16:02:01 +0100
commit16b17dd34a0c1602b7df842af3126d8bd4d4a042 (patch)
tree2e7d28a99a704cff6ee956fa6d067e8292d2d7ff
parent83bc676dd6247bca3bc1901401469d23f4768992 (diff)
optional outlier handling... not sure if useful yet
-rwxr-xr-xbin/analyze-archive.py10
-rwxr-xr-xlib/dfatool.py40
2 files changed, 47 insertions, 3 deletions
diff --git a/bin/analyze-archive.py b/bin/analyze-archive.py
index e3a44bf..a9ef3bc 100755
--- a/bin/analyze-archive.py
+++ b/bin/analyze-archive.py
@@ -40,10 +40,11 @@ def model_quality_table(result_lists, info_list):
if __name__ == '__main__':
ignored_trace_indexes = None
+ discard_outliers = None
try:
raw_opts, args = getopt.getopt(sys.argv[1:], "",
- 'plot ignored-trace-indexes='.split(' '))
+ 'plot ignored-trace-indexes= discard-outliers='.split(' '))
for option, parameter in raw_opts:
optname = re.sub(r'^--', '', option)
@@ -54,6 +55,9 @@ if __name__ == '__main__':
if 0 in ignored_trace_indexes:
print('[E] arguments to --ignored-trace-indexes start from 1')
+ if 'discard-outliers' in opts:
+ discard_outliers = float(opts['discard-outliers'])
+
except getopt.GetoptError as err:
print(err)
sys.exit(2)
@@ -61,7 +65,9 @@ if __name__ == '__main__':
raw_data = RawData(args)
preprocessed_data = raw_data.get_preprocessed_data()
- model = EnergyModel(preprocessed_data, ignore_trace_indexes = ignored_trace_indexes)
+ model = EnergyModel(preprocessed_data,
+ ignore_trace_indexes = ignored_trace_indexes,
+ discard_outliers = discard_outliers)
print('--- simple static model ---')
static_model = model.get_static()
diff --git a/lib/dfatool.py b/lib/dfatool.py
index 419a214..b34af2e 100755
--- a/lib/dfatool.py
+++ b/lib/dfatool.py
@@ -305,6 +305,7 @@ class RawData:
paramvalue.extend(map(soft_cast_int, online_trace_part['args']))
if not 'offline_aggregates' in online_trace_part:
+ online_trace_part['offline_attributes'] = ['power', 'duration', 'energy']
online_trace_part['offline_aggregates'] = {
'power' : [],
'duration' : [],
@@ -314,6 +315,7 @@ class RawData:
'param': [],
}
if online_trace_part['isa'] == 'transition':
+ online_trace_part['offline_attributes'].extend(['rel_energy_prev', 'rel_energy_next', 'timeout'])
online_trace_part['offline_aggregates']['rel_energy_prev'] = []
online_trace_part['offline_aggregates']['rel_energy_next'] = []
online_trace_part['offline_aggregates']['timeout'] = []
@@ -737,7 +739,7 @@ def _mean_std_by_param(by_param, state_or_tran, key, param_index):
class EnergyModel:
- def __init__(self, preprocessed_data, ignore_trace_indexes = None):
+ def __init__(self, preprocessed_data, ignore_trace_indexes = None, discard_outliers = None):
self.traces = preprocessed_data
self.by_name = {}
self.by_param = {}
@@ -746,6 +748,9 @@ class EnergyModel:
np.seterr('raise')
self._parameter_names = sorted(self.traces[0]['trace'][0]['parameter'].keys())
self._num_args = {}
+ self._outlier_threshold = discard_outliers
+ if discard_outliers != None:
+ self._compute_outlier_stats(ignore_trace_indexes, discard_outliers)
for run in self.traces:
if ignore_trace_indexes == None or int(run['id']) not in ignore_trace_indexes:
for i, elem in enumerate(run['trace']):
@@ -756,6 +761,26 @@ class EnergyModel:
self._aggregate_to_ndarray(self.by_name)
self._compute_all_param_statistics()
+ def _compute_outlier_stats(self, ignore_trace_indexes, threshold):
+ tmp_by_param = {}
+ self.median_by_param = {}
+ for run in self.traces:
+ if ignore_trace_indexes == None or int(run['id']) not in ignore_trace_indexes:
+ for i, elem in enumerate(run['trace']):
+ key = (elem['name'], tuple(_elem_param_and_arg_list(elem)))
+ if not key in tmp_by_param:
+ tmp_by_param[key] = {}
+ for attribute in elem['offline_attributes']:
+ tmp_by_param[key][attribute] = []
+ for attribute in elem['offline_attributes']:
+ tmp_by_param[key][attribute].extend(elem['offline_aggregates'][attribute])
+ for key, elem in tmp_by_param.items():
+ if not key in self.median_by_param:
+ self.median_by_param[key] = {}
+ for attribute in tmp_by_param[key].keys():
+ self.median_by_param[key][attribute] = np.median(tmp_by_param[key][attribute])
+
+
def _compute_all_param_statistics(self):
queue = []
for state_or_trans in self.by_name.keys():
@@ -796,6 +821,17 @@ class EnergyModel:
for key in elem['attributes']:
elem[key] = np.array(elem[key])
+ def _prune_outliers(self, key, attribute, data):
+ if self._outlier_threshold == None:
+ return data
+ median = self.median_by_param[key][attribute]
+ if np.median(np.abs(data - median)) == 0:
+ return data
+ pruned_data = list(filter(lambda x: np.abs(0.6745 * (x - median) / np.median(np.abs(data - median))) > self._outlier_threshold, data ))
+ if len(pruned_data):
+ print('[I] Pruned outliers from ({}) {}: {}'.format(key, attribute, pruned_data))
+ data = list(filter(lambda x: np.abs(0.6745 * (x - median) / np.median(np.abs(data - median))) <= self._outlier_threshold, data ))
+ return data
def _add_data_to_aggregate(self, aggregate, key, element):
if not key in aggregate:
@@ -812,6 +848,8 @@ class EnergyModel:
if element['plan']['level'] == 'epilogue':
aggregate[key]['attributes'].insert(0, 'timeout')
for datakey, dataval in element['offline_aggregates'].items():
+ if datakey in element['offline_attributes']:
+ dataval = self._prune_outliers((element['name'], tuple(_elem_param_and_arg_list(element))), datakey, dataval)
aggregate[key][datakey].extend(dataval)
def _load_agg_elem(self, name, elem):