diff options
author | Daniel Friesel <derf@finalrewind.org> | 2018-03-12 16:02:01 +0100 |
---|---|---|
committer | Daniel Friesel <derf@finalrewind.org> | 2018-03-12 16:02:01 +0100 |
commit | 16b17dd34a0c1602b7df842af3126d8bd4d4a042 (patch) | |
tree | 2e7d28a99a704cff6ee956fa6d067e8292d2d7ff | |
parent | 83bc676dd6247bca3bc1901401469d23f4768992 (diff) |
optional outlier handling... not sure if useful yet
-rwxr-xr-x | bin/analyze-archive.py | 10 | ||||
-rwxr-xr-x | lib/dfatool.py | 40 |
2 files changed, 47 insertions, 3 deletions
diff --git a/bin/analyze-archive.py b/bin/analyze-archive.py index e3a44bf..a9ef3bc 100755 --- a/bin/analyze-archive.py +++ b/bin/analyze-archive.py @@ -40,10 +40,11 @@ def model_quality_table(result_lists, info_list): if __name__ == '__main__': ignored_trace_indexes = None + discard_outliers = None try: raw_opts, args = getopt.getopt(sys.argv[1:], "", - 'plot ignored-trace-indexes='.split(' ')) + 'plot ignored-trace-indexes= discard-outliers='.split(' ')) for option, parameter in raw_opts: optname = re.sub(r'^--', '', option) @@ -54,6 +55,9 @@ if __name__ == '__main__': if 0 in ignored_trace_indexes: print('[E] arguments to --ignored-trace-indexes start from 1') + if 'discard-outliers' in opts: + discard_outliers = float(opts['discard-outliers']) + except getopt.GetoptError as err: print(err) sys.exit(2) @@ -61,7 +65,9 @@ if __name__ == '__main__': raw_data = RawData(args) preprocessed_data = raw_data.get_preprocessed_data() - model = EnergyModel(preprocessed_data, ignore_trace_indexes = ignored_trace_indexes) + model = EnergyModel(preprocessed_data, + ignore_trace_indexes = ignored_trace_indexes, + discard_outliers = discard_outliers) print('--- simple static model ---') static_model = model.get_static() diff --git a/lib/dfatool.py b/lib/dfatool.py index 419a214..b34af2e 100755 --- a/lib/dfatool.py +++ b/lib/dfatool.py @@ -305,6 +305,7 @@ class RawData: paramvalue.extend(map(soft_cast_int, online_trace_part['args'])) if not 'offline_aggregates' in online_trace_part: + online_trace_part['offline_attributes'] = ['power', 'duration', 'energy'] online_trace_part['offline_aggregates'] = { 'power' : [], 'duration' : [], @@ -314,6 +315,7 @@ class RawData: 'param': [], } if online_trace_part['isa'] == 'transition': + online_trace_part['offline_attributes'].extend(['rel_energy_prev', 'rel_energy_next', 'timeout']) online_trace_part['offline_aggregates']['rel_energy_prev'] = [] online_trace_part['offline_aggregates']['rel_energy_next'] = [] online_trace_part['offline_aggregates']['timeout'] = [] @@ -737,7 +739,7 @@ def _mean_std_by_param(by_param, state_or_tran, key, param_index): class EnergyModel: - def __init__(self, preprocessed_data, ignore_trace_indexes = None): + def __init__(self, preprocessed_data, ignore_trace_indexes = None, discard_outliers = None): self.traces = preprocessed_data self.by_name = {} self.by_param = {} @@ -746,6 +748,9 @@ class EnergyModel: np.seterr('raise') self._parameter_names = sorted(self.traces[0]['trace'][0]['parameter'].keys()) self._num_args = {} + self._outlier_threshold = discard_outliers + if discard_outliers != None: + self._compute_outlier_stats(ignore_trace_indexes, discard_outliers) for run in self.traces: if ignore_trace_indexes == None or int(run['id']) not in ignore_trace_indexes: for i, elem in enumerate(run['trace']): @@ -756,6 +761,26 @@ class EnergyModel: self._aggregate_to_ndarray(self.by_name) self._compute_all_param_statistics() + def _compute_outlier_stats(self, ignore_trace_indexes, threshold): + tmp_by_param = {} + self.median_by_param = {} + for run in self.traces: + if ignore_trace_indexes == None or int(run['id']) not in ignore_trace_indexes: + for i, elem in enumerate(run['trace']): + key = (elem['name'], tuple(_elem_param_and_arg_list(elem))) + if not key in tmp_by_param: + tmp_by_param[key] = {} + for attribute in elem['offline_attributes']: + tmp_by_param[key][attribute] = [] + for attribute in elem['offline_attributes']: + tmp_by_param[key][attribute].extend(elem['offline_aggregates'][attribute]) + for key, elem in tmp_by_param.items(): + if not key in self.median_by_param: + self.median_by_param[key] = {} + for attribute in tmp_by_param[key].keys(): + self.median_by_param[key][attribute] = np.median(tmp_by_param[key][attribute]) + + def _compute_all_param_statistics(self): queue = [] for state_or_trans in self.by_name.keys(): @@ -796,6 +821,17 @@ class EnergyModel: for key in elem['attributes']: elem[key] = np.array(elem[key]) + def _prune_outliers(self, key, attribute, data): + if self._outlier_threshold == None: + return data + median = self.median_by_param[key][attribute] + if np.median(np.abs(data - median)) == 0: + return data + pruned_data = list(filter(lambda x: np.abs(0.6745 * (x - median) / np.median(np.abs(data - median))) > self._outlier_threshold, data )) + if len(pruned_data): + print('[I] Pruned outliers from ({}) {}: {}'.format(key, attribute, pruned_data)) + data = list(filter(lambda x: np.abs(0.6745 * (x - median) / np.median(np.abs(data - median))) <= self._outlier_threshold, data )) + return data def _add_data_to_aggregate(self, aggregate, key, element): if not key in aggregate: @@ -812,6 +848,8 @@ class EnergyModel: if element['plan']['level'] == 'epilogue': aggregate[key]['attributes'].insert(0, 'timeout') for datakey, dataval in element['offline_aggregates'].items(): + if datakey in element['offline_attributes']: + dataval = self._prune_outliers((element['name'], tuple(_elem_param_and_arg_list(element))), datakey, dataval) aggregate[key][datakey].extend(dataval) def _load_agg_elem(self, name, elem): |