From eb634401bfa4f93154eeb6265f100fd9db2bf7d4 Mon Sep 17 00:00:00 2001 From: Daniel Friesel Date: Mon, 7 Oct 2019 09:48:37 +0200 Subject: move ParamStats and helper functions to lib/parameters.py --- lib/dfatool.py | 186 +-------------------------------------------------------- 1 file changed, 2 insertions(+), 184 deletions(-) (limited to 'lib/dfatool.py') diff --git a/lib/dfatool.py b/lib/dfatool.py index cc6b32e..3151c65 100755 --- a/lib/dfatool.py +++ b/lib/dfatool.py @@ -15,7 +15,8 @@ from multiprocessing import Pool from automata import PTA from functions import analytic from functions import AnalyticFunction -from utils import vprint, is_numeric, soft_cast_int, param_slice_eq, compute_param_statistics, remove_index_from_tuple, is_power_of_two, distinct_param_values +from parameters import ParamStats +from utils import vprint, is_numeric, soft_cast_int, param_slice_eq, remove_index_from_tuple, is_power_of_two arg_support_enabled = True @@ -370,189 +371,6 @@ def _preprocess_measurement(measurement): return processed_data -class ParamStats: - - def __init__(self, by_name, by_param, parameter_names, arg_count, use_corrcoef = False, verbose = False): - """ - Compute standard deviation and correlation coefficient on parameterized data partitions. - - It is strongly recommended to vary all parameter values evenly. - For instance, given two parameters, providing only the combinations - (1, 1), (5, 1), (7, 1,) (10, 1), (1, 2), (1, 6) will lead to bogus results. - It is better to provide (1, 1), (5, 1), (1, 2), (5, 2), ... (i.e. a cross product of all individual parameter values) - - arguments: - by_name -- ground truth partitioned by state/transition name. - by_name[state_or_trans][attribute] must be a list or 1-D numpy array. - by_name[state_or_trans]['param'] must be a list of parameter values - corresponding to the ground truth, e.g. [[1, 2, 3], ...] if the - first ground truth element has the (lexically) first parameter set to 1, - the second to 2 and the third to 3. - by_param -- ground truth partitioned by state/transition name and parameters. - by_name[(state_or_trans, *)][attribute] must be a list or 1-D numpy array. - parameter_names -- list of parameter names, must have the same order as the parameter - values in by_param (lexical sorting is recommended). - arg_count -- dict providing the number of functions args ("local parameters") for each function. - use_corrcoef -- use correlation coefficient instead of stddev heuristic for parameter detection - """ - self.stats = dict() - self.use_corrcoef = use_corrcoef - self._parameter_names = parameter_names - # Note: This is deliberately single-threaded. The overhead incurred - # by multiprocessing is higher than the speed gained by parallel - # computation of statistics measures. - for state_or_tran in by_name.keys(): - self.stats[state_or_tran] = dict() - for attribute in by_name[state_or_tran]['attributes']: - self.stats[state_or_tran][attribute] = compute_param_statistics(by_name, by_param, parameter_names, arg_count, state_or_tran, attribute, verbose = verbose) - - def _generic_param_independence_ratio(self, state_or_trans, attribute): - """ - Return the heuristic ratio of parameter independence for state_or_trans and attribute. - - This is not supported if the correlation coefficient is used. - A value close to 1 means no influence, a value close to 0 means high probability of influence. - """ - statistics = self.stats[state_or_trans][attribute] - if self.use_corrcoef: - # not supported - raise ValueError - if statistics['std_static'] == 0: - return 0 - return statistics['std_param_lut'] / statistics['std_static'] - - def generic_param_dependence_ratio(self, state_or_trans, attribute): - """ - Return the heuristic ratio of parameter dependence for state_or_trans and attribute. - - This is not supported if the correlation coefficient is used. - A value close to 0 means no influence, a value close to 1 means high probability of influence. - """ - return 1 - self._generic_param_independence_ratio(state_or_trans, attribute) - - def _reduce_param_matrix(self, matrix: np.ndarray, parameter_names: list) -> list: - """ - :param matrix: parameter dependence matrix, M[(...)] == 1 iff (model attribute) is influenced by (parameter) for other parameter value indxe == (...) - :param parameter_names: names of parameters in the order in which they appear in the matrix index. The first entry corresponds to the first axis, etc. - :returns: parameters which determine whether (parameter) has an effect on (model attribute). If a parameter is not part of this list, its value does not - affect (parameter)'s influence on (model attribute) -- it either always or never has an influence - """ - if np.all(matrix == True) or np.all(matrix == False): - return list() - - if not is_power_of_two(np.count_nonzero(matrix)): - # cannot be reliably reduced to a list of parameters - return list() - - if np.count_nonzero(matrix) == 1: - influential_parameters = list() - for i, parameter_name in enumerate(parameter_names): - if matrix.shape[i] > 1: - influential_parameters.append(parameter_name) - return influential_parameters - - for axis in range(matrix.ndim): - candidate = self._reduce_param_matrix(np.all(matrix, axis=axis), remove_index_from_tuple(parameter_names, axis)) - if len(candidate): - return candidate - - return list() - - def _get_codependent_parameters(self, stats, param): - """ - Return list of parameters which affect whether `param` influences the model attribute described in `stats` or not. - """ - safe_div = np.vectorize(lambda x,y: 0. if x == 0 else 1 - x/y) - ratio_by_value = safe_div(stats['lut_by_param_values'][param], stats['std_by_param_values'][param]) - err_mode = np.seterr('ignore') - dep_by_value = ratio_by_value > 0.5 - np.seterr(**err_mode) - - other_param_list = list(filter(lambda x: x != param, self._parameter_names)) - influencer_parameters = self._reduce_param_matrix(dep_by_value, other_param_list) - return influencer_parameters - - def _param_independence_ratio(self, state_or_trans: str, attribute: str, param: str) -> float: - """ - Return the heuristic ratio of parameter independence for state_or_trans, attribute, and param. - - A value close to 1 means no influence, a value close to 0 means high probability of influence. - """ - statistics = self.stats[state_or_trans][attribute] - if self.use_corrcoef: - return 1 - np.abs(statistics['corr_by_param'][param]) - if statistics['std_by_param'][param] == 0: - if statistics['std_param_lut'] != 0: - raise RuntimeError("wat") - # In general, std_param_lut < std_by_param. So, if std_by_param == 0, std_param_lut == 0 follows. - # This means that the variation of param does not affect the model quality -> no influence, return 1 - return 1. - - return statistics['std_param_lut'] / statistics['std_by_param'][param] - - def param_dependence_ratio(self, state_or_trans: str, attribute: str, param: str) -> float: - """ - Return the heuristic ratio of parameter dependence for state_or_trans, attribute, and param. - - A value close to 0 means no influence, a value close to 1 means high probability of influence. - - :param state_or_trans: state or transition name - :param attribute: model attribute - :param param: parameter name - - :returns: parameter dependence (float between 0 == no influence and 1 == high probability of influence) - """ - return 1 - self._param_independence_ratio(state_or_trans, attribute, param) - - def reverse_dependent_parameters(self, state_or_trans: str, attribute: str, param: str) -> list: - """ - Return parameters whose value influences whether `attribute` of `state_or_trans` depends on `param` or not. - - For example, a radio's TX POWER is only influenced by the packet length if dynamically sized payloads are enabled. - So reverse_dependent_parameters('TX', 'POWER', 'packet_length') == ['dynamic_payload_size']. - - :param state_or_trans: state or transition name - :param attribute: model attribute - :param param: parameter name - - :returns: list of parameters - """ - return self._get_codependent_parameters(self.stats[state_or_trans][attribute], param) - - def _arg_independence_ratio(self, state_or_trans, attribute, arg_index): - statistics = self.stats[state_or_trans][attribute] - if self.use_corrcoef: - return 1 - np.abs(statistics['corr_by_arg'][arg_index]) - if statistics['std_by_arg'][arg_index] == 0: - if statistics['std_param_lut'] != 0: - raise RuntimeError("wat") - # In general, std_param_lut < std_by_arg. So, if std_by_arg == 0, std_param_lut == 0 follows. - # This means that the variation of arg does not affect the model quality -> no influence, return 1 - return 1 - return statistics['std_param_lut'] / statistics['std_by_arg'][arg_index] - - def arg_dependence_ratio(self, state_or_trans: str, attribute: str, arg_index: int) -> float: - return 1 - self._arg_independence_ratio(state_or_trans, attribute, arg_index) - - # This heuristic is very similar to the "function is not much better than - # median" checks in get_fitted. So far, doing it here as well is mostly - # a performance and not an algorithm quality decision. - # --df, 2018-04-18 - def depends_on_param(self, state_or_trans, attribute, param): - """Return whether attribute of state_or_trans depens on param.""" - if self.use_corrcoef: - return self.param_dependence_ratio(state_or_trans, attribute, param) > 0.1 - else: - return self.param_dependence_ratio(state_or_trans, attribute, param) > 0.5 - - # See notes on depends_on_param - def depends_on_arg(self, state_or_trans, attribute, arg_index): - """Return whether attribute of state_or_trans depens on arg_index.""" - if self.use_corrcoef: - return self.arg_dependence_ratio(state_or_trans, attribute, arg_index) > 0.1 - else: - return self.arg_dependence_ratio(state_or_trans, attribute, arg_index) > 0.5 - class TimingData: """ Loader for timing model traces measured with on-board timers using `harness.OnboardTimerHarness`. -- cgit v1.2.3