move ParamStats and helper functions to lib/parameters.py

author: Daniel Friesel <daniel.friesel@uos.de> 2019-10-07 09:48:37 +0200
committer: Daniel Friesel <daniel.friesel@uos.de> 2019-10-07 09:48:37 +0200
commit: eb634401bfa4f93154eeb6265f100fd9db2bf7d4 (patch)
tree: 6adfbf12d55d6f3f925dfa672207352734b30186 /lib/dfatool.py
parent: 8b496797773a95bac66d76acc0d4dfee53f70ff7 (diff)
1 files changed, 2 insertions, 184 deletions
diff --git a/lib/dfatool.py b/lib/dfatool.py
index cc6b32e..3151c65 100755
--- a/lib/dfatool.py
+++ b/lib/dfatool.py
@@ -15,7 +15,8 @@ from multiprocessing import Pool
 from automata import PTA
 from functions import analytic
 from functions import AnalyticFunction
-from utils import vprint, is_numeric, soft_cast_int, param_slice_eq, compute_param_statistics, remove_index_from_tuple, is_power_of_two, distinct_param_values
+from parameters import ParamStats
+from utils import vprint, is_numeric, soft_cast_int, param_slice_eq, remove_index_from_tuple, is_power_of_two
 
 arg_support_enabled = True
 
@@ -370,189 +371,6 @@ def _preprocess_measurement(measurement):
 
     return processed_data
 
-class ParamStats:
-
-    def __init__(self, by_name, by_param, parameter_names, arg_count, use_corrcoef = False, verbose = False):
-        """
-        Compute standard deviation and correlation coefficient on parameterized data partitions.
-
-        It is strongly recommended to vary all parameter values evenly.
-        For instance, given two parameters, providing only the combinations
-        (1, 1), (5, 1), (7, 1,) (10, 1), (1, 2), (1, 6) will lead to bogus results.
-        It is better to provide (1, 1), (5, 1), (1, 2), (5, 2), ... (i.e. a cross product of all individual parameter values)
-
-        arguments:
-        by_name -- ground truth partitioned by state/transition name.
-            by_name[state_or_trans][attribute] must be a list or 1-D numpy array.
-            by_name[state_or_trans]['param'] must be a list of parameter values
-            corresponding to the ground truth, e.g. [[1, 2, 3], ...] if the
-            first ground truth element has the (lexically) first parameter set to 1,
-            the second to 2 and the third to 3.
-        by_param -- ground truth partitioned by state/transition name and parameters.
-            by_name[(state_or_trans, *)][attribute] must be a list or 1-D numpy array.
-        parameter_names -- list of parameter names, must have the same order as the parameter
-            values in by_param (lexical sorting is recommended).
-        arg_count -- dict providing the number of functions args ("local parameters") for each function.
-        use_corrcoef -- use correlation coefficient instead of stddev heuristic for parameter detection
-        """
-        self.stats = dict()
-        self.use_corrcoef = use_corrcoef
-        self._parameter_names = parameter_names
-        # Note: This is deliberately single-threaded. The overhead incurred
-        # by multiprocessing is higher than the speed gained by parallel
-        # computation of statistics measures.
-        for state_or_tran in by_name.keys():
-            self.stats[state_or_tran] = dict()
-            for attribute in by_name[state_or_tran]['attributes']:
-                self.stats[state_or_tran][attribute] = compute_param_statistics(by_name, by_param, parameter_names, arg_count, state_or_tran, attribute, verbose = verbose)
-
-    def _generic_param_independence_ratio(self, state_or_trans, attribute):
-        """
-        Return the heuristic ratio of parameter independence for state_or_trans and attribute.
-
-        This is not supported if the correlation coefficient is used.
-        A value close to 1 means no influence, a value close to 0 means high probability of influence.
-        """
-        statistics = self.stats[state_or_trans][attribute]
-        if self.use_corrcoef:
-            # not supported
-            raise ValueError
-        if statistics['std_static'] == 0:
-            return 0
-        return statistics['std_param_lut'] / statistics['std_static']
-
-    def generic_param_dependence_ratio(self, state_or_trans, attribute):
-        """
-        Return the heuristic ratio of parameter dependence for state_or_trans and attribute.
-
-        This is not supported if the correlation coefficient is used.
-        A value close to 0 means no influence, a value close to 1 means high probability of influence.
-        """
-        return 1 - self._generic_param_independence_ratio(state_or_trans, attribute)
-
-    def _reduce_param_matrix(self, matrix: np.ndarray, parameter_names: list) -> list:
-        """
-        :param matrix: parameter dependence matrix, M[(...)] == 1 iff (model attribute) is influenced by (parameter) for other parameter value indxe == (...)
-        :param parameter_names: names of parameters in the order in which they appear in the matrix index. The first entry corresponds to the first axis, etc.
-        :returns: parameters which determine whether (parameter) has an effect on (model attribute). If a parameter is not part of this list, its value does not
-            affect (parameter)'s influence on (model attribute) -- it either always or never has an influence
-        """
-        if np.all(matrix == True) or np.all(matrix == False):
-            return list()
-
-        if not is_power_of_two(np.count_nonzero(matrix)):
-            # cannot be reliably reduced to a list of parameters
-            return list()
-
-        if np.count_nonzero(matrix) == 1:
-            influential_parameters = list()
-            for i, parameter_name in enumerate(parameter_names):
-                if matrix.shape[i] > 1:
-                    influential_parameters.append(parameter_name)
-            return influential_parameters
-
-        for axis in range(matrix.ndim):
-            candidate = self._reduce_param_matrix(np.all(matrix, axis=axis), remove_index_from_tuple(parameter_names, axis))
-            if len(candidate):
-                return candidate
-
-        return list()
-
-    def _get_codependent_parameters(self, stats, param):
-        """
-        Return list of parameters which affect whether `param` influences the model attribute described in `stats` or not.
-        """
-        safe_div = np.vectorize(lambda x,y: 0. if x == 0 else 1 - x/y)
-        ratio_by_value = safe_div(stats['lut_by_param_values'][param], stats['std_by_param_values'][param])
-        err_mode = np.seterr('ignore')
-        dep_by_value = ratio_by_value > 0.5
-        np.seterr(**err_mode)
-
-        other_param_list = list(filter(lambda x: x != param, self._parameter_names))
-        influencer_parameters = self._reduce_param_matrix(dep_by_value, other_param_list)
-        return influencer_parameters
-
-    def _param_independence_ratio(self, state_or_trans: str, attribute: str, param: str) -> float:
-        """
-        Return the heuristic ratio of parameter independence for state_or_trans, attribute, and param.
-
-        A value close to 1 means no influence, a value close to 0 means high probability of influence.
-        """
-        statistics = self.stats[state_or_trans][attribute]
-        if self.use_corrcoef:
-            return 1 - np.abs(statistics['corr_by_param'][param])
-        if statistics['std_by_param'][param] == 0:
-            if statistics['std_param_lut'] != 0:
-                raise RuntimeError("wat")
-            # In general, std_param_lut < std_by_param. So, if std_by_param == 0, std_param_lut == 0 follows.
-            # This means that the variation of param does not affect the model quality -> no influence, return 1
-            return 1.
-
-        return statistics['std_param_lut'] / statistics['std_by_param'][param]
-
-    def param_dependence_ratio(self, state_or_trans: str, attribute: str, param: str) -> float:
-        """
-        Return the heuristic ratio of parameter dependence for state_or_trans, attribute, and param.
-
-        A value close to 0 means no influence, a value close to 1 means high probability of influence.
-
-        :param state_or_trans: state or transition name
-        :param attribute: model attribute
-        :param param: parameter name
-
-        :returns: parameter dependence (float between 0 == no influence and 1 == high probability of influence)
-        """
-        return 1 - self._param_independence_ratio(state_or_trans, attribute, param)
-
-    def reverse_dependent_parameters(self, state_or_trans: str, attribute: str, param: str) -> list:
-        """
-        Return parameters whose value influences whether `attribute` of `state_or_trans` depends on `param` or not.
-
-        For example, a radio's TX POWER is only influenced by the packet length if dynamically sized payloads are enabled.
-        So reverse_dependent_parameters('TX', 'POWER', 'packet_length') == ['dynamic_payload_size'].
-
-        :param state_or_trans: state or transition name
-        :param attribute: model attribute
-        :param param: parameter name
-
-        :returns: list of parameters
-        """
-        return self._get_codependent_parameters(self.stats[state_or_trans][attribute], param)
-
-    def _arg_independence_ratio(self, state_or_trans, attribute, arg_index):
-        statistics = self.stats[state_or_trans][attribute]
-        if self.use_corrcoef:
-            return 1 - np.abs(statistics['corr_by_arg'][arg_index])
-        if statistics['std_by_arg'][arg_index] == 0:
-            if statistics['std_param_lut'] != 0:
-                raise RuntimeError("wat")
-            # In general, std_param_lut < std_by_arg. So, if std_by_arg == 0, std_param_lut == 0 follows.
-            # This means that the variation of arg does not affect the model quality -> no influence, return 1
-            return 1
-        return statistics['std_param_lut'] / statistics['std_by_arg'][arg_index]
-
-    def arg_dependence_ratio(self, state_or_trans: str, attribute: str, arg_index: int) -> float:
-        return 1 - self._arg_independence_ratio(state_or_trans, attribute, arg_index)
-
-    # This heuristic is very similar to the "function is not much better than
-    # median" checks in get_fitted. So far, doing it here as well is mostly
-    # a performance and not an algorithm quality decision.
-    # --df, 2018-04-18
-    def depends_on_param(self, state_or_trans, attribute, param):
-        """Return whether attribute of state_or_trans depens on param."""
-        if self.use_corrcoef:
-            return self.param_dependence_ratio(state_or_trans, attribute, param) > 0.1
-        else:
-            return self.param_dependence_ratio(state_or_trans, attribute, param) > 0.5
-
-    # See notes on depends_on_param
-    def depends_on_arg(self, state_or_trans, attribute, arg_index):
-        """Return whether attribute of state_or_trans depens on arg_index."""
-        if self.use_corrcoef:
-            return self.arg_dependence_ratio(state_or_trans, attribute, arg_index) > 0.1
-        else:
-            return self.arg_dependence_ratio(state_or_trans, attribute, arg_index) > 0.5
-
 class TimingData:
     """
     Loader for timing model traces measured with on-board timers using `harness.OnboardTimerHarness`.
author	Daniel Friesel <daniel.friesel@uos.de>	2019-10-07 09:48:37 +0200
committer	Daniel Friesel <daniel.friesel@uos.de>	2019-10-07 09:48:37 +0200
commit	eb634401bfa4f93154eeb6265f100fd9db2bf7d4 (patch)
tree	6adfbf12d55d6f3f925dfa672207352734b30186 /lib/dfatool.py
parent	8b496797773a95bac66d76acc0d4dfee53f70ff7 (diff)