move ParamStats and helper functions to lib/parameters.py

author: Daniel Friesel <daniel.friesel@uos.de> 2019-10-07 09:48:37 +0200
committer: Daniel Friesel <daniel.friesel@uos.de> 2019-10-07 09:48:37 +0200
commit: eb634401bfa4f93154eeb6265f100fd9db2bf7d4 (patch)
tree: 6adfbf12d55d6f3f925dfa672207352734b30186 /lib/parameters.py
parent: 8b496797773a95bac66d76acc0d4dfee53f70ff7 (diff)
1 files changed, 439 insertions, 0 deletions
diff --git a/lib/parameters.py b/lib/parameters.py
new file mode 100644
index 0000000..4dc9c3a
--- /dev/null
+++ b/lib/parameters.py
@@ -0,0 +1,439 @@
+import itertools
+import numpy as np
+from utils import remove_index_from_tuple, is_numeric
+
+def compute_param_statistics(by_name, by_param, parameter_names, arg_count, state_or_trans, attribute, verbose = False):
+    """
+    Compute standard deviation and correlation coefficient for various data partitions.
+
+    It is strongly recommended to vary all parameter values evenly across partitions.
+    For instance, given two parameters, providing only the combinations
+    (1, 1), (5, 1), (7, 1,) (10, 1), (1, 2), (1, 6) will lead to bogus results.
+    It is better to provide (1, 1), (5, 1), (1, 2), (5, 2), ... (i.e. a cross product of all individual parameter values)
+
+    :param by_name: ground truth partitioned by state/transition name.
+        by_name[state_or_trans][attribute] must be a list or 1-D numpy array.
+        by_name[state_or_trans]['param'] must be a list of parameter values
+        corresponding to the ground truth, e.g. [[1, 2, 3], ...] if the
+        first ground truth element has the (lexically) first parameter set to 1,
+        the second to 2 and the third to 3.
+    :param by_param: ground truth partitioned by state/transition name and parameters.
+        by_name[(state_or_trans, *)][attribute] must be a list or 1-D numpy array.
+    :param parameter_names: list of parameter names, must have the same order as the parameter
+        values in by_param (lexical sorting is recommended).
+    :param arg_count: dict providing the number of functions args ("local parameters") for each function.
+    :param state_or_trans: state or transition name, e.g. 'send' or 'TX'
+    :param attribute: model attribute, e.g. 'power' or 'duration'
+    :param verbose: print warning if some parameter partitions are too small for fitting
+
+    :returns: a dict with the following content:
+    std_static -- static parameter-unaware model error: stddev of by_name[state_or_trans][attribute]
+    std_param_lut -- static parameter-aware model error: mean stddev of by_param[(state_or_trans, *)][attribute]
+    std_by_param -- static parameter-aware model error ignoring a single parameter.
+        dictionary with one key per parameter. The value is the mean stddev
+        of measurements where all other parameters are fixed and the parameter
+        in question is variable. E.g. std_by_param['X'] is the mean stddev of
+        by_param[(state_or_trans, (X=*, Y=..., Z=...))][attribute].
+    std_by_arg -- same, but ignoring a single function argument
+        Only set if state_or_trans appears in arg_count, empty dict otherwise.
+    corr_by_param -- correlation coefficient
+    corr_by_arg -- same, but ignoring a single function argument
+        Only set if state_or_trans appears in arg_count, empty dict otherwise.
+    """
+    ret = {
+        'std_static' : np.std(by_name[state_or_trans][attribute]),
+        'std_param_lut' : np.mean([np.std(by_param[x][attribute]) for x in by_param.keys() if x[0] == state_or_trans]),
+        'std_by_param' : {},
+        'std_by_param_values' : {},
+        'lut_by_param_values' : {},
+        'std_by_arg' : [],
+        'std_by_arg_values' : [],
+        'lut_by_arg_values' : [],
+        'corr_by_param' : {},
+        'corr_by_arg' : [],
+    }
+
+    np.seterr('raise')
+
+    param_values = distinct_param_values(by_name, state_or_trans)
+
+    for param_idx, param in enumerate(parameter_names):
+        std_matrix, mean_std, lut_matrix = _std_by_param(by_param, param_values, state_or_trans, attribute, param_idx, verbose)
+        ret['std_by_param'][param] = mean_std
+        ret['std_by_param_values'][param] = std_matrix
+        ret['lut_by_param_values'][param] = lut_matrix
+        ret['corr_by_param'][param] = _corr_by_param(by_name, state_or_trans, attribute, param_idx)
+    if state_or_trans in arg_count:
+        for arg_index in range(arg_count[state_or_trans]):
+            std_matrix, mean_std, lut_matrix = _std_by_param(by_param, param_values, state_or_trans, attribute, len(parameter_names) + arg_index, verbose)
+            ret['std_by_arg'].append(mean_std)
+            ret['std_by_arg_values'].append(std_matrix)
+            ret['lut_by_arg_values'].append(lut_matrix)
+            ret['corr_by_arg'].append(_corr_by_param(by_name, state_or_trans, attribute, len(parameter_names) + arg_index))
+
+    return ret
+
+def distinct_param_values(by_name, state_or_tran):
+    """
+    Return the distinct values of each parameter in by_name[state_or_tran].
+
+    E.g. if by_name[state_or_tran]['param'] contains the distinct entries (1, 1), (1, 2), (1, 3), (0, 3),
+    this function returns [[1, 0], [1, 2, 3]].
+    Note that the order is not guaranteed to be deterministic at the moment.
+
+    Also note that this function deliberately also consider None
+    (uninitialized parameter with unknown value) as a distinct value. Benchmarks
+    and drivers must ensure that a parameter is only None when its value is
+    not important yet, e.g. a packet length parameter must only be None when
+    write() or similar has not been called yet. Other parameters should always
+    be initialized when leaving UNINITIALIZED.
+    """
+    # TODO a set() is an _unordered_ collection, so this must be converted to
+    # an OrderedDict or a list with a duplicate-pruning step
+    distinct_values = [set() for i in range(len(by_name[state_or_tran]['param'][0]))]
+    for param_tuple in by_name[state_or_tran]['param']:
+        for i in range(len(param_tuple)):
+            distinct_values[i].add(param_tuple[i])
+
+    # Convert sets to lists
+    distinct_values = list(map(list, distinct_values))
+    return distinct_values
+
+def _std_by_param(by_param, all_param_values, state_or_tran, attribute, param_index, verbose = False):
+    u"""
+    Calculate standard deviations for a static model where all parameters but param_index are constant.
+
+    :param by_param: measurements sorted by key/transition name and parameter values
+    :param state_or_tran: state or transition name (-> by_param[(state_or_tran, *)])
+    :param attribute: model attribute, e.g. 'power' or 'duration'
+           (-> by_param[(state_or_tran, *)][attribute])
+    :param param_index: index of variable parameter
+    :returns: (stddev matrix, mean stddev)
+
+    Returns the mean standard deviation of all measurements of 'attribute'
+    (e.g. power consumption or timeout) for state/transition 'state_or_tran' where
+    parameter 'param_index' is dynamic and all other parameters are fixed.
+    I.e., if parameters are a, b, c ∈ {1,2,3} and 'index' corresponds to b, then
+    this function returns the mean of the standard deviations of (a=1, b=*, c=1),
+    (a=1, b=*, c=2), and so on.
+    Also returns an (n-1)-dimensional array (where n is the number of parameters)
+    giving the standard deviation of each individual partition. E.g. for
+    param_index == 2 and 4 parameters, array[a][b][d] is the
+    stddev of measurements with param0 == a, param1 == b, param2 variable,
+    and param3 == d.
+    """
+    param_values = list(remove_index_from_tuple(all_param_values, param_index))
+    info_shape = tuple(map(len, param_values))
+
+    # We will calculate the mean over the entire matrix later on. We cannot
+    # guarantee that each entry will be filled in this loop (e.g. transitions
+    # whose arguments are combined using 'zip' rather than 'cartesian' always
+    # have missing parameter combinations), we pre-fill it with NaN and use
+    # np.nanmean to skip those when calculating the mean.
+    stddev_matrix = np.full(info_shape, np.nan)
+    lut_matrix = np.full(info_shape, np.nan)
+
+    for param_value in itertools.product(*param_values):
+        param_partition = list()
+        std_list = list()
+        for k, v in by_param.items():
+            if k[0] == state_or_tran and (*k[1][:param_index], *k[1][param_index+1:]) == param_value:
+                param_partition.extend(v[attribute])
+                std_list.append(np.std(v[attribute]))
+
+        if len(param_partition) > 1:
+            matrix_index = list(range(len(param_value)))
+            for i in range(len(param_value)):
+                matrix_index[i] = param_values[i].index(param_value[i])
+            matrix_index = tuple(matrix_index)
+            stddev_matrix[matrix_index] = np.std(param_partition)
+            lut_matrix[matrix_index] = np.mean(std_list)
+        # This can (and will) happen in normal operation, e.g. when a transition's
+        # arguments are combined using 'zip' rather than 'cartesian'.
+        #elif len(param_partition) == 1:
+        #    vprint(verbose, '[W] parameter value partition for {} contains only one element -- skipping'.format(param_value))
+        #else:
+        #    vprint(verbose, '[W] parameter value partition for {} is empty'.format(param_value))
+
+    if np.all(np.isnan(stddev_matrix)):
+        vprint(verbose, '[W] {}/{} parameter #{} has no data partitions -- how did this even happen?'.format(state_or_tran, attribute, param_index))
+        vprint(verbose, 'stddev_matrix = {}'.format(stddev_matrix))
+        return stddev_matrix, 0.
+
+    return stddev_matrix, np.nanmean(stddev_matrix), lut_matrix #np.mean([np.std(partition) for partition in partitions])
+
+def _corr_by_param(by_name, state_or_trans, attribute, param_index):
+    if _all_params_are_numeric(by_name[state_or_trans], param_index):
+        param_values = np.array(list((map(lambda x: x[param_index], by_name[state_or_trans]['param']))))
+        try:
+            return np.corrcoef(by_name[state_or_trans][attribute], param_values)[0, 1]
+        except FloatingPointError:
+            # Typically happens when all parameter values are identical.
+            # Building a correlation coefficient is pointless in this case
+            # -> assume no correlation
+            return 0.
+        except ValueError:
+            print('[!] Exception in _corr_by_param(by_name, state_or_trans={}, attribute={}, param_index={})'.format(state_or_trans, attribute, param_index))
+            print('[!] while executing np.corrcoef(by_name[{}][{}]={}, {}))'.format(state_or_trans, attribute, by_name[state_or_trans][attribute], param_values))
+            raise
+    else:
+        return 0.
+
+def _all_params_are_numeric(data, param_idx):
+    """Check if all `data['param'][*][param_idx]` elements are numeric, as reported by `utils.is_numeric`."""
+    param_values = list(map(lambda x: x[param_idx], data['param']))
+    if len(list(filter(is_numeric, param_values))) == len(param_values):
+        return True
+    return False
+
+def prune_dependent_parameters(by_name, parameter_names, correlation_threshold = 0.5):
+    """
+    Remove dependent parameters from aggregate.
+
+    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
+        by_name[name][attribute] must be a list or 1-D numpy array.
+        by_name[stanamete_or_trans]['param'] must be a list of parameter values.
+        Other dict members are left as-is
+    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.
+    :param correlation_threshold: Remove parameter if absolute correlation exceeds this threshold (default: 0.5)
+
+    Model generation (and its components, such as relevant parameter detection and least squares optimization) only works if input variables (i.e., parameters)
+    are independent of each other. This function computes the correlation coefficient for each pair of parameters and removes those which depend on each other.
+    For each pair of dependent parameters, the lexically greater one is removed (e.g. "a" and "b" -> "b" is removed).
+    """
+
+    parameter_indices_to_remove = list()
+    for parameter_combination in itertools.product(range(len(parameter_names)), range(len(parameter_names))):
+        index_1, index_2 = parameter_combination
+        if index_1 >= index_2:
+            continue
+        parameter_values = [list(), list()] # both parameters have a value
+        parameter_values_1 = list() # parameter 1 has a value
+        parameter_values_2 = list() # parameter 2 has a value
+        for name in by_name:
+            for measurement in by_name[name]['param']:
+                value_1 = measurement[index_1]
+                value_2 = measurement[index_2]
+                if is_numeric(value_1):
+                    parameter_values_1.append(value_1)
+                if is_numeric(value_2):
+                    parameter_values_2.append(value_2)
+                if is_numeric(value_1) and is_numeric(value_2):
+                    parameter_values[0].append(value_1)
+                    parameter_values[1].append(value_2)
+        if len(parameter_values[0]):
+            # Calculating the correlation coefficient only makes sense when neither value is constant
+            if np.std(parameter_values_1) != 0 and np.std(parameter_values_2) != 0:
+                correlation = np.corrcoef(parameter_values)[0][1]
+                if correlation != np.nan and np.abs(correlation) > correlation_threshold:
+                    print('[!] Parameters {} <-> {} are correlated with coefficcient {}'.format(parameter_names[index_1], parameter_names[index_2], correlation))
+                    if len(parameter_values_1) < len(parameter_values_2):
+                        index_to_remove = index_1
+                    else:
+                        index_to_remove = index_2
+                    print('    Removing parameter {}'.format(parameter_names[index_to_remove]))
+                    parameter_indices_to_remove.append(index_to_remove)
+    remove_parameters_by_indices(by_name, parameter_names, parameter_indices_to_remove)
+
+def remove_parameters_by_indices(by_name, parameter_names, parameter_indices_to_remove):
+    """
+    Remove parameters listed in `parameter_indices` from aggregate `by_name` and `parameter_names`.
+
+    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
+        by_name[name][attribute] must be a list or 1-D numpy array.
+        by_name[stanamete_or_trans]['param'] must be a list of parameter values.
+        Other dict members are left as-is
+    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.
+    :param parameter_indices_to_remove: List of parameter indices to be removed
+    """
+
+    # Start removal from the end of the list to avoid renumbering of list elemenets
+    for parameter_index in sorted(parameter_indices_to_remove, reverse = True):
+        for name in by_name:
+            for measurement in by_name[name]['param']:
+                measurement.pop(parameter_index)
+        parameter_names.pop(parameter_index)
+
+class ParamStats:
+
+    def __init__(self, by_name, by_param, parameter_names, arg_count, use_corrcoef = False, verbose = False):
+        """
+        Compute standard deviation and correlation coefficient on parameterized data partitions.
+
+        It is strongly recommended to vary all parameter values evenly.
+        For instance, given two parameters, providing only the combinations
+        (1, 1), (5, 1), (7, 1,) (10, 1), (1, 2), (1, 6) will lead to bogus results.
+        It is better to provide (1, 1), (5, 1), (1, 2), (5, 2), ... (i.e. a cross product of all individual parameter values)
+
+        arguments:
+        by_name -- ground truth partitioned by state/transition name.
+            by_name[state_or_trans][attribute] must be a list or 1-D numpy array.
+            by_name[state_or_trans]['param'] must be a list of parameter values
+            corresponding to the ground truth, e.g. [[1, 2, 3], ...] if the
+            first ground truth element has the (lexically) first parameter set to 1,
+            the second to 2 and the third to 3.
+        by_param -- ground truth partitioned by state/transition name and parameters.
+            by_name[(state_or_trans, *)][attribute] must be a list or 1-D numpy array.
+        parameter_names -- list of parameter names, must have the same order as the parameter
+            values in by_param (lexical sorting is recommended).
+        arg_count -- dict providing the number of functions args ("local parameters") for each function.
+        use_corrcoef -- use correlation coefficient instead of stddev heuristic for parameter detection
+        """
+        self.stats = dict()
+        self.use_corrcoef = use_corrcoef
+        self._parameter_names = parameter_names
+        # Note: This is deliberately single-threaded. The overhead incurred
+        # by multiprocessing is higher than the speed gained by parallel
+        # computation of statistics measures.
+        for state_or_tran in by_name.keys():
+            self.stats[state_or_tran] = dict()
+            for attribute in by_name[state_or_tran]['attributes']:
+                self.stats[state_or_tran][attribute] = compute_param_statistics(by_name, by_param, parameter_names, arg_count, state_or_tran, attribute, verbose = verbose)
+
+    def _generic_param_independence_ratio(self, state_or_trans, attribute):
+        """
+        Return the heuristic ratio of parameter independence for state_or_trans and attribute.
+
+        This is not supported if the correlation coefficient is used.
+        A value close to 1 means no influence, a value close to 0 means high probability of influence.
+        """
+        statistics = self.stats[state_or_trans][attribute]
+        if self.use_corrcoef:
+            # not supported
+            raise ValueError
+        if statistics['std_static'] == 0:
+            return 0
+        return statistics['std_param_lut'] / statistics['std_static']
+
+    def generic_param_dependence_ratio(self, state_or_trans, attribute):
+        """
+        Return the heuristic ratio of parameter dependence for state_or_trans and attribute.
+
+        This is not supported if the correlation coefficient is used.
+        A value close to 0 means no influence, a value close to 1 means high probability of influence.
+        """
+        return 1 - self._generic_param_independence_ratio(state_or_trans, attribute)
+
+    def _reduce_param_matrix(self, matrix: np.ndarray, parameter_names: list) -> list:
+        """
+        :param matrix: parameter dependence matrix, M[(...)] == 1 iff (model attribute) is influenced by (parameter) for other parameter value indxe == (...)
+        :param parameter_names: names of parameters in the order in which they appear in the matrix index. The first entry corresponds to the first axis, etc.
+        :returns: parameters which determine whether (parameter) has an effect on (model attribute). If a parameter is not part of this list, its value does not
+            affect (parameter)'s influence on (model attribute) -- it either always or never has an influence
+        """
+        if np.all(matrix == True) or np.all(matrix == False):
+            return list()
+
+        if not is_power_of_two(np.count_nonzero(matrix)):
+            # cannot be reliably reduced to a list of parameters
+            return list()
+
+        if np.count_nonzero(matrix) == 1:
+            influential_parameters = list()
+            for i, parameter_name in enumerate(parameter_names):
+                if matrix.shape[i] > 1:
+                    influential_parameters.append(parameter_name)
+            return influential_parameters
+
+        for axis in range(matrix.ndim):
+            candidate = self._reduce_param_matrix(np.all(matrix, axis=axis), remove_index_from_tuple(parameter_names, axis))
+            if len(candidate):
+                return candidate
+
+        return list()
+
+    def _get_codependent_parameters(self, stats, param):
+        """
+        Return list of parameters which affect whether `param` influences the model attribute described in `stats` or not.
+        """
+        safe_div = np.vectorize(lambda x,y: 0. if x == 0 else 1 - x/y)
+        ratio_by_value = safe_div(stats['lut_by_param_values'][param], stats['std_by_param_values'][param])
+        err_mode = np.seterr('ignore')
+        dep_by_value = ratio_by_value > 0.5
+        np.seterr(**err_mode)
+
+        other_param_list = list(filter(lambda x: x != param, self._parameter_names))
+        influencer_parameters = self._reduce_param_matrix(dep_by_value, other_param_list)
+        return influencer_parameters
+
+    def _param_independence_ratio(self, state_or_trans: str, attribute: str, param: str) -> float:
+        """
+        Return the heuristic ratio of parameter independence for state_or_trans, attribute, and param.
+
+        A value close to 1 means no influence, a value close to 0 means high probability of influence.
+        """
+        statistics = self.stats[state_or_trans][attribute]
+        if self.use_corrcoef:
+            return 1 - np.abs(statistics['corr_by_param'][param])
+        if statistics['std_by_param'][param] == 0:
+            if statistics['std_param_lut'] != 0:
+                raise RuntimeError("wat")
+            # In general, std_param_lut < std_by_param. So, if std_by_param == 0, std_param_lut == 0 follows.
+            # This means that the variation of param does not affect the model quality -> no influence, return 1
+            return 1.
+
+        return statistics['std_param_lut'] / statistics['std_by_param'][param]
+
+    def param_dependence_ratio(self, state_or_trans: str, attribute: str, param: str) -> float:
+        """
+        Return the heuristic ratio of parameter dependence for state_or_trans, attribute, and param.
+
+        A value close to 0 means no influence, a value close to 1 means high probability of influence.
+
+        :param state_or_trans: state or transition name
+        :param attribute: model attribute
+        :param param: parameter name
+
+        :returns: parameter dependence (float between 0 == no influence and 1 == high probability of influence)
+        """
+        return 1 - self._param_independence_ratio(state_or_trans, attribute, param)
+
+    def reverse_dependent_parameters(self, state_or_trans: str, attribute: str, param: str) -> list:
+        """
+        Return parameters whose value influences whether `attribute` of `state_or_trans` depends on `param` or not.
+
+        For example, a radio's TX POWER is only influenced by the packet length if dynamically sized payloads are enabled.
+        So reverse_dependent_parameters('TX', 'POWER', 'packet_length') == ['dynamic_payload_size'].
+
+        :param state_or_trans: state or transition name
+        :param attribute: model attribute
+        :param param: parameter name
+
+        :returns: list of parameters
+        """
+        return self._get_codependent_parameters(self.stats[state_or_trans][attribute], param)
+
+    def _arg_independence_ratio(self, state_or_trans, attribute, arg_index):
+        statistics = self.stats[state_or_trans][attribute]
+        if self.use_corrcoef:
+            return 1 - np.abs(statistics['corr_by_arg'][arg_index])
+        if statistics['std_by_arg'][arg_index] == 0:
+            if statistics['std_param_lut'] != 0:
+                raise RuntimeError("wat")
+            # In general, std_param_lut < std_by_arg. So, if std_by_arg == 0, std_param_lut == 0 follows.
+            # This means that the variation of arg does not affect the model quality -> no influence, return 1
+            return 1
+        return statistics['std_param_lut'] / statistics['std_by_arg'][arg_index]
+
+    def arg_dependence_ratio(self, state_or_trans: str, attribute: str, arg_index: int) -> float:
+        return 1 - self._arg_independence_ratio(state_or_trans, attribute, arg_index)
+
+    # This heuristic is very similar to the "function is not much better than
+    # median" checks in get_fitted. So far, doing it here as well is mostly
+    # a performance and not an algorithm quality decision.
+    # --df, 2018-04-18
+    def depends_on_param(self, state_or_trans, attribute, param):
+        """Return whether attribute of state_or_trans depens on param."""
+        if self.use_corrcoef:
+            return self.param_dependence_ratio(state_or_trans, attribute, param) > 0.1
+        else:
+            return self.param_dependence_ratio(state_or_trans, attribute, param) > 0.5
+
+    # See notes on depends_on_param
+    def depends_on_arg(self, state_or_trans, attribute, arg_index):
+        """Return whether attribute of state_or_trans depens on arg_index."""
+        if self.use_corrcoef:
+            return self.arg_dependence_ratio(state_or_trans, attribute, arg_index) > 0.1
+        else:
+            return self.arg_dependence_ratio(state_or_trans, attribute, arg_index) > 0.5
+
author	Daniel Friesel <daniel.friesel@uos.de>	2019-10-07 09:48:37 +0200
committer	Daniel Friesel <daniel.friesel@uos.de>	2019-10-07 09:48:37 +0200
commit	eb634401bfa4f93154eeb6265f100fd9db2bf7d4 (patch)
tree	6adfbf12d55d6f3f925dfa672207352734b30186 /lib/parameters.py
parent	8b496797773a95bac66d76acc0d4dfee53f70ff7 (diff)