From eb634401bfa4f93154eeb6265f100fd9db2bf7d4 Mon Sep 17 00:00:00 2001
From: Daniel Friesel <daniel.friesel@uos.de>
Date: Mon, 7 Oct 2019 09:48:37 +0200
Subject: move ParamStats and helper functions to lib/parameters.py

---
 lib/utils.py | 253 -----------------------------------------------------------
 1 file changed, 253 deletions(-)

(limited to 'lib/utils.py')

diff --git a/lib/utils.py b/lib/utils.py
index 549b673..5daa7cf 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -1,4 +1,3 @@
-import itertools
 import numpy as np
 import re
 
@@ -115,258 +114,6 @@ def param_slice_eq(a, b, index):
         return True
     return False
 
-def prune_dependent_parameters(by_name, parameter_names, correlation_threshold = 0.5):
-    """
-    Remove dependent parameters from aggregate.
-
-    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
-        by_name[name][attribute] must be a list or 1-D numpy array.
-        by_name[stanamete_or_trans]['param'] must be a list of parameter values.
-        Other dict members are left as-is
-    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.
-    :param correlation_threshold: Remove parameter if absolute correlation exceeds this threshold (default: 0.5)
-
-    Model generation (and its components, such as relevant parameter detection and least squares optimization) only works if input variables (i.e., parameters)
-    are independent of each other. This function computes the correlation coefficient for each pair of parameters and removes those which depend on each other.
-    For each pair of dependent parameters, the lexically greater one is removed (e.g. "a" and "b" -> "b" is removed).
-    """
-
-    parameter_indices_to_remove = list()
-    for parameter_combination in itertools.product(range(len(parameter_names)), range(len(parameter_names))):
-        index_1, index_2 = parameter_combination
-        if index_1 >= index_2:
-            continue
-        parameter_values = [list(), list()] # both parameters have a value
-        parameter_values_1 = list() # parameter 1 has a value
-        parameter_values_2 = list() # parameter 2 has a value
-        for name in by_name:
-            for measurement in by_name[name]['param']:
-                value_1 = measurement[index_1]
-                value_2 = measurement[index_2]
-                if is_numeric(value_1):
-                    parameter_values_1.append(value_1)
-                if is_numeric(value_2):
-                    parameter_values_2.append(value_2)
-                if is_numeric(value_1) and is_numeric(value_2):
-                    parameter_values[0].append(value_1)
-                    parameter_values[1].append(value_2)
-        if len(parameter_values[0]):
-            # Calculating the correlation coefficient only makes sense when neither value is constant
-            if np.std(parameter_values_1) != 0 and np.std(parameter_values_2) != 0:
-                correlation = np.corrcoef(parameter_values)[0][1]
-                if correlation != np.nan and np.abs(correlation) > correlation_threshold:
-                    print('[!] Parameters {} <-> {} are correlated with coefficcient {}'.format(parameter_names[index_1], parameter_names[index_2], correlation))
-                    if len(parameter_values_1) < len(parameter_values_2):
-                        index_to_remove = index_1
-                    else:
-                        index_to_remove = index_2
-                    print('    Removing parameter {}'.format(parameter_names[index_to_remove]))
-                    parameter_indices_to_remove.append(index_to_remove)
-    remove_parameters_by_indices(by_name, parameter_names, parameter_indices_to_remove)
-
-def remove_parameters_by_indices(by_name, parameter_names, parameter_indices_to_remove):
-    """
-    Remove parameters listed in `parameter_indices` from aggregate `by_name` and `parameter_names`.
-
-    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
-        by_name[name][attribute] must be a list or 1-D numpy array.
-        by_name[stanamete_or_trans]['param'] must be a list of parameter values.
-        Other dict members are left as-is
-    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.
-    :param parameter_indices_to_remove: List of parameter indices to be removed
-    """
-
-    # Start removal from the end of the list to avoid renumbering of list elemenets
-    for parameter_index in sorted(parameter_indices_to_remove, reverse = True):
-        for name in by_name:
-            for measurement in by_name[name]['param']:
-                measurement.pop(parameter_index)
-        parameter_names.pop(parameter_index)
-
-def compute_param_statistics(by_name, by_param, parameter_names, arg_count, state_or_trans, attribute, verbose = False):
-    """
-    Compute standard deviation and correlation coefficient for various data partitions.
-
-    It is strongly recommended to vary all parameter values evenly across partitions.
-    For instance, given two parameters, providing only the combinations
-    (1, 1), (5, 1), (7, 1,) (10, 1), (1, 2), (1, 6) will lead to bogus results.
-    It is better to provide (1, 1), (5, 1), (1, 2), (5, 2), ... (i.e. a cross product of all individual parameter values)
-
-    :param by_name: ground truth partitioned by state/transition name.
-        by_name[state_or_trans][attribute] must be a list or 1-D numpy array.
-        by_name[state_or_trans]['param'] must be a list of parameter values
-        corresponding to the ground truth, e.g. [[1, 2, 3], ...] if the
-        first ground truth element has the (lexically) first parameter set to 1,
-        the second to 2 and the third to 3.
-    :param by_param: ground truth partitioned by state/transition name and parameters.
-        by_name[(state_or_trans, *)][attribute] must be a list or 1-D numpy array.
-    :param parameter_names: list of parameter names, must have the same order as the parameter
-        values in by_param (lexical sorting is recommended).
-    :param arg_count: dict providing the number of functions args ("local parameters") for each function.
-    :param state_or_trans: state or transition name, e.g. 'send' or 'TX'
-    :param attribute: model attribute, e.g. 'power' or 'duration'
-    :param verbose: print warning if some parameter partitions are too small for fitting
-
-    :returns: a dict with the following content:
-    std_static -- static parameter-unaware model error: stddev of by_name[state_or_trans][attribute]
-    std_param_lut -- static parameter-aware model error: mean stddev of by_param[(state_or_trans, *)][attribute]
-    std_by_param -- static parameter-aware model error ignoring a single parameter.
-        dictionary with one key per parameter. The value is the mean stddev
-        of measurements where all other parameters are fixed and the parameter
-        in question is variable. E.g. std_by_param['X'] is the mean stddev of
-        by_param[(state_or_trans, (X=*, Y=..., Z=...))][attribute].
-    std_by_arg -- same, but ignoring a single function argument
-        Only set if state_or_trans appears in arg_count, empty dict otherwise.
-    corr_by_param -- correlation coefficient
-    corr_by_arg -- same, but ignoring a single function argument
-        Only set if state_or_trans appears in arg_count, empty dict otherwise.
-    """
-    ret = {
-        'std_static' : np.std(by_name[state_or_trans][attribute]),
-        'std_param_lut' : np.mean([np.std(by_param[x][attribute]) for x in by_param.keys() if x[0] == state_or_trans]),
-        'std_by_param' : {},
-        'std_by_param_values' : {},
-        'lut_by_param_values' : {},
-        'std_by_arg' : [],
-        'std_by_arg_values' : [],
-        'lut_by_arg_values' : [],
-        'corr_by_param' : {},
-        'corr_by_arg' : [],
-    }
-
-    np.seterr('raise')
-
-    param_values = distinct_param_values(by_name, state_or_trans)
-
-    for param_idx, param in enumerate(parameter_names):
-        std_matrix, mean_std, lut_matrix = _std_by_param(by_param, param_values, state_or_trans, attribute, param_idx, verbose)
-        ret['std_by_param'][param] = mean_std
-        ret['std_by_param_values'][param] = std_matrix
-        ret['lut_by_param_values'][param] = lut_matrix
-        ret['corr_by_param'][param] = _corr_by_param(by_name, state_or_trans, attribute, param_idx)
-    if arg_support_enabled and state_or_trans in arg_count:
-        for arg_index in range(arg_count[state_or_trans]):
-            std_matrix, mean_std, lut_matrix = _std_by_param(by_param, param_values, state_or_trans, attribute, len(parameter_names) + arg_index, verbose)
-            ret['std_by_arg'].append(mean_std)
-            ret['std_by_arg_values'].append(std_matrix)
-            ret['lut_by_arg_values'].append(lut_matrix)
-            ret['corr_by_arg'].append(_corr_by_param(by_name, state_or_trans, attribute, len(parameter_names) + arg_index))
-
-    return ret
-
-def distinct_param_values(by_name, state_or_tran):
-    """
-    Return the distinct values of each parameter in by_name[state_or_tran].
-
-    E.g. if by_name[state_or_tran]['param'] contains the distinct entries (1, 1), (1, 2), (1, 3), (0, 3),
-    this function returns [[1, 0], [1, 2, 3]].
-    Note that the order is not guaranteed to be deterministic at the moment.
-
-    Also note that this function deliberately also consider None
-    (uninitialized parameter with unknown value) as a distinct value. Benchmarks
-    and drivers must ensure that a parameter is only None when its value is
-    not important yet, e.g. a packet length parameter must only be None when
-    write() or similar has not been called yet. Other parameters should always
-    be initialized when leaving UNINITIALIZED.
-    """
-    # TODO a set() is an _unordered_ collection, so this must be converted to
-    # an OrderedDict or a list with a duplicate-pruning step
-    distinct_values = [set() for i in range(len(by_name[state_or_tran]['param'][0]))]
-    for param_tuple in by_name[state_or_tran]['param']:
-        for i in range(len(param_tuple)):
-            distinct_values[i].add(param_tuple[i])
-
-    # Convert sets to lists
-    distinct_values = list(map(list, distinct_values))
-    return distinct_values
-
-def _std_by_param(by_param, all_param_values, state_or_tran, attribute, param_index, verbose = False):
-    u"""
-    Calculate standard deviations for a static model where all parameters but param_index are constant.
-
-    :param by_param: measurements sorted by key/transition name and parameter values
-    :param state_or_tran: state or transition name (-> by_param[(state_or_tran, *)])
-    :param attribute: model attribute, e.g. 'power' or 'duration'
-           (-> by_param[(state_or_tran, *)][attribute])
-    :param param_index: index of variable parameter
-    :returns: (stddev matrix, mean stddev)
-
-    Returns the mean standard deviation of all measurements of 'attribute'
-    (e.g. power consumption or timeout) for state/transition 'state_or_tran' where
-    parameter 'param_index' is dynamic and all other parameters are fixed.
-    I.e., if parameters are a, b, c ∈ {1,2,3} and 'index' corresponds to b, then
-    this function returns the mean of the standard deviations of (a=1, b=*, c=1),
-    (a=1, b=*, c=2), and so on.
-    Also returns an (n-1)-dimensional array (where n is the number of parameters)
-    giving the standard deviation of each individual partition. E.g. for
-    param_index == 2 and 4 parameters, array[a][b][d] is the
-    stddev of measurements with param0 == a, param1 == b, param2 variable,
-    and param3 == d.
-    """
-    param_values = list(remove_index_from_tuple(all_param_values, param_index))
-    info_shape = tuple(map(len, param_values))
-
-    # We will calculate the mean over the entire matrix later on. We cannot
-    # guarantee that each entry will be filled in this loop (e.g. transitions
-    # whose arguments are combined using 'zip' rather than 'cartesian' always
-    # have missing parameter combinations), we pre-fill it with NaN and use
-    # np.nanmean to skip those when calculating the mean.
-    stddev_matrix = np.full(info_shape, np.nan)
-    lut_matrix = np.full(info_shape, np.nan)
-
-    for param_value in itertools.product(*param_values):
-        param_partition = list()
-        std_list = list()
-        for k, v in by_param.items():
-            if k[0] == state_or_tran and (*k[1][:param_index], *k[1][param_index+1:]) == param_value:
-                param_partition.extend(v[attribute])
-                std_list.append(np.std(v[attribute]))
-
-        if len(param_partition) > 1:
-            matrix_index = list(range(len(param_value)))
-            for i in range(len(param_value)):
-                matrix_index[i] = param_values[i].index(param_value[i])
-            matrix_index = tuple(matrix_index)
-            stddev_matrix[matrix_index] = np.std(param_partition)
-            lut_matrix[matrix_index] = np.mean(std_list)
-        # This can (and will) happen in normal operation, e.g. when a transition's
-        # arguments are combined using 'zip' rather than 'cartesian'.
-        #elif len(param_partition) == 1:
-        #    vprint(verbose, '[W] parameter value partition for {} contains only one element -- skipping'.format(param_value))
-        #else:
-        #    vprint(verbose, '[W] parameter value partition for {} is empty'.format(param_value))
-
-    if np.all(np.isnan(stddev_matrix)):
-        vprint(verbose, '[W] {}/{} parameter #{} has no data partitions -- how did this even happen?'.format(state_or_tran, attribute, param_index))
-        vprint(verbose, 'stddev_matrix = {}'.format(stddev_matrix))
-        return stddev_matrix, 0.
-
-    return stddev_matrix, np.nanmean(stddev_matrix), lut_matrix #np.mean([np.std(partition) for partition in partitions])
-
-def _corr_by_param(by_name, state_or_trans, attribute, param_index):
-    if _all_params_are_numeric(by_name[state_or_trans], param_index):
-        param_values = np.array(list((map(lambda x: x[param_index], by_name[state_or_trans]['param']))))
-        try:
-            return np.corrcoef(by_name[state_or_trans][attribute], param_values)[0, 1]
-        except FloatingPointError:
-            # Typically happens when all parameter values are identical.
-            # Building a correlation coefficient is pointless in this case
-            # -> assume no correlation
-            return 0.
-        except ValueError:
-            print('[!] Exception in _corr_by_param(by_name, state_or_trans={}, attribute={}, param_index={})'.format(state_or_trans, attribute, param_index))
-            print('[!] while executing np.corrcoef(by_name[{}][{}]={}, {}))'.format(state_or_trans, attribute, by_name[state_or_trans][attribute], param_values))
-            raise
-    else:
-        return 0.
-
-def _all_params_are_numeric(data, param_idx):
-    """Check if all `data['param'][*][param_idx]` elements are numeric, as reported by `utils.is_numeric`."""
-    param_values = list(map(lambda x: x[param_idx], data['param']))
-    if len(list(filter(is_numeric, param_values))) == len(param_values):
-        return True
-    return False
-
 class OptionalTimingAnalysis:
     def __init__(self, enabled = True):
         self.enabled = enabled
-- 
cgit v1.2.3