summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorDaniel Friesel <daniel.friesel@uos.de>2019-11-12 10:07:09 +0100
committerDaniel Friesel <daniel.friesel@uos.de>2019-11-12 10:07:09 +0100
commit8c8545553910a75a6c53900f7b729465a03f4846 (patch)
tree308059a09b9d1e3103962d30eee73037e7690f73 /lib
parent734e137c0800656dbd1f86489896abd9763c6767 (diff)
ParamStats: compute statistics in parallel
Diffstat (limited to 'lib')
-rw-r--r--lib/parameters.py344
1 files changed, 183 insertions, 161 deletions
diff --git a/lib/parameters.py b/lib/parameters.py
index bc26643..27b1a4e 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -2,6 +2,7 @@ import itertools
import numpy as np
from collections import OrderedDict
from copy import deepcopy
+from multiprocessing import Pool
from utils import remove_index_from_tuple, is_numeric, is_power_of_two
from utils import filter_aggregate_by_param, by_name_to_by_param
@@ -28,6 +29,62 @@ def distinct_param_values(by_name, state_or_tran):
distinct_values = list(map(lambda x: list(x.keys()), distinct_values))
return distinct_values
+def _depends_on_param(corr_param, std_param, std_lut):
+ #if self.use_corrcoef:
+ if False:
+ return corr_param > 0.1
+ elif std_param == 0:
+ # In general, std_param_lut < std_by_param. So, if std_by_param == 0, std_param_lut == 0 follows.
+ # This means that the variation of param does not affect the model quality -> no influence
+ return False
+ return std_lut / std_param < 0.5
+
+def _reduce_param_matrix(matrix: np.ndarray, parameter_names: list) -> list:
+ """
+ :param matrix: parameter dependence matrix, M[(...)] == 1 iff (model attribute) is influenced by (parameter) for other parameter value index == (...)
+ :param parameter_names: names of parameters in the order in which they appear in the matrix index. The first entry corresponds to the first axis, etc.
+ :returns: parameters which determine whether (parameter) has an effect on (model attribute). If a parameter is not part of this list, its value does not
+ affect (parameter)'s influence on (model attribute) -- it either always or never has an influence
+ """
+ if np.all(matrix == True) or np.all(matrix == False):
+ return list()
+
+ # This termination condition does not seem to be very smart yet...
+ # With it, too much is filtered out (e.g. auto_ack! -> max_retry_count is not detected in "bin/analyze-timing.py ../data/20190815_122531_nRF24_no-rx.json")
+ # Without it, too little is filtered out (many dependencies are detected as well for which a parameter dependency always holds, regardless of the values of the other parameters)
+ #if not is_power_of_two(np.count_nonzero(matrix)):
+ # # cannot be reliably reduced to a list of parameters
+ # return list()
+
+ if np.count_nonzero(matrix) == 1:
+ influential_parameters = list()
+ for i, parameter_name in enumerate(parameter_names):
+ if matrix.shape[i] > 1:
+ influential_parameters.append(parameter_name)
+ return influential_parameters
+
+ for axis in range(matrix.ndim):
+ candidate = _reduce_param_matrix(np.all(matrix, axis=axis), remove_index_from_tuple(parameter_names, axis))
+ if len(candidate):
+ return candidate
+
+ return list()
+
+def _codependent_parameters(param, lut_by_param_values, std_by_param_values):
+ """
+ Return list of parameters which affect whether a parameter affects a model attribute or not (currently disabled: the unconditional return below always yields an empty list).
+ """
+ return list()
+ safe_div = np.vectorize(lambda x,y: 0. if x == 0 else 1 - x/y)
+ ratio_by_value = safe_div(lut_by_param_values, std_by_param_values)
+ err_mode = np.seterr('ignore')
+ dep_by_value = ratio_by_value > 0.5
+ np.seterr(**err_mode)
+
+ other_param_list = list(filter(lambda x: x != param, self._parameter_names))
+ influencer_parameters = _reduce_param_matrix(dep_by_value, other_param_list)
+ return influencer_parameters
+
def _std_by_param(by_param, all_param_values, state_or_tran, attribute, param_index, verbose = False):
u"""
Calculate standard deviations for a static model where all parameters but `param_index` are constant.
@@ -126,6 +183,118 @@ def _corr_by_param(by_name, state_or_trans, attribute, param_index):
else:
return 0.
+def _compute_param_statistics(by_name, by_param, parameter_names, arg_count, state_or_trans, attribute, distinct_values, distinct_values_by_param_index, verbose = False):
+ """
+ Compute standard deviation and correlation coefficient for various data partitions.
+
+ It is strongly recommended to vary all parameter values evenly across partitions.
+ For instance, given two parameters, providing only the combinations
+ (1, 1), (5, 1), (7, 1), (10, 1), (1, 2), (1, 6) will lead to bogus results.
+ It is better to provide (1, 1), (5, 1), (1, 2), (5, 2), ... (i.e. a cross product of all individual parameter values)
+
+ :param by_name: ground truth partitioned by state/transition name.
+ by_name[state_or_trans][attribute] must be a list or 1-D numpy array.
+ by_name[state_or_trans]['param'] must be a list of parameter values
+ corresponding to the ground truth, e.g. [[1, 2, 3], ...] if the
+ first ground truth element has the (lexically) first parameter set to 1,
+ the second to 2 and the third to 3.
+ :param by_param: ground truth partitioned by state/transition name and parameters.
+ by_param[(state_or_trans, *)][attribute] must be a list or 1-D numpy array.
+ :param parameter_names: list of parameter names, must have the same order as the parameter
+ values in by_param (lexical sorting is recommended).
+ :param arg_count: dict providing the number of function arguments ("local parameters") for each function.
+ :param state_or_trans: state or transition name, e.g. 'send' or 'TX'
+ :param attribute: model attribute, e.g. 'power' or 'duration'
+ :param verbose: print warning if some parameter partitions are too small for fitting
+
+ :returns: a dict with the following content:
+ std_static -- static parameter-unaware model error: stddev of by_name[state_or_trans][attribute]
+ std_param_lut -- static parameter-aware model error: mean stddev of by_param[(state_or_trans, *)][attribute]
+ std_by_param -- static parameter-aware model error ignoring a single parameter.
+ dictionary with one key per parameter. The value is the mean stddev
+ of measurements where all other parameters are fixed and the parameter
+ in question is variable. E.g. std_by_param['X'] is the mean stddev of
+ by_param[(state_or_trans, (X=*, Y=..., Z=...))][attribute].
+ std_by_arg -- same, but ignoring a single function argument
+ Only set if state_or_trans appears in arg_count, empty list otherwise.
+ corr_by_param -- correlation coefficient
+ corr_by_arg -- same, but ignoring a single function argument
+ Only set if state_or_trans appears in arg_count, empty list otherwise.
+ """
+ ret = {
+ 'std_static' : np.std(by_name[state_or_trans][attribute]),
+ 'std_param_lut' : np.mean([np.std(by_param[x][attribute]) for x in by_param.keys() if x[0] == state_or_trans]),
+ 'std_by_param' : {},
+ 'std_by_param_values' : {},
+ 'lut_by_param_values' : {},
+ 'std_by_arg' : [],
+ 'std_by_arg_values' : [],
+ 'lut_by_arg_values' : [],
+ 'corr_by_param' : {},
+ 'corr_by_arg' : [],
+ 'depends_on_param' : {},
+ 'depends_on_arg' : [],
+ 'param_data' : {},
+ }
+
+ np.seterr('raise')
+
+ for param_idx, param in enumerate(parameter_names):
+ std_matrix, mean_std, lut_matrix = _std_by_param(by_param, distinct_values_by_param_index, state_or_trans, attribute, param_idx, verbose)
+ ret['std_by_param'][param] = mean_std
+ ret['std_by_param_values'][param] = std_matrix
+ ret['lut_by_param_values'][param] = lut_matrix
+ ret['corr_by_param'][param] = _corr_by_param(by_name, state_or_trans, attribute, param_idx)
+
+ ret['depends_on_param'][param] = _depends_on_param(ret['corr_by_param'][param], ret['std_by_param'][param], ret['std_param_lut'])
+
+ if ret['depends_on_param'][param]:
+ ret['param_data'][param] = {
+ 'codependent_parameters': _codependent_parameters(param, lut_matrix, std_matrix),
+ 'depends_for_codependent_value': dict()
+ }
+
+ # calculate parameter dependence for individual values of codependent parameters
+ codependent_param_values = list()
+ for codependent_param in ret['param_data'][param]['codependent_parameters']:
+ codependent_param_values.append(distinct_values[codependent_param])
+ for combi in itertools.product(*codependent_param_values):
+ by_name_part = deepcopy(by_name)
+ filter_list = list(zip(ret['param_data'][param]['codependent_parameters'], combi))
+ filter_aggregate_by_param(by_name_part, parameter_names, filter_list)
+ by_param_part = by_name_to_by_param(by_name_part)
+ # there may be no data for this specific parameter value combination
+ if state_or_trans in by_name_part:
+ part_corr = _corr_by_param(by_name_part, state_or_trans, attribute, param_idx)
+ part_std_lut = np.mean([np.std(by_param_part[x][attribute]) for x in by_param_part.keys() if x[0] == state_or_trans])
+ _, part_std_param, _ = _std_by_param(by_param_part, distinct_values_by_param_index, state_or_trans, attribute, param_idx, verbose)
+ ret['param_data'][param]['depends_for_codependent_value'][combi] = _depends_on_param(part_corr, part_std_param, part_std_lut)
+
+ if state_or_trans in arg_count:
+ for arg_index in range(arg_count[state_or_trans]):
+ std_matrix, mean_std, lut_matrix = _std_by_param(by_param, distinct_values_by_param_index, state_or_trans, attribute, len(parameter_names) + arg_index, verbose)
+ ret['std_by_arg'].append(mean_std)
+ ret['std_by_arg_values'].append(std_matrix)
+ ret['lut_by_arg_values'].append(lut_matrix)
+ ret['corr_by_arg'].append(_corr_by_param(by_name, state_or_trans, attribute, len(parameter_names) + arg_index))
+
+ if False:
+ ret['depends_on_arg'].append(ret['corr_by_arg'][arg_index] > 0.1)
+ elif ret['std_by_arg'][arg_index] == 0:
+ # In general, std_param_lut < std_by_arg. So, if std_by_arg == 0, std_param_lut == 0 follows.
+ # This means that the variation of arg does not affect the model quality -> no influence
+ ret['depends_on_arg'].append(False)
+ else:
+ ret['depends_on_arg'].append(ret['std_param_lut'] / ret['std_by_arg'][arg_index] < 0.5)
+
+ return ret
+
+def _compute_param_statistics_parallel(arg):
+ return {
+ 'key' : arg['key'],
+ 'result': _compute_param_statistics(*arg['args'])
+ }
+
def _all_params_are_numeric(data, param_idx):
"""Check if all `data['param'][*][param_idx]` elements are numeric, as reported by `utils.is_numeric`."""
param_values = list(map(lambda x: x[param_idx], data['param']))
@@ -236,6 +405,9 @@ class ParamStats:
self.distinct_values_by_param_index = dict()
self.use_corrcoef = use_corrcoef
self._parameter_names = parameter_names
+
+ stats_queue = list()
+
# Note: This is deliberately single-threaded. The overhead incurred
# by multiprocessing is higher than the speed gained by parallel
# computation of statistics measures.
@@ -246,7 +418,17 @@ class ParamStats:
for i, param in enumerate(parameter_names):
self.distinct_values[state_or_tran][param] = self.distinct_values_by_param_index[state_or_tran][i]
for attribute in by_name[state_or_tran]['attributes']:
- self.stats[state_or_tran][attribute] = self.compute_param_statistics(by_name, by_param, parameter_names, arg_count, state_or_tran, attribute, verbose = verbose)
+ stats_queue.append({
+ 'key': [state_or_tran, attribute],
+ 'args': [by_name, by_param, parameter_names, arg_count, state_or_tran, attribute, self.distinct_values[state_or_tran], self.distinct_values_by_param_index[state_or_tran], verbose],
+ })
+
+ with Pool() as pool:
+ stats_results = pool.map(_compute_param_statistics_parallel, stats_queue)
+
+ for stats in stats_results:
+ state_or_tran, attribute = stats['key']
+ self.stats[state_or_tran][attribute] = stats['result']
def can_be_fitted(self, state_or_tran = None) -> bool:
"""
@@ -266,112 +448,6 @@ class ParamStats:
return True
return False
- def compute_param_statistics(self, by_name, by_param, parameter_names, arg_count, state_or_trans, attribute, verbose = False):
- """
- Compute standard deviation and correlation coefficient for various data partitions.
-
- It is strongly recommended to vary all parameter values evenly across partitions.
- For instance, given two parameters, providing only the combinations
- (1, 1), (5, 1), (7, 1,) (10, 1), (1, 2), (1, 6) will lead to bogus results.
- It is better to provide (1, 1), (5, 1), (1, 2), (5, 2), ... (i.e. a cross product of all individual parameter values)
-
- :param by_name: ground truth partitioned by state/transition name.
- by_name[state_or_trans][attribute] must be a list or 1-D numpy array.
- by_name[state_or_trans]['param'] must be a list of parameter values
- corresponding to the ground truth, e.g. [[1, 2, 3], ...] if the
- first ground truth element has the (lexically) first parameter set to 1,
- the second to 2 and the third to 3.
- :param by_param: ground truth partitioned by state/transition name and parameters.
- by_name[(state_or_trans, *)][attribute] must be a list or 1-D numpy array.
- :param parameter_names: list of parameter names, must have the same order as the parameter
- values in by_param (lexical sorting is recommended).
- :param arg_count: dict providing the number of functions args ("local parameters") for each function.
- :param state_or_trans: state or transition name, e.g. 'send' or 'TX'
- :param attribute: model attribute, e.g. 'power' or 'duration'
- :param verbose: print warning if some parameter partitions are too small for fitting
-
- :returns: a dict with the following content:
- std_static -- static parameter-unaware model error: stddev of by_name[state_or_trans][attribute]
- std_param_lut -- static parameter-aware model error: mean stddev of by_param[(state_or_trans, *)][attribute]
- std_by_param -- static parameter-aware model error ignoring a single parameter.
- dictionary with one key per parameter. The value is the mean stddev
- of measurements where all other parameters are fixed and the parameter
- in question is variable. E.g. std_by_param['X'] is the mean stddev of
- by_param[(state_or_trans, (X=*, Y=..., Z=...))][attribute].
- std_by_arg -- same, but ignoring a single function argument
- Only set if state_or_trans appears in arg_count, empty dict otherwise.
- corr_by_param -- correlation coefficient
- corr_by_arg -- same, but ignoring a single function argument
- Only set if state_or_trans appears in arg_count, empty dict otherwise.
- """
- ret = {
- 'std_static' : np.std(by_name[state_or_trans][attribute]),
- 'std_param_lut' : np.mean([np.std(by_param[x][attribute]) for x in by_param.keys() if x[0] == state_or_trans]),
- 'std_by_param' : {},
- 'std_by_param_values' : {},
- 'lut_by_param_values' : {},
- 'std_by_arg' : [],
- 'std_by_arg_values' : [],
- 'lut_by_arg_values' : [],
- 'corr_by_param' : {},
- 'corr_by_arg' : [],
- 'depends_on_param' : {},
- 'depends_on_arg' : [],
- 'param_data' : {},
- }
-
- np.seterr('raise')
-
- for param_idx, param in enumerate(parameter_names):
- std_matrix, mean_std, lut_matrix = _std_by_param(by_param, self.distinct_values_by_param_index[state_or_trans], state_or_trans, attribute, param_idx, verbose)
- ret['std_by_param'][param] = mean_std
- ret['std_by_param_values'][param] = std_matrix
- ret['lut_by_param_values'][param] = lut_matrix
- ret['corr_by_param'][param] = _corr_by_param(by_name, state_or_trans, attribute, param_idx)
-
- ret['depends_on_param'][param] = self._depends_on_param(ret['corr_by_param'][param], ret['std_by_param'][param], ret['std_param_lut'])
-
- if ret['depends_on_param'][param]:
- ret['param_data'][param] = {
- 'codependent_parameters': self._codependent_parameters(param, lut_matrix, std_matrix),
- 'depends_for_codependent_value': dict()
- }
-
- # calculate parameter dependence for individual values of codependent parameters
- codependent_param_values = list()
- for codependent_param in ret['param_data'][param]['codependent_parameters']:
- codependent_param_values.append(self.distinct_values[state_or_trans][codependent_param])
- for combi in itertools.product(*codependent_param_values):
- by_name_part = deepcopy(by_name)
- filter_list = list(zip(ret['param_data'][param]['codependent_parameters'], combi))
- filter_aggregate_by_param(by_name_part, parameter_names, filter_list)
- by_param_part = by_name_to_by_param(by_name_part)
- # there may be no data for this specific parameter value combination
- if state_or_trans in by_name_part:
- part_corr = _corr_by_param(by_name_part, state_or_trans, attribute, param_idx)
- part_std_lut = np.mean([np.std(by_param_part[x][attribute]) for x in by_param_part.keys() if x[0] == state_or_trans])
- _, part_std_param, _ = _std_by_param(by_param_part, self.distinct_values_by_param_index[state_or_trans], state_or_trans, attribute, param_idx, verbose)
- ret['param_data'][param]['depends_for_codependent_value'][combi] = self._depends_on_param(part_corr, part_std_param, part_std_lut)
-
- if state_or_trans in arg_count:
- for arg_index in range(arg_count[state_or_trans]):
- std_matrix, mean_std, lut_matrix = _std_by_param(by_param, self.distinct_values_by_param_index[state_or_trans], state_or_trans, attribute, len(parameter_names) + arg_index, verbose)
- ret['std_by_arg'].append(mean_std)
- ret['std_by_arg_values'].append(std_matrix)
- ret['lut_by_arg_values'].append(lut_matrix)
- ret['corr_by_arg'].append(_corr_by_param(by_name, state_or_trans, attribute, len(parameter_names) + arg_index))
-
- if self.use_corrcoef:
- ret['depends_on_arg'].append(ret['corr_by_arg'][arg_index] > 0.1)
- elif ret['std_by_arg'][arg_index] == 0:
- # In general, std_param_lut < std_by_arg. So, if std_by_arg == 0, std_param_lut == 0 follows.
- # This means that the variation of arg does not affect the model quality -> no influence
- ret['depends_on_arg'].append(False)
- else:
- ret['depends_on_arg'].append(ret['std_param_lut'] / ret['std_by_arg'][arg_index] < 0.5)
-
- return ret
-
def static_submodel_params(self, state_or_tran, attribute):
"""
Return the union of all parameter values which decide whether another parameter influences the model or not.
@@ -474,15 +550,6 @@ class ParamStats:
yield dict(zip(codependent_parameters, param_values))
- def _depends_on_param(self, corr_param, std_param, std_lut):
- if self.use_corrcoef:
- return corr_param > 0.1
- elif std_param == 0:
- # In general, std_param_lut < std_by_param. So, if std_by_param == 0, std_param_lut == 0 follows.
- # This means that the variation of param does not affect the model quality -> no influence
- return False
- return std_lut / std_param < 0.5
-
def _generic_param_independence_ratio(self, state_or_trans, attribute):
"""
Return the heuristic ratio of parameter independence for state_or_trans and attribute.
@@ -507,51 +574,6 @@ class ParamStats:
"""
return 1 - self._generic_param_independence_ratio(state_or_trans, attribute)
- def _reduce_param_matrix(self, matrix: np.ndarray, parameter_names: list) -> list:
- """
- :param matrix: parameter dependence matrix, M[(...)] == 1 iff (model attribute) is influenced by (parameter) for other parameter value indxe == (...)
- :param parameter_names: names of parameters in the order in which they appear in the matrix index. The first entry corresponds to the first axis, etc.
- :returns: parameters which determine whether (parameter) has an effect on (model attribute). If a parameter is not part of this list, its value does not
- affect (parameter)'s influence on (model attribute) -- it either always or never has an influence
- """
- if np.all(matrix == True) or np.all(matrix == False):
- return list()
-
- # Diese Abbruchbedingung scheint noch nicht so schlau zu sein...
- # Mit wird zu viel rausgefiltert (z.B. auto_ack! -> max_retry_count in "bin/analyze-timing.py ../data/20190815_122531_nRF24_no-rx.json" nicht erkannt)
- # Ohne wird zu wenig rausgefiltert (auch ganz viele Abhängigkeiten erkannt, bei denen eine Parameter-Abhängigketi immer unabhängig vom Wert der anderen Parameter besteht)
- #if not is_power_of_two(np.count_nonzero(matrix)):
- # # cannot be reliably reduced to a list of parameters
- # return list()
-
- if np.count_nonzero(matrix) == 1:
- influential_parameters = list()
- for i, parameter_name in enumerate(parameter_names):
- if matrix.shape[i] > 1:
- influential_parameters.append(parameter_name)
- return influential_parameters
-
- for axis in range(matrix.ndim):
- candidate = self._reduce_param_matrix(np.all(matrix, axis=axis), remove_index_from_tuple(parameter_names, axis))
- if len(candidate):
- return candidate
-
- return list()
-
- def _codependent_parameters(self, param, lut_by_param_values, std_by_param_values):
- """
- Return list of parameters which affect whether a parameter affects a model attribute or not.
- """
- safe_div = np.vectorize(lambda x,y: 0. if x == 0 else 1 - x/y)
- ratio_by_value = safe_div(lut_by_param_values, std_by_param_values)
- err_mode = np.seterr('ignore')
- dep_by_value = ratio_by_value > 0.5
- np.seterr(**err_mode)
-
- other_param_list = list(filter(lambda x: x != param, self._parameter_names))
- influencer_parameters = self._reduce_param_matrix(dep_by_value, other_param_list)
- return influencer_parameters
-
def _param_independence_ratio(self, state_or_trans: str, attribute: str, param: str) -> float:
"""
Return the heuristic ratio of parameter independence for state_or_trans, attribute, and param.