From 97a523327fae89ee0d6e245f226f920b0b67f6d7 Mon Sep 17 00:00:00 2001
From: Daniel Friesel
Date: Tue, 13 Aug 2019 07:39:14 +0200
Subject: _try_fits: documentation, minimal refactoring, note a possible bug

---
 lib/dfatool.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 60 insertions(+), 11 deletions(-)

diff --git a/lib/dfatool.py b/lib/dfatool.py
index fbc5c7f..ecd3051 100755
--- a/lib/dfatool.py
+++ b/lib/dfatool.py
@@ -902,15 +902,51 @@ class ParallelParamFit:
         self.results = pool.map(_try_fits_parallel, self.fit_queue)
 
 def _try_fits_parallel(arg):
+    """
+    Call _try_fits(*arg['args']) and return arg['key'] along with the _try_fits result.
+
+    Must be a module-level function, as it is called from a multiprocessing Pool.
+    """
     return {
         'key' : arg['key'],
         'result' : _try_fits(*arg['args'])
    }
 
-def _try_fits(by_param, state_or_tran, model_attribute, param_index, safe_functions_enabled = False):
+def _try_fits(by_param, state_or_tran, model_attribute, param_index, safe_functions_enabled = False, boolean_parameters = list()):
+    """
+    Determine goodness-of-fit of various functions modeling the dependence of `by_param[(state_or_tran, *)][model_attribute]` on the parameter at `param_index`.
+
+    This is done by varying the parameter at `param_index` while keeping all other parameters constant, and performing one least squares optimization per function and per combination of the remaining parameters.
+    The value of the parameter at `param_index` (e.g. txpower or packet length) is the sole input to each model function.
+
+    Returns a dictionary with the following elements:
+    best -- name of the best-fitting function (see `analytic.functions`)
+    best_rmsd -- mean Root Mean Square Deviation of the best-fitting function over all combinations of the remaining parameters
+    mean_rmsd -- mean Root Mean Square Deviation of a reference model using the mean of its respective input data as model value
+    median_rmsd -- mean Root Mean Square Deviation of a reference model using the median of its respective input data as model value
+    results -- mean goodness-of-fit measures for the individual functions. See `analytic.functions` for keys and `aggregate_measures` for values
+
+    arguments
+    ---
+
+    by_param -- measurements partitioned by state/transition/... name and parameter values.
+    Example: `{('foo', (0, 2)): {'bar': [2]}, ('foo', (0, 4)): {'bar': [4]}, ('foo', (0, 6)): {'bar': [6]}}`
+
+    state_or_tran -- state/transition/... name for which goodness-of-fit will be calculated (first element of each by_param key tuple).
+    Example: `'foo'`
+
+    model_attribute -- attribute for which goodness-of-fit will be calculated.
+    Example: `'bar'`
+
+    param_index -- index of the parameter used as model input
+    safe_functions_enabled -- include "safe" variants of functions with a limited argument range
+    """
+
     functions = analytic.functions(safe_functions_enabled = safe_functions_enabled)
+    #print('_try_fits(..., {}, {}, {})'.format(state_or_tran, model_attribute, param_index))
+
     for param_key in filter(lambda x: x[0] == state_or_tran, by_param.keys()):
         # We might remove elements from 'functions' while iterating over
@@ -928,19 +964,29 @@ def _try_fits(by_param, state_or_tran, model_attribute, param_index, safe_functi
         'median' : []
     }
     results = {}
-
+    results_by_param = {}
+
+    # TODO This function is unfair if the parameter being fitted takes on considerably more distinct values
+    # in one combination of the remaining parameters than in the others. E.g., given the parameter combinations
+    # (0, 2), (0, 4), (0, 6), (0, 8), (0, 10), (0, 12), (2, 2), (2, 4), (2, 6), and a fit of the parameter with index 1,
+    # the measurements with parameter index 0 == 0 carry more weight than those with parameter index 0 == 2.
+    # For classic AEMR-generated benchmarks this does not matter, as they use the same parameter values in all
+    # combinations, but that may change in the future...
+    #
+    # for each parameter combination:
     for param_key in filter(lambda x: x[0] == state_or_tran, by_param.keys()):
         X = []
         Y = []
         num_valid = 0
         num_total = 0
-        for k, v in by_param.items():
-            if param_slice_eq(k, param_key, param_index):
-                num_total += 1
-                if is_numeric(k[1][param_index]):
-                    num_valid += 1
-                    X.extend([float(k[1][param_index])] * len(v[model_attribute]))
-                    Y.extend(v[model_attribute])
+        # for each value of the parameter denoted by param_index (all other parameters remain the same):
+        for k, v in filter(lambda kv: param_slice_eq(kv[0], param_key, param_index), by_param.items()):
+            num_total += 1
+            if is_numeric(k[1][param_index]):
+                num_valid += 1
+                X.extend([float(k[1][param_index])] * len(v[model_attribute]))
+                Y.extend(v[model_attribute])
+
+        #print(param_key, X, Y)
 
         if num_valid > 2:
             X = np.array(X)
@@ -1037,7 +1083,7 @@ class AnalyticModel:
     assess -- calculate model quality
     """
 
-    def __init__(self, by_name, parameters, arg_count = None, verbose = True):
+    def __init__(self, by_name, parameters, arg_count = None, verbose = True, use_corrcoef = False):
         """
         Create a new AnalyticModel and compute parameter statistics.
 
@@ -1067,6 +1113,8 @@ class AnalyticModel:
         }
         `parameters`: List of parameter names
         `verbose`: Print debug/info output while generating the model?
+        `use_corrcoef`: Use the correlation coefficient instead of stddev comparison
+            to detect whether a model attribute depends on a parameter
         """
         self.cache = dict()
         self.by_name = by_name
@@ -1074,11 +1122,12 @@ class AnalyticModel:
         self.names = sorted(by_name.keys())
         self.parameters = sorted(parameters)
         self.verbose = verbose
+        self._use_corrcoef = use_corrcoef
         self._num_args = arg_count
         if self._num_args is None:
             self._num_args = _num_args_from_by_name(by_name)
 
-        self.stats = ParamStats(self.by_name, self.by_param, self.parameters, self._num_args, verbose = verbose)
+        self.stats = ParamStats(self.by_name, self.by_param, self.parameters, self._num_args, verbose = verbose, use_corrcoef = use_corrcoef)
 
     def _get_model_from_dict(self, model_dict, model_function):
         model = {}
-- 
cgit v1.2.3
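
For illustration, the call convention documented in the new _try_fits docstring looks like this in use. This is a minimal sketch, not part of the commit: it assumes lib/ is on the Python import path and dfatool's dependencies (numpy et al.) are installed, and the state name, attribute, and measurement values below are made up.

    # Minimal usage sketch (assumption: lib/ is on sys.path; all names
    # and values are invented for illustration).
    from dfatool import _try_fits

    # Measurements for state 'foo', keyed by (name, (param0, param1)).
    # 'bar' grows roughly linearly with the parameter at index 1, while
    # the parameter at index 0 is constant across all combinations --
    # the benign case described in the TODO note above.
    by_param = {
        ('foo', (0, 2)): {'bar': [2.1, 1.9]},
        ('foo', (0, 4)): {'bar': [4.0, 4.2]},
        ('foo', (0, 6)): {'bar': [6.1, 5.8]},
        ('foo', (0, 8)): {'bar': [7.9, 8.2]},
    }

    # Fit 'bar' of 'foo' against parameter index 1. Four distinct numeric
    # parameter values satisfy the num_valid > 2 requirement in _try_fits.
    fit = _try_fits(by_param, 'foo', 'bar', 1)
    print(fit['best'], fit['best_rmsd'])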