author    Daniel Friesel <daniel.friesel@uos.de>  2020-11-02 16:04:12 +0100
committer Daniel Friesel <daniel.friesel@uos.de>  2020-11-02 16:04:12 +0100
commit    ea627ab6d9b47c53e6b1e34837e928c9d599db51 (patch)
tree      40f706c314cbe62b7858848915a43bbe200342ef /lib
parent    69b4f98dc1de99971b3f5fb56fb07c04fa229e9c (diff)
Do not pass the entirety of by_name/by_param to subprocesses

This should speed up analysis quite a bit and also reduce memory usage significantly.
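
In practice, the change pre-filters `by_param` before a fit or statistics job is enqueued, so each worker process only receives the measurements of one state/transition and one attribute. A minimal sketch of that transformation, using the names introduced in the diff below:

    # by_param maps (state_or_tran, param_tuple) -> {attribute: [measurements]}
    # n_by_param maps param_tuple -> [measurements] for a single state/attribute
    n_by_param = dict()
    for k, v in by_param.items():
        if k[0] == state_or_tran:
            n_by_param[k[1]] = v[attribute]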
Diffstat (limited to 'lib')
-rw-r--r-- lib/model.py      |  68
-rw-r--r-- lib/parameters.py | 109
-rw-r--r-- lib/utils.py      |   4
3 files changed, 76 insertions(+), 105 deletions(-)
diff --git a/lib/model.py b/lib/model.py
index 192cea3..1190fb0 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -114,17 +114,17 @@ class ParallelParamFit:
This causes fit() to compute the best-fitting function for this model part.
"""
+ # Transform by_param[(state_or_tran, param_value)][attribute] = ...
+ # into n_by_param[param_value] = ...
+ # (param_value is dynamic, the rest is fixed)
+ n_by_param = dict()
+ for k, v in self.by_param.items():
+ if k[0] == state_or_tran:
+ n_by_param[k[1]] = v[attribute]
self.fit_queue.append(
{
"key": [state_or_tran, attribute, param_name, param_filter],
- "args": [
- self.by_param,
- state_or_tran,
- attribute,
- param_index,
- safe_functions_enabled,
- param_filter,
- ],
+ "args": [n_by_param, param_index, safe_functions_enabled, param_filter],
}
)
@@ -201,20 +201,15 @@ def _try_fits_parallel(arg):
def _try_fits(
- by_param,
- state_or_tran,
- model_attribute,
- param_index,
- safe_functions_enabled=False,
- param_filter: dict = None,
+ n_by_param, param_index, safe_functions_enabled=False, param_filter: dict = None
):
"""
- Determine goodness-of-fit for prediction of `by_param[(state_or_tran, *)][model_attribute]` dependence on `param_index` using various functions.
+ Determine goodness-of-fit for prediction of `n_by_param[(param1_value, param2_value, ...)]` dependence on `param_index` using various functions.
This is done by varying `param_index` while keeping all other parameters constant and doing one least squares optimization for each function and for each combination of the remaining parameters.
The value of the parameter corresponding to `param_index` (e.g. txpower or packet length) is the sole input to the model function.
Only numeric parameter values (as determined by `utils.is_numeric`) are used for fitting, non-numeric values such as None or enum strings are ignored.
- Fitting is only performed if at least three distinct parameter values exist in `by_param[(state_or_tran, *)]`.
+ Fitting is only performed if at least three distinct parameter values exist in `by_param[*]`.
:returns: a dictionary with the following elements:
best -- name of the best-fitting function (see `analytic.functions`). `None` in case of insufficient data.
@@ -223,14 +218,8 @@ def _try_fits(
median_rmsd -- mean Root Mean Square Deviation of a reference model using the median of its respective input data as model value
results -- mean goodness-of-fit measures for the individual functions. See `analytic.functions` for keys and `aggregate_measures` for values
- :param by_param: measurements partitioned by state/transition/... name and parameter values.
- Example: `{('foo', (0, 2)): {'bar': [2]}, ('foo', (0, 4)): {'bar': [4]}, ('foo', (0, 6)): {'bar': [6]}}`
-
- :param state_or_tran: state/transition/... name for which goodness-of-fit will be calculated (first element of by_param key tuple).
- Example: `'foo'`
-
- :param model_attribute: attribute for which goodness-of-fit will be calculated.
- Example: `'bar'`
+ :param n_by_param: measurements of a specific model attribute partitioned by parameter values.
+ Example: `{(0, 2): [2], (0, 4): [4], (0, 6): [6]}`
:param param_index: index of the parameter used as model input
:param safe_functions_enabled: Include "safe" variants of functions with limited argument range.
@@ -239,15 +228,15 @@ def _try_fits(
functions = analytic.functions(safe_functions_enabled=safe_functions_enabled)
- for param_key in filter(lambda x: x[0] == state_or_tran, by_param.keys()):
+ for param_key in n_by_param.keys():
# We might remove elements from 'functions' while iterating over
# its keys. A generator will not allow this, so we need to
# convert to a list.
function_names = list(functions.keys())
for function_name in function_names:
function_object = functions[function_name]
- if is_numeric(param_key[1][param_index]) and not function_object.is_valid(
- param_key[1][param_index]
+ if is_numeric(param_key[param_index]) and not function_object.is_valid(
+ param_key[param_index]
):
functions.pop(function_name, None)
@@ -261,12 +250,11 @@ def _try_fits(
# for each parameter combination:
for param_key in filter(
- lambda x: x[0] == state_or_tran
- and remove_index_from_tuple(x[1], param_index)
+ lambda x: remove_index_from_tuple(x, param_index)
not in seen_parameter_combinations
- and len(by_param[x]["param"])
- and match_parameter_values(by_param[x]["param"][0], param_filter),
- by_param.keys(),
+ and len(n_by_param[x])
+ and match_parameter_values(n_by_param[x][0], param_filter),
+ n_by_param.keys(),
):
X = []
Y = []
@@ -275,24 +263,22 @@ def _try_fits(
# Ensure that each parameter combination is only optimized once. Otherwise, with parameters (1, 2, 5), (1, 3, 5), (1, 4, 5) and param_index == 1,
# the parameter combination (1, *, 5) would be optimized three times, both wasting time and biasing results towards more frequently occuring combinations of non-param_index parameters
- seen_parameter_combinations.add(
- remove_index_from_tuple(param_key[1], param_index)
- )
+ seen_parameter_combinations.add(remove_index_from_tuple(param_key, param_index))
# for each value of the parameter denoted by param_index (all other parameters remain the same):
for k, v in filter(
- lambda kv: param_slice_eq(kv[0], param_key, param_index), by_param.items()
+ lambda kv: param_slice_eq(kv[0], param_key, param_index), n_by_param.items()
):
num_total += 1
- if is_numeric(k[1][param_index]):
+ if is_numeric(k[param_index]):
num_valid += 1
- X.extend([float(k[1][param_index])] * len(v[model_attribute]))
- Y.extend(v[model_attribute])
+ X.extend([float(k[param_index])] * len(v))
+ Y.extend(v)
if num_valid > 2:
X = np.array(X)
Y = np.array(Y)
- other_parameters = remove_index_from_tuple(k[1], param_index)
+ other_parameters = remove_index_from_tuple(k, param_index)
raw_results_by_param[other_parameters] = dict()
results_by_param[other_parameters] = dict()
for function_name, param_function in functions.items():
@@ -318,7 +304,7 @@ def _try_fits(
if not len(ref_results["mean"]):
# Insufficient data for fitting
- # print('[W] Insufficient data for fitting {}/{}/{}'.format(state_or_tran, model_attribute, param_index))
+ # print('[W] Insufficient data for fitting {}'.format(param_index))
return {"best": None, "best_rmsd": np.inf, "results": results}
for (
diff --git a/lib/parameters.py b/lib/parameters.py
index fa966a3..71f98db 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -82,17 +82,17 @@ def _reduce_param_matrix(matrix: np.ndarray, parameter_names: list) -> list:
return list()
-def _std_by_param(by_param, all_param_values, state_or_tran, attribute, param_index):
+def _std_by_param(n_by_param, all_param_values, state_or_tran, attribute, param_index):
u"""
Calculate standard deviations for a static model where all parameters but `param_index` are constant.
- :param by_param: measurements sorted by key/transition name and parameter values
+ :param n_by_param: measurements of a specific model attribute partitioned by parameter values.
+ Example: `{(0, 2): [2], (0, 4): [4], (0, 6): [6]}`
:param all_param_values: distinct values of each parameter in `state_or_tran`.
E.g. for two parameters, the first being None, FOO, or BAR, and the second being 1, 2, 3, or 4, the argument is
`[[None, 'FOO', 'BAR'], [1, 2, 3, 4]]`.
- :param state_or_tran: state or transition name (-> by_param[(state_or_tran, *)])
- :param attribute: model attribute, e.g. 'power' or 'duration'
- (-> by_param[(state_or_tran, *)][attribute])
+ :param state_or_tran: state or transition name for debugging
+ :param attribute: model attribute for debugging, e.g. 'power' or 'duration'
:param param_index: index of variable parameter
:returns: (stddev matrix, mean stddev, LUT matrix)
*stddev matrix* is an ((number of parameters)-1)-dimensional matrix giving the standard deviation of each individual parameter variation partition.
@@ -125,13 +125,10 @@ def _std_by_param(by_param, all_param_values, state_or_tran, attribute, param_in
for param_value in itertools.product(*param_values):
param_partition = list()
std_list = list()
- for k, v in by_param.items():
- if (
- k[0] == state_or_tran
- and (*k[1][:param_index], *k[1][param_index + 1 :]) == param_value
- ):
- param_partition.extend(v[attribute])
- std_list.append(np.std(v[attribute]))
+ for k, v in n_by_param.items():
+ if (*k[:param_index], *k[param_index + 1 :]) == param_value:
+ param_partition.extend(v)
+ std_list.append(np.std(v))
if len(param_partition) > 1:
matrix_index = list(range(len(param_value)))
@@ -162,25 +159,22 @@ def _std_by_param(by_param, all_param_values, state_or_tran, attribute, param_in
) # np.mean([np.std(partition) for partition in partitions])
-def _corr_by_param(by_name, state_or_trans, attribute, param_index):
+def _corr_by_param(attribute_data, param_values, param_index):
"""
- Return correlation coefficient (`np.corrcoef`) of `by_name[state_or_trans][attribute][:]` <-> `by_name[state_or_trans]['param'][:][param_index]`
+ Return correlation coefficient (`np.corrcoef`) of `attribute_data` <-> `param_values[param_index]`
A correlation coefficient close to 1 indicates that the attribute likely depends on the value of the parameter denoted by `param_index`, if it is nearly 0, it likely does not depend on it.
If any value of `param_index` is not numeric (i.e., can not be parsed as float), this function returns 0.
- :param by_name: measurements partitioned by state/transition name
- :param state_or_trans: state or transition name
- :param attribute: model attribute
+ :param attribute_data: list or 1-D numpy array taken from by_name[state_or_trans][attribute].
+ :param param_values: list of parameter values taken from by_name[state_or_trans]["param"].
:param param_index: index of parameter in `by_name[state_or_trans]['param']`
"""
- if _all_params_are_numeric(by_name[state_or_trans], param_index):
- param_values = np.array(
- list((map(lambda x: x[param_index], by_name[state_or_trans]["param"])))
- )
+ if _all_params_are_numeric(param_values, param_index):
+ param_values = np.array(list((map(lambda x: x[param_index], param_values))))
try:
- return np.corrcoef(by_name[state_or_trans][attribute], param_values)[0, 1]
+ return np.corrcoef(attribute_data, param_values)[0, 1]
except FloatingPointError:
# Typically happens when all parameter values are identical.
# Building a correlation coefficient is pointless in this case
@@ -188,16 +182,11 @@ def _corr_by_param(by_name, state_or_trans, attribute, param_index):
return 0.0
except ValueError:
logger.error(
- "ValueError in _corr_by_param(by_name, state_or_trans={}, attribute={}, param_index={})".format(
- state_or_trans, attribute, param_index
- )
+ "ValueError in _corr_by_param(param_index={})".format(param_index)
)
logger.error(
- "while executing np.corrcoef(by_name[{}][{}]={}, {}))".format(
- state_or_trans,
- attribute,
- by_name[state_or_trans][attribute],
- param_values,
+ "while executing np.corrcoef({}, {}))".format(
+ attribute_data, param_values
)
)
raise
@@ -206,8 +195,9 @@ def _corr_by_param(by_name, state_or_trans, attribute, param_index):
def _compute_param_statistics(
- by_name,
- by_param,
+ attribute_data,
+ param_values,
+ n_by_param,
parameter_names,
arg_count,
state_or_trans,
@@ -223,14 +213,14 @@ def _compute_param_statistics(
(1, 1), (5, 1), (7, 1,) (10, 1), (1, 2), (1, 6) will lead to bogus results.
It is better to provide (1, 1), (5, 1), (1, 2), (5, 2), ... (i.e. a cross product of all individual parameter values)
- :param by_name: ground truth partitioned by state/transition name.
- by_name[state_or_trans][attribute] must be a list or 1-D numpy array.
- by_name[state_or_trans]['param'] must be a list of parameter values
+ :param attribute_data: list or 1-D numpy array taken from by_name[state_or_trans][attribute]
+ (ground truth partitioned by state/transition name).
+ :param param_values: list of parameter values
corresponding to the ground truth, e.g. [[1, 2, 3], ...] if the
first ground truth element has the (lexically) first parameter set to 1,
- the second to 2 and the third to 3.
- :param by_param: ground truth partitioned by state/transition name and parameters.
- by_name[(state_or_trans, *)][attribute] must be a list or 1-D numpy array.
+ the second to 2 and the third to 3. Taken from by_name[state_or_trans]["param"].
+ :param n_by_param: measurements of a specific model attribute partitioned by parameter values.
+ Example: `{(0, 2): [2], (0, 4): [4], (0, 6): [6]}`
:param parameter_names: list of parameter names, must have the same order as the parameter
values in by_param (lexical sorting is recommended).
:param arg_count: dict providing the number of functions args ("local parameters") for each function.
@@ -239,12 +229,12 @@ def _compute_param_statistics(
:returns: a dict with the following content:
std_static -- static parameter-unaware model error: stddev of by_name[state_or_trans][attribute]
- std_param_lut -- static parameter-aware model error: mean stddev of by_param[(state_or_trans, *)][attribute]
+ std_param_lut -- static parameter-aware model error: mean stddev of n_by_param[*]
std_by_param -- static parameter-aware model error ignoring a single parameter.
dictionary with one key per parameter. The value is the mean stddev
of measurements where all other parameters are fixed and the parameter
in question is variable. E.g. std_by_param['X'] is the mean stddev of
- by_param[(state_or_trans, (X=*, Y=..., Z=...))][attribute].
+ n_by_param[(X=*, Y=..., Z=...)].
std_by_arg -- same, but ignoring a single function argument
Only set if state_or_trans appears in arg_count, empty dict otherwise.
corr_by_param -- correlation coefficient
@@ -254,14 +244,8 @@ def _compute_param_statistics(
depends_on_arg -- list(bool). Same, but for function arguments, if any.
"""
ret = {
- "std_static": np.std(by_name[state_or_trans][attribute]),
- "std_param_lut": np.mean(
- [
- np.std(by_param[x][attribute])
- for x in by_param.keys()
- if x[0] == state_or_trans
- ]
- ),
+ "std_static": np.std(attribute_data),
+ "std_param_lut": np.mean([np.std(n_by_param[x]) for x in n_by_param.keys()]),
"std_by_param": {},
"std_by_param_values": {},
"lut_by_param_values": {},
@@ -278,7 +262,7 @@ def _compute_param_statistics(
for param_idx, param in enumerate(parameter_names):
std_matrix, mean_std, lut_matrix = _std_by_param(
- by_param,
+ n_by_param,
distinct_values_by_param_index,
state_or_trans,
attribute,
@@ -288,7 +272,7 @@ def _compute_param_statistics(
ret["std_by_param_values"][param] = std_matrix
ret["lut_by_param_values"][param] = lut_matrix
ret["corr_by_param"][param] = _corr_by_param(
- by_name, state_or_trans, attribute, param_idx
+ attribute_data, param_values, param_idx
)
ret["depends_on_param"][param] = _depends_on_param(
@@ -300,7 +284,7 @@ def _compute_param_statistics(
if state_or_trans in arg_count:
for arg_index in range(arg_count[state_or_trans]):
std_matrix, mean_std, lut_matrix = _std_by_param(
- by_param,
+ n_by_param,
distinct_values_by_param_index,
state_or_trans,
attribute,
@@ -311,7 +295,7 @@ def _compute_param_statistics(
ret["lut_by_arg_values"].append(lut_matrix)
ret["corr_by_arg"].append(
_corr_by_param(
- by_name, state_or_trans, attribute, len(parameter_names) + arg_index
+ attribute_data, param_values, len(parameter_names) + arg_index
)
)
@@ -335,7 +319,7 @@ def _compute_param_statistics_parallel(arg):
def _all_params_are_numeric(data, param_idx):
"""Check if all `data['param'][*][param_idx]` elements are numeric, as reported by `utils.is_numeric`."""
- param_values = list(map(lambda x: x[param_idx], data["param"]))
+ param_values = list(map(lambda x: x[param_idx], data))
if len(list(filter(is_numeric, param_values))) == len(param_values):
return True
return False
@@ -434,12 +418,7 @@ class ParamStats:
"""
def __init__(
- self,
- by_name,
- by_param,
- parameter_names,
- arg_count,
- use_corrcoef=False,
+ self, by_name, by_param, parameter_names, arg_count, use_corrcoef=False
):
"""
Compute standard deviation and correlation coefficient on parameterized data partitions.
@@ -482,12 +461,17 @@ class ParamStats:
param
] = self.distinct_values_by_param_index[state_or_tran][i]
for attribute in by_name[state_or_tran]["attributes"]:
+ n_by_param = dict()
+ for k, v in by_param.items():
+ if k[0] == state_or_tran:
+ n_by_param[k[1]] = v[attribute]
stats_queue.append(
{
"key": [state_or_tran, attribute],
"args": [
- by_name,
- by_param,
+ by_name[state_or_tran][attribute],
+ by_name[state_or_tran]["param"],
+ n_by_param,
parameter_names,
arg_count,
state_or_tran,
@@ -498,6 +482,9 @@ class ParamStats:
}
)
+ # Fails if an object is > 2 GB in size. This happens when using
+ # --plot-traces or --pelt, which cause by_param and by_name to contain
+ # "power_traces" data with raw traces
with Pool() as pool:
stats_results = pool.map(_compute_param_statistics_parallel, stats_queue)
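
For context: `Pool.map` pickles every queue entry before handing it to a worker, which is also where the 2 GB limit mentioned in the comment above comes from. A rough way to gauge how much data a single task now serializes (illustrative only, not part of the commit):

    import pickle
    # size in bytes of the arguments sent to one worker process
    task_size = len(pickle.dumps(stats_queue[0]))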
diff --git a/lib/utils.py b/lib/utils.py
index 31fedcf..2ed3d6e 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -152,9 +152,7 @@ def param_slice_eq(a, b, index):
('foo', [1, 4]), ('foo', [2, 4]), 1 -> False
"""
- if (*a[1][:index], *a[1][index + 1 :]) == (*b[1][:index], *b[1][index + 1 :]) and a[
- 0
- ] == b[0]:
+ if (*a[:index], *a[index + 1 :]) == (*b[:index], *b[index + 1 :]):
return True
return False
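
With the simplified keys, `param_slice_eq` now compares plain parameter tuples instead of `(name, parameters)` pairs. A hypothetical call illustrating the new behaviour, mirroring the docstring example:

    param_slice_eq((1, 4), (2, 4), 0)  # True:  tuples are equal once index 0 is ignored
    param_slice_eq((1, 4), (1, 3), 0)  # False: index 1 still differs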