| author | Daniel Friesel <daniel.friesel@uos.de> | 2021-12-03 10:39:10 +0100 |
|---|---|---|
| committer | Daniel Friesel <daniel.friesel@uos.de> | 2021-12-03 10:39:10 +0100 |
| commit | baae23bd2bd5342adecfe3d404d55f5df2dd4826 | |
| tree | 371b8c82e07ee837b954625338bf7f6d167d4b31 | |
| parent | e36a84688715f8d0a9af204dd823f548b4b2e4f9 | |
optionally ignore scalar parameters during dtree split generation
| -rw-r--r-- | README.md | 1 |
| -rw-r--r-- | lib/model.py | 16 |
| -rw-r--r-- | lib/parameters.py | 67 |
3 files changed, 68 insertions, 16 deletions
```diff
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -37,3 +37,4 @@ The following variables may be set to alter the behaviour of dfatool components.
 | `DFATOOL_FIT_LINEAR_ONLY` | **0**, 1 | Only consider linear functions (a + bx) in regression analysis. Useful for comparison with Linear Model Trees / M5. |
 | `DFATOOL_REGRESSION_SAFE_FUNCTIONS` | **0**, 1 | Use safe functions only (e.g. 1/x returnning 1 for x==0) |
 | `DFATOOL_DTREE_NONBINARY_NODES` | 0, **1** | Enable non-binary nodes (i.e., nodes with more than two children corresponding to enum variables) in decision trees |
+| `DFATOOL_DTREE_LOSS_IGNORE_SCALAR` | **0**, 1 | Ignore scalar parameters when computing the loss for split node candidates. Instead of computing the loss of a single partition for each `x_i == j`, compute the loss of partitions for `x_i == j` in which non-scalar parameters vary and scalar parameters are constant. This way, scalar parameters do not affect the decision about which non-scalar parameter to use for splitting. |
diff --git a/lib/model.py b/lib/model.py
index f25e44f..e5d1647 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -157,8 +157,11 @@ class AnalyticModel:
         with_nonbinary_nodes = bool(
             int(os.getenv("DFATOOL_DTREE_NONBINARY_NODES", "1"))
         )
+        loss_ignore_scalar = bool(
+            int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
+        )
         logger.debug(
-            f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes})"
+            f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
         )
         self.build_dtree(
             name,
@@ -166,6 +169,7 @@ class AnalyticModel:
             threshold=threshold,
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            loss_ignore_scalar=loss_ignore_scalar,
         )

         self.fit_done = True
@@ -189,6 +193,7 @@ class AnalyticModel:
             self.parameters,
             self._num_args.get(name, 0),
             codependent_param=codependent_param,
+            param_type=self.param_type_by_name[name],
         )
         self.attr_by_name[name][attr] = model_attr
         paramstats.enqueue((name, attr), model_attr)
@@ -319,8 +324,11 @@ class AnalyticModel:
             and attr in self.dtree_max_std[name]
         ):
             threshold = self.dtree_max_std[name][attr]
+        loss_ignore_scalar = bool(
+            int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
+        )
         logger.debug(
-            f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes})"
+            f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
         )
         self.build_dtree(
             name,
@@ -328,6 +336,7 @@ class AnalyticModel:
             threshold=threshold,
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            loss_ignore_scalar=loss_ignore_scalar,
         )
     else:
         self.attr_by_name[name][attr].set_data_from_paramfit(paramfit)
@@ -405,6 +414,7 @@ class AnalyticModel:
         threshold=100,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        loss_ignore_scalar=False,
     ):

         if name not in self.attr_by_name:
@@ -417,6 +427,7 @@ class AnalyticModel:
                 self.by_name[name][attribute],
                 self.by_name[name]["param"],
                 self.parameters,
+                param_type=ParamType(self.by_name[name]["param"]),
             )

         # temporary hack for ResKIL / kconfig-webconf evaluation of regression trees with function nodes
@@ -431,6 +442,7 @@ class AnalyticModel:
             self.by_name[name][attribute],
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            loss_ignore_scalar=loss_ignore_scalar,
             threshold=threshold,
         )
diff --git a/lib/parameters.py b/lib/parameters.py
index 1b46a8e..266bcec 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -540,6 +540,7 @@ class ModelAttribute:
         param_names,
         arg_count=0,
         codependent_param=dict(),
+        param_type=dict(),
     ):

         # Data for model generation
@@ -556,6 +557,22 @@ class ModelAttribute:
             map(lambda i: f"arg{i}", range(arg_count))
         )

+        # dict: Parameter index -> Parameter type (UNSET, BOOLEAN, SCALAR, ...)
+        self.param_type = param_type
+
+        self.nonscalar_param_indexes = list(
+            map(
+                lambda kv: kv[0],
+                filter(lambda kv: kv[1] != ParamType.SCALAR, self.param_type.items()),
+            )
+        )
+        self.scalar_param_indexes = list(
+            map(
+                lambda kv: kv[0],
+                filter(lambda kv: kv[1] == ParamType.SCALAR, self.param_type.items()),
+            )
+        )
+
         # Co-dependent parameters. If (param1_index, param2_index) in codependent_param, they are codependent.
         # In this case, only one of them must be used for parameter-dependent model attribute detection and modeling
         self.codependent_param_pair = codependent_param
@@ -826,24 +843,33 @@ class ModelAttribute:
         data,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        loss_ignore_scalar=False,
         threshold=100,
     ):
         """
         Build a Decision Tree on `param` / `data` for kconfig models.

-        :param this_symbols: parameter names
-        :param this_data: list of measurements. Each entry is a (param vector, mearusements vector) tuple.
-            param vector holds parameter values (same order as parameter names). mearuserements vector holds measurements.
-        :param data_index: Index in measurements vector to use for model generation. Default 0.
-        :param threshold: Return a StaticFunction leaf node if std(data[data_index]) < threshold. Default 100.
+        :param parameters: parameter values for each measurement. [(data 1 param 1, data 1 param 2, ...), (data 2 param 1, data 2 param 2, ...), ...]
+        :param data: Measurements. [data 1, data 2, data 3, ...]
+        :param with_function_leaves: Use fitted function sets to generate function leaves for scalar parameters
+        :param with_nonbinary_nodes: Allow non-binary nodes for enum and scalar parameters (i.e., nodes with more than two children)
+        :param loss_ignore_scalar: Ignore scalar parameters when computing the loss for split candidates. Only sensible if with_function_leaves is enabled.
+        :param threshold: Return a StaticFunction leaf node if std(data) < threshold. Default 100.

         :returns: SplitFunction or StaticFunction
         """
+
+        if loss_ignore_scalar and not with_function_leaves:
+            logger.warning(
+                "build_dtree called with loss_ignore_scalar=True, with_function_leaves=False. This does not make sense."
+            )
+
         self.model_function = self._build_dtree(
             parameters,
             data,
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            loss_ignore_scalar=loss_ignore_scalar,
             threshold=threshold,
         )
@@ -853,23 +879,23 @@ class ModelAttribute:
         data,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        loss_ignore_scalar=False,
         threshold=100,
         level=0,
     ):
         """
         Build a Decision Tree on `param` / `data` for kconfig models.

-        :param this_symbols: parameter names
-        :param this_data: list of measurements. Each entry is a (param vector, mearusements vector) tuple.
-            param vector holds parameter values (same order as parameter names). mearuserements vector holds measurements.
-        :param data_index: Index in measurements vector to use for model generation. Default 0.
-        :param threshold: Return a StaticFunction leaf node if std(data[data_index]) < threshold. Default 100.
+        :param parameters: parameter values for each measurement. [(data 1 param 1, data 1 param 2, ...), (data 2 param 1, data 2 param 2, ...), ...]
+        :param data: Measurements. [data 1, data 2, data 3, ...]
+        :param with_function_leaves: Use fitted function sets to generate function leaves for scalar parameters
+        :param with_nonbinary_nodes: Allow non-binary nodes for enum and scalar parameters (i.e., nodes with more than two children)
+        :param loss_ignore_scalar: Ignore scalar parameters when computing the loss for split candidates. Only sensible if with_function_leaves is enabled.
+        :param threshold: Return a StaticFunction leaf node if std(data) < threshold. Default 100.

         :returns: SplitFunction or StaticFunction
         """
-        # TODO remove data entries which are None (and remove corresponding parameters, too!)
-
         param_count = len(self.param_names) + self.arg_count

         if param_count == 0 or np.std(data) < threshold:
             return df.StaticFunction(np.mean(data))
@@ -889,14 +915,16 @@ class ModelAttribute:
                 mean_stds.append(np.inf)
                 continue

+            # if not with_nonbinary_nodes and sorted(unique_values) != [0, 1]:
             if not with_nonbinary_nodes and len(unique_values) > 2:
+                # param cannot be handled with a binary split
                 mean_stds.append(np.inf)
                 continue

             if (
                 with_function_leaves
+                and self.param_type[param_index] == ParamType.SCALAR
                 and len(unique_values) >= self.min_values_for_analytic_model
-                and all(map(lambda x: type(x) is int, unique_values))
             ):
                 # param can be modeled as a function. Do not split on it.
                 mean_stds.append(np.inf)
@@ -920,7 +948,17 @@ class ModelAttribute:
             children = list()
             for child in child_indexes:
-                children.append(np.std(list(map(lambda i: data[i], child))))
+                child_data = list(map(lambda i: data[i], child))
+                if loss_ignore_scalar:
+                    child_param = list(map(lambda i: parameters[i], child))
+                    child_data_by_scalar = partition_by_param(
+                        child_data,
+                        child_param,
+                        ignore_parameters=self.nonscalar_param_indexes,
+                    )
+                    children.extend(map(np.std, child_data_by_scalar.values()))
+                else:
+                    children.append(np.std(child_data))

             if np.any(np.isnan(children)):
                 mean_stds.append(np.inf)
@@ -938,6 +976,7 @@ class ModelAttribute:
             parameters,
             self.param_names,
             arg_count=self.arg_count,
+            param_type=self.param_type,
         )
         ParamStats.compute_for_attr(ma)
         paramfit = ParamFit(parallel=False)
```
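For orientation, here is a minimal, self-contained sketch of the split-loss change shown in the `lib/parameters.py` hunks above: with `loss_ignore_scalar` enabled, each child partition of a split candidate is grouped further so that scalar parameters are constant within each group, and the loss is computed per group instead of over the child as a whole. The `partition_by_param` function below is a simplified stand-in for dfatool's helper of the same name, and `split_candidate_loss` is a hypothetical name used only for this illustration.

```python
import numpy as np


def partition_by_param(data, param_values, ignore_parameters=()):
    # Simplified stand-in for dfatool's partition_by_param helper:
    # group measurements by their parameter tuple, masking out the
    # parameter indexes listed in ignore_parameters.
    partitions = dict()
    for measurement, params in zip(data, param_values):
        key = tuple(
            None if i in ignore_parameters else v for i, v in enumerate(params)
        )
        partitions.setdefault(key, []).append(measurement)
    return partitions


def split_candidate_loss(
    child_data, child_param, nonscalar_param_indexes, loss_ignore_scalar
):
    # Loss contributions of one child partition of a split candidate
    # (all measurements with x_i == j for candidate parameter x_i).
    if loss_ignore_scalar:
        # Hold scalar parameters constant within each group, so the remaining
        # variance stems from non-scalar parameters only and scalar parameters
        # do not influence which parameter is chosen for splitting.
        by_scalar = partition_by_param(
            child_data, child_param, ignore_parameters=nonscalar_param_indexes
        )
        return [np.std(v) for v in by_scalar.values()]
    # Default behaviour: one standard deviation over the entire child.
    return [np.std(child_data)]


# Hypothetical example: parameter 0 is an enum (non-scalar), parameter 1 is a
# scalar. All four measurements belong to the same split candidate child.
child_param = [("a", 10), ("a", 20), ("b", 10), ("b", 20)]
child_data = [100, 200, 110, 210]

print(split_candidate_loss(child_data, child_param, [0], loss_ignore_scalar=False))
# -> [50.24...]  (scalar-induced variance dominates the loss)
print(split_candidate_loss(child_data, child_param, [0], loss_ignore_scalar=True))
# -> [5.0, 5.0]  (variance with scalar parameters held constant per group)
```

In dfatool itself the behaviour is opt-in: setting `DFATOOL_DTREE_LOSS_IGNORE_SCALAR=1` in the environment enables it, and `lib/model.py` reads the flag via `bool(int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0")))` as shown in the hunks above.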