| author | Daniel Friesel <daniel.friesel@uos.de> | 2021-12-03 10:39:10 +0100 |
|---|---|---|
| committer | Daniel Friesel <daniel.friesel@uos.de> | 2021-12-03 10:39:10 +0100 |
| commit | baae23bd2bd5342adecfe3d404d55f5df2dd4826 | |
| tree | 371b8c82e07ee837b954625338bf7f6d167d4b31 | |
| parent | e36a84688715f8d0a9af204dd823f548b4b2e4f9 | |
optionally ignore scalar parameters during dtree split generation
| -rw-r--r-- | README.md | 1 |
| -rw-r--r-- | lib/model.py | 16 |
| -rw-r--r-- | lib/parameters.py | 67 |
3 files changed, 68 insertions, 16 deletions
```diff
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -37,3 +37,4 @@ The following variables may be set to alter the behaviour of dfatool components.
 | `DFATOOL_FIT_LINEAR_ONLY` | **0**, 1 | Only consider linear functions (a + bx) in regression analysis. Useful for comparison with Linear Model Trees / M5. |
 | `DFATOOL_REGRESSION_SAFE_FUNCTIONS` | **0**, 1 | Use safe functions only (e.g. 1/x returnning 1 for x==0) |
 | `DFATOOL_DTREE_NONBINARY_NODES` | 0, **1** | Enable non-binary nodes (i.e., nodes with more than two children corresponding to enum variables) in decision trees |
+| `DFATOOL_DTREE_LOSS_IGNORE_SCALAR` | **0**, 1 | Ignore scalar parameters when computing the loss for split node candidates. Instead of computing the loss of a single partition for each `x_i == j`, compute the loss of partitions for `x_i == j` in which non-scalar parameters vary and scalar parameters are constant. This way, scalar parameters do not affect the decision about which non-scalar parameter to use for splitting. |
diff --git a/lib/model.py b/lib/model.py
index f25e44f..e5d1647 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -157,8 +157,11 @@ class AnalyticModel:
         with_nonbinary_nodes = bool(
             int(os.getenv("DFATOOL_DTREE_NONBINARY_NODES", "1"))
         )
+        loss_ignore_scalar = bool(
+            int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
+        )
         logger.debug(
-            f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes})"
+            f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
         )
         self.build_dtree(
             name,
@@ -166,6 +169,7 @@ class AnalyticModel:
             threshold=threshold,
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            loss_ignore_scalar=loss_ignore_scalar,
         )

         self.fit_done = True
@@ -189,6 +193,7 @@ class AnalyticModel:
             self.parameters,
             self._num_args.get(name, 0),
             codependent_param=codependent_param,
+            param_type=self.param_type_by_name[name],
         )
         self.attr_by_name[name][attr] = model_attr
         paramstats.enqueue((name, attr), model_attr)
@@ -319,8 +324,11 @@ class AnalyticModel:
             and attr in self.dtree_max_std[name]
         ):
             threshold = self.dtree_max_std[name][attr]
+        loss_ignore_scalar = bool(
+            int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
+        )
         logger.debug(
-            f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes})"
+            f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
         )
         self.build_dtree(
             name,
@@ -328,6 +336,7 @@ class AnalyticModel:
             threshold=threshold,
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            loss_ignore_scalar=loss_ignore_scalar,
         )
     else:
         self.attr_by_name[name][attr].set_data_from_paramfit(paramfit)
@@ -405,6 +414,7 @@ class AnalyticModel:
         threshold=100,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        loss_ignore_scalar=False,
     ):

         if name not in self.attr_by_name:
@@ -417,6 +427,7 @@ class AnalyticModel:
                 self.by_name[name][attribute],
                 self.by_name[name]["param"],
                 self.parameters,
+                param_type=ParamType(self.by_name[name]["param"]),
             )

         # temporary hack for ResKIL / kconfig-webconf evaluation of regression trees with function nodes
@@ -431,6 +442,7 @@ class AnalyticModel:
             self.by_name[name][attribute],
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            loss_ignore_scalar=loss_ignore_scalar,
             threshold=threshold,
         )
diff --git a/lib/parameters.py b/lib/parameters.py
index 1b46a8e..266bcec 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -540,6 +540,7 @@ class ModelAttribute:
         param_names,
         arg_count=0,
         codependent_param=dict(),
+        param_type=dict(),
     ):

         # Data for model generation
@@ -556,6 +557,22 @@ class ModelAttribute:
             map(lambda i: f"arg{i}", range(arg_count))
         )

+        # dict: Parameter index -> Parameter type (UNSET, BOOLEAN, SCALAR, ...)
+        self.param_type = param_type
+
+        self.nonscalar_param_indexes = list(
+            map(
+                lambda kv: kv[0],
+                filter(lambda kv: kv[1] != ParamType.SCALAR, self.param_type.items()),
+            )
+        )
+        self.scalar_param_indexes = list(
+            map(
+                lambda kv: kv[0],
+                filter(lambda kv: kv[1] == ParamType.SCALAR, self.param_type.items()),
+            )
+        )
+
         # Co-dependent parameters. If (param1_index, param2_index) in codependent_param, they are codependent.
         # In this case, only one of them must be used for parameter-dependent model attribute detection and modeling
         self.codependent_param_pair = codependent_param
@@ -826,24 +843,33 @@ class ModelAttribute:
         data,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        loss_ignore_scalar=False,
         threshold=100,
     ):
         """
         Build a Decision Tree on `param` / `data` for kconfig models.

-        :param this_symbols: parameter names
-        :param this_data: list of measurements. Each entry is a (param vector, mearusements vector) tuple.
-            param vector holds parameter values (same order as parameter names). mearuserements vector holds measurements.
-        :param data_index: Index in measurements vector to use for model generation. Default 0.
-        :param threshold: Return a StaticFunction leaf node if std(data[data_index]) < threshold. Default 100.
+        :param parameters: parameter values for each measurement. [(data 1 param 1, data 1 param 2, ...), (data 2 param 1, data 2 param 2, ...), ...]
+        :param data: Measurements. [data 1, data 2, data 3, ...]
+        :param with_function_leaves: Use fitted function sets to generate function leaves for scalar parameters
+        :param with_nonbinary_nodes: Allow non-binary nodes for enum and scalar parameters (i.e., nodes with more than two children)
+        :param loss_ignore_scalar: Ignore scalar parameters when computing the loss for split candidates. Only sensible if with_function_leaves is enabled.
+        :param threshold: Return a StaticFunction leaf node if std(data) < threshold. Default 100.

         :returns: SplitFunction or StaticFunction
         """
+
+        if loss_ignore_scalar and not with_function_leaves:
+            logger.warning(
+                "build_dtree called with loss_ignore_scalar=True, with_function_leaves=False. This does not make sense."
+            )
+
         self.model_function = self._build_dtree(
             parameters,
             data,
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            loss_ignore_scalar=loss_ignore_scalar,
             threshold=threshold,
         )
@@ -853,23 +879,23 @@ class ModelAttribute:
         data,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        loss_ignore_scalar=False,
         threshold=100,
         level=0,
     ):
         """
         Build a Decision Tree on `param` / `data` for kconfig models.

-        :param this_symbols: parameter names
-        :param this_data: list of measurements. Each entry is a (param vector, mearusements vector) tuple.
-            param vector holds parameter values (same order as parameter names). mearuserements vector holds measurements.
-        :param data_index: Index in measurements vector to use for model generation. Default 0.
-        :param threshold: Return a StaticFunction leaf node if std(data[data_index]) < threshold. Default 100.
+        :param parameters: parameter values for each measurement. [(data 1 param 1, data 1 param 2, ...), (data 2 param 1, data 2 param 2, ...), ...]
+        :param data: Measurements. [data 1, data 2, data 3, ...]
+        :param with_function_leaves: Use fitted function sets to generate function leaves for scalar parameters
+        :param with_nonbinary_nodes: Allow non-binary nodes for enum and scalar parameters (i.e., nodes with more than two children)
+        :param loss_ignore_scalar: Ignore scalar parameters when computing the loss for split candidates. Only sensible if with_function_leaves is enabled.
+        :param threshold: Return a StaticFunction leaf node if std(data) < threshold. Default 100.

         :returns: SplitFunction or StaticFunction
         """
-        # TODO remove data entries which are None (and remove corresponding parameters, too!)
-
         param_count = len(self.param_names) + self.arg_count

         if param_count == 0 or np.std(data) < threshold:
             return df.StaticFunction(np.mean(data))
@@ -889,14 +915,16 @@ class ModelAttribute:
                 mean_stds.append(np.inf)
                 continue

+            # if not with_nonbinary_nodes and sorted(unique_values) != [0, 1]:
             if not with_nonbinary_nodes and len(unique_values) > 2:
+                # param cannot be handled with a binary split
                 mean_stds.append(np.inf)
                 continue

             if (
                 with_function_leaves
+                and self.param_type[param_index] == ParamType.SCALAR
                 and len(unique_values) >= self.min_values_for_analytic_model
-                and all(map(lambda x: type(x) is int, unique_values))
             ):
                 # param can be modeled as a function. Do not split on it.
                 mean_stds.append(np.inf)
@@ -920,7 +948,17 @@ class ModelAttribute:
             children = list()
             for child in child_indexes:
-                children.append(np.std(list(map(lambda i: data[i], child))))
+                child_data = list(map(lambda i: data[i], child))
+                if loss_ignore_scalar:
+                    child_param = list(map(lambda i: parameters[i], child))
+                    child_data_by_scalar = partition_by_param(
+                        child_data,
+                        child_param,
+                        ignore_parameters=self.nonscalar_param_indexes,
+                    )
+                    children.extend(map(np.std, child_data_by_scalar.values()))
+                else:
+                    children.append(np.std(child_data))

             if np.any(np.isnan(children)):
                 mean_stds.append(np.inf)
@@ -938,6 +976,7 @@ class ModelAttribute:
             parameters,
             self.param_names,
             arg_count=self.arg_count,
+            param_type=self.param_type,
         )
         ParamStats.compute_for_attr(ma)
         paramfit = ParamFit(parallel=False)
```
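For orientation, here is a minimal, self-contained sketch of the split-loss change shown in the `lib/parameters.py` hunks above: with `loss_ignore_scalar` enabled, each child partition of a split candidate is grouped further so that scalar parameters are constant within each group, and the loss is computed per group instead of over the child as a whole. The `partition_by_param` function below is a simplified stand-in for dfatool's helper of the same name, and `split_candidate_loss` is a hypothetical name used only for this illustration.

```python
import numpy as np


def partition_by_param(data, param_values, ignore_parameters=()):
    # Simplified stand-in for dfatool's partition_by_param helper:
    # group measurements by their parameter tuple, masking out the
    # parameter indexes listed in ignore_parameters.
    partitions = dict()
    for measurement, params in zip(data, param_values):
        key = tuple(
            None if i in ignore_parameters else v for i, v in enumerate(params)
        )
        partitions.setdefault(key, []).append(measurement)
    return partitions


def split_candidate_loss(
    child_data, child_param, nonscalar_param_indexes, loss_ignore_scalar
):
    # Loss contributions of one child partition of a split candidate
    # (all measurements with x_i == j for candidate parameter x_i).
    if loss_ignore_scalar:
        # Hold scalar parameters constant within each group, so the remaining
        # variance stems from non-scalar parameters only and scalar parameters
        # do not influence which parameter is chosen for splitting.
        by_scalar = partition_by_param(
            child_data, child_param, ignore_parameters=nonscalar_param_indexes
        )
        return [np.std(v) for v in by_scalar.values()]
    # Default behaviour: one standard deviation over the entire child.
    return [np.std(child_data)]


# Hypothetical example: parameter 0 is an enum (non-scalar), parameter 1 is a
# scalar. All four measurements belong to the same split candidate child.
child_param = [("a", 10), ("a", 20), ("b", 10), ("b", 20)]
child_data = [100, 200, 110, 210]

print(split_candidate_loss(child_data, child_param, [0], loss_ignore_scalar=False))
# -> [50.24...]  (scalar-induced variance dominates the loss)
print(split_candidate_loss(child_data, child_param, [0], loss_ignore_scalar=True))
# -> [5.0, 5.0]  (variance with scalar parameters held constant per group)
```

In dfatool itself the behaviour is opt-in: setting `DFATOOL_DTREE_LOSS_IGNORE_SCALAR=1` in the environment enables it, and `lib/model.py` reads the flag via `bool(int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0")))` as shown in the hunks above.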