author     Daniel Friesel <daniel.friesel@uos.de>  2021-12-23 15:33:45 +0100
committer  Daniel Friesel <daniel.friesel@uos.de>  2021-12-23 15:33:45 +0100
commit     3473ff227b9c085b5333147f0c2f6cb1e431c875 (patch)
tree       ab782568c0adea3e4b00be786299525f8b3e7692
parent     a9d538afbd9d766a35093e851fbe5c12112fb2eb (diff)
model: add sklearn CART support (CART with scalar features)
-rw-r--r--  lib/functions.py   30
-rw-r--r--  lib/model.py       16
-rw-r--r--  lib/parameters.py  48
3 files changed, 91 insertions, 3 deletions
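
The new mode is opt-in via environment variables, which are read in lib/model.py and lib/parameters.py below. A minimal usage sketch, assuming the variables are set before model construction; only the variable names and defaults are taken from this commit, the surrounding workflow is an assumption:

import os

# Assumption: set these before instantiating / fitting the model.
os.environ["DFATOOL_DTREE_SKLEARN_CART"] = "1"  # default "0": use dfatool's own decision trees
os.environ["DFATOOL_CART_MAX_DEPTH"] = "8"      # default "0": no depth limit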
diff --git a/lib/functions.py b/lib/functions.py
index b1477da..320e8ed 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -409,6 +409,36 @@ class SubstateFunction(ModelFunction):
return "SubstateFunction"
+class SKLearnRegressionFunction(ModelFunction):
+ def __init__(self, value, regressor, ignore_index):
+ super().__init__(value)
+ self.regressor = regressor
+ self.ignore_index = ignore_index
+
+ def is_predictable(self, param_list=None):
+ """
+ Return whether the model function can be evaluated on the given parameter values.
+
+        For an SKLearnRegressionFunction, this is always the case (i.e., this function always returns true).
+ """
+ return True
+
+ def eval(self, param_list=None):
+ """
+ Evaluate model function with specified param/arg values.
+
+        If param_list is None, this is the static (mean) value observed during training.
+        Otherwise, ignored parameters are dropped and the regressor's prediction is returned.
+        """
+ if param_list is None:
+ return self.value
+ actual_param_list = list()
+ for i, param in enumerate(param_list):
+ if not self.ignore_index[i]:
+ actual_param_list.append(param)
+ return self.regressor.predict([actual_param_list])
+
+
class AnalyticFunction(ModelFunction):
"""
A multi-dimensional model function, generated from a string, which can be optimized using regression.
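
Note on eval above: parameters whose ignore_index entry is true (non-numeric or None-bearing ones, see param_to_ndarray in lib/parameters.py below) are dropped before querying the regressor. A minimal sketch with a stand-in regressor and made-up training data:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Stand-in regressor trained on the two numeric parameters only.
regressor = DecisionTreeRegressor()
regressor.fit(np.array([[1, 10], [2, 20]]), np.array([5.0, 9.0]))

# The third parameter is non-numeric, so it is flagged as ignored.
ignore_index = {0: False, 1: False, 2: True}
param_list = [2, 20, "enum-value"]
actual_param_list = [p for i, p in enumerate(param_list) if not ignore_index[i]]
print(regressor.predict([actual_param_list]))  # -> array([9.])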
diff --git a/lib/model.py b/lib/model.py
index 9133196..4f5f60f 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -157,6 +157,9 @@ class AnalyticModel:
with_nonbinary_nodes = bool(
int(os.getenv("DFATOOL_DTREE_NONBINARY_NODES", "1"))
)
+ with_sklearn_cart = bool(
+ int(os.getenv("DFATOOL_DTREE_SKLEARN_CART", "0"))
+ )
loss_ignore_scalar = bool(
int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
)
@@ -169,6 +172,7 @@ class AnalyticModel:
threshold=threshold,
with_function_leaves=with_function_leaves,
with_nonbinary_nodes=with_nonbinary_nodes,
+ with_sklearn_cart=with_sklearn_cart,
loss_ignore_scalar=loss_ignore_scalar,
)
self.fit_done = True
@@ -317,6 +321,12 @@ class AnalyticModel:
with_nonbinary_nodes = bool(
int(os.getenv("DFATOOL_DTREE_NONBINARY_NODES", "1"))
)
+ with_sklearn_cart = bool(
+ int(os.getenv("DFATOOL_DTREE_SKLEARN_CART", "0"))
+ )
+ loss_ignore_scalar = bool(
+ int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
+ )
threshold = self.attr_by_name[name][attr].stats.std_param_lut
if (
self.dtree_max_std
@@ -324,9 +334,6 @@ class AnalyticModel:
and attr in self.dtree_max_std[name]
):
threshold = self.dtree_max_std[name][attr]
- loss_ignore_scalar = bool(
- int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
- )
logger.debug(
f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
)
@@ -336,6 +343,7 @@ class AnalyticModel:
threshold=threshold,
with_function_leaves=with_function_leaves,
with_nonbinary_nodes=with_nonbinary_nodes,
+ with_sklearn_cart=with_sklearn_cart,
loss_ignore_scalar=loss_ignore_scalar,
)
else:
@@ -414,6 +422,7 @@ class AnalyticModel:
threshold=100,
with_function_leaves=False,
with_nonbinary_nodes=True,
+ with_sklearn_cart=False,
loss_ignore_scalar=False,
):
@@ -435,6 +444,7 @@ class AnalyticModel:
self.by_name[name][attribute],
with_function_leaves=with_function_leaves,
with_nonbinary_nodes=with_nonbinary_nodes,
+ with_sklearn_cart=with_sklearn_cart,
loss_ignore_scalar=loss_ignore_scalar,
threshold=threshold,
)
diff --git a/lib/parameters.py b/lib/parameters.py
index e199153..9cfe145 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -38,6 +38,37 @@ def distinct_param_values(param_tuples):
return distinct_values
+def param_to_ndarray(param_tuples, with_nan=True):
+ has_nan = dict()
+ has_non_numeric = dict()
+
+ for param_tuple in param_tuples:
+ for i, param in enumerate(param_tuple):
+ if not is_numeric(param):
+ if param is None:
+ has_nan[i] = True
+ else:
+ has_non_numeric[i] = True
+
+ ignore_index = dict()
+ for i in range(len(param_tuples[0])):
+ if has_non_numeric.get(i, False):
+ ignore_index[i] = True
+ elif not with_nan and has_nan.get(i, False):
+ ignore_index[i] = True
+ else:
+ ignore_index[i] = False
+
+ ret_tuples = list()
+ for param_tuple in param_tuples:
+ ret_tuple = list()
+ for i, param in enumerate(param_tuple):
+ if not ignore_index[i]:
+ ret_tuple.append(param)
+ ret_tuples.append(ret_tuple)
+ return np.asarray(ret_tuples), ignore_index
+
+
def _depends_on_param(corr_param, std_param, std_lut):
# if self.use_corrcoef:
if False:
@@ -843,6 +874,7 @@ class ModelAttribute:
data,
with_function_leaves=False,
with_nonbinary_nodes=True,
+ with_sklearn_cart=False,
loss_ignore_scalar=False,
threshold=100,
):
@@ -853,12 +885,28 @@ class ModelAttribute:
:param data: Measurements. [data 1, data 2, data 3, ...]
:param with_function_leaves: Use fitted function sets to generate function leaves for scalar parameters
:param with_nonbinary_nodes: Allow non-binary nodes for enum and scalar parameters (i.e., nodes with more than two children)
+        :param with_sklearn_cart: Use the `sklearn.tree.DecisionTreeRegressor` CART implementation for tree generation. It does not support categorical (enum)
+            or sparse parameters; both are ignored during fitting. All other tree-generation options are ignored as well.
:param loss_ignore_scalar: Ignore scalar parameters when computing the loss for split candidates. Only sensible if with_function_leaves is enabled.
:param threshold: Return a StaticFunction leaf node if std(data) < threshold. Default 100.
:returns: SplitFunction or StaticFunction
"""
+ if with_sklearn_cart:
+ from sklearn.tree import DecisionTreeRegressor
+
+ max_depth = int(os.getenv("DFATOOL_CART_MAX_DEPTH", "0"))
+ if max_depth == 0:
+ max_depth = None
+ cart = DecisionTreeRegressor(max_depth=max_depth)
+ fit_parameters, ignore_index = param_to_ndarray(parameters, with_nan=False)
+ cart.fit(fit_parameters, data)
+ self.model_function = df.SKLearnRegressionFunction(
+ np.mean(data), cart, ignore_index
+ )
+ return
+
if loss_ignore_scalar and not with_function_leaves:
logger.warning(
"build_dtree called with loss_ignore_scalar=True, with_function_leaves=False. This does not make sense."