author    Daniel Friesel <daniel.friesel@uos.de>    2022-03-01 08:40:01 +0100
committer Daniel Friesel <daniel.friesel@uos.de>    2022-03-01 08:40:01 +0100
commit    9d42e811a1cd70dc87cc96f4d847fb239ae88d64 (patch)
tree      818929561edd91ad3b6115ec1d6de8e754446457
parent    8813bc4f07bcb6960845beef1d0908bade927215 (diff)
Add SKLEARN DECART support
-rw-r--r--  README.md          |  1
-rw-r--r--  lib/model.py       | 14
-rw-r--r--  lib/parameters.py  | 28
3 files changed, 38 insertions, 5 deletions
diff --git a/README.md b/README.md
index 87ddc85..df7dd3b 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ The following variables may be set to alter the behaviour of dfatool components.
 | `DFATOOL_DTREE_ENABLED` | 0, **1** | Use decision trees in get\_fitted |
 | `DFATOOL_DTREE_FUNCTION_LEAVES` | 0, **1** | Use functions (fitted via linear regression) in decision tree leaves when modeling numeric parameters with at least three distinct values. If 0, integer parameters are treated as enums instead. |
 | `DFATOOL_DTREE_SKLEARN_CART` | **0**, 1 | Use sklearn CART ("Decision Tree Regression") algorithm for decision tree generation. Uses binary nodes and supports splits on scalar variables. Overrides `FUNCTION_LEAVES` (=0) and `NONBINARY_NODES` (=0). |
+| `DFATOOL_DTREE_SKLEARN_DECART` | **0**, 1 | Use sklearn CART ("Decision Tree Regression") algorithm for decision tree generation. Ignores scalar parameters, thus emulating the DECART algorithm. |
 | `DFATOOL_DTREE_LMT` | **0**, 1 | Use [Linear Model Tree](https://github.com/cerlymarco/linear-tree) algorithm for regression tree generation. Uses binary nodes and linear functions. Overrides `FUNCTION_LEAVES` (=0) and `NONBINARY_NODES` (=0). |
 | `DFATOOL_CART_MAX_DEPTH` | **0** .. *n* | Maximum depth for sklearn CART. Default: unlimited. |
 | `DFATOOL_USE_XGBOOST` | **0**, 1 | Use Extreme Gradient Boosting algorithm for decision forest generation. |
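
For context, a minimal sketch of how these switches are consumed: dfatool reads them lazily via `os.getenv` at model-fitting time (as the hunks below show), so setting the variable in the process environment before fitting is sufficient. This snippet is illustrative and not part of the commit:

```python
import os

# Enable DECART-style tree generation before building models.
# The flag is parsed as bool(int(...)), so only "0" and "1" are valid values;
# anything else (e.g. "yes") would raise a ValueError.
os.environ["DFATOOL_DTREE_SKLEARN_DECART"] = "1"

with_sklearn_decart = bool(int(os.getenv("DFATOOL_DTREE_SKLEARN_DECART", "0")))
assert with_sklearn_decart is True
```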
diff --git a/lib/model.py b/lib/model.py
index 558f049..baa22da 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -160,6 +160,9 @@ class AnalyticModel:
             with_sklearn_cart = bool(
                 int(os.getenv("DFATOOL_DTREE_SKLEARN_CART", "0"))
             )
+            with_sklearn_decart = bool(
+                int(os.getenv("DFATOOL_DTREE_SKLEARN_DECART", "0"))
+            )
             with_lmt = bool(int(os.getenv("DFATOOL_DTREE_LMT", "0")))
             with_xgboost = bool(int(os.getenv("DFATOOL_USE_XGBOOST", "0")))
             ignore_irrelevant_parameters = bool(
@@ -178,6 +181,7 @@ class AnalyticModel:
                     with_function_leaves=with_function_leaves,
                     with_nonbinary_nodes=with_nonbinary_nodes,
                     with_sklearn_cart=with_sklearn_cart,
+                    with_sklearn_decart=with_sklearn_decart,
                     with_lmt=with_lmt,
                     with_xgboost=with_xgboost,
                     ignore_irrelevant_parameters=ignore_irrelevant_parameters,
@@ -332,6 +336,9 @@ class AnalyticModel:
             with_sklearn_cart = bool(
                 int(os.getenv("DFATOOL_DTREE_SKLEARN_CART", "0"))
             )
+            with_sklearn_decart = bool(
+                int(os.getenv("DFATOOL_DTREE_SKLEARN_DECART", "0"))
+            )
             with_lmt = bool(int(os.getenv("DFATOOL_DTREE_LMT", "0")))
             with_xgboost = bool(int(os.getenv("DFATOOL_USE_XGBOOST", "0")))
             ignore_irrelevant_parameters = bool(
@@ -359,6 +366,7 @@ class AnalyticModel:
                     with_function_leaves=with_function_leaves,
                     with_nonbinary_nodes=with_nonbinary_nodes,
                     with_sklearn_cart=with_sklearn_cart,
+                    with_sklearn_decart=with_sklearn_decart,
                     with_lmt=with_lmt,
                     with_xgboost=with_xgboost,
                     ignore_irrelevant_parameters=ignore_irrelevant_parameters,
@@ -450,6 +458,7 @@ class AnalyticModel:
         with_function_leaves=False,
         with_nonbinary_nodes=True,
         with_sklearn_cart=False,
+        with_sklearn_decart=False,
         with_lmt=False,
         with_xgboost=False,
         ignore_irrelevant_parameters=True,
@@ -475,6 +484,7 @@ class AnalyticModel:
                     with_function_leaves=with_function_leaves,
                     with_nonbinary_nodes=with_nonbinary_nodes,
                     with_sklearn_cart=with_sklearn_cart,
+                    with_sklearn_decart=with_sklearn_decart,
                     with_lmt=with_lmt,
                     with_xgboost=with_xgboost,
                     ignore_irrelevant_parameters=ignore_irrelevant_parameters,
@@ -778,6 +788,9 @@ class PTAModel(AnalyticModel):
             with_sklearn_cart = bool(
                 int(os.getenv("DFATOOL_DTREE_SKLEARN_CART", "0"))
             )
+            with_sklearn_decart = bool(
+                int(os.getenv("DFATOOL_DTREE_SKLEARN_DECART", "0"))
+            )
             with_lmt = bool(int(os.getenv("DFATOOL_DTREE_LMT", "0")))
             with_xgboost = bool(int(os.getenv("DFATOOL_USE_XGBOOST", "0")))
             ignore_irrelevant_parameters = bool(
@@ -796,6 +809,7 @@ class PTAModel(AnalyticModel):
                     with_function_leaves=with_function_leaves,
                     with_nonbinary_nodes=with_nonbinary_nodes,
                     with_sklearn_cart=with_sklearn_cart,
+                    with_sklearn_decart=with_sklearn_decart,
                     with_lmt=with_lmt,
                     with_xgboost=with_xgboost,
                     ignore_irrelevant_parameters=ignore_irrelevant_parameters,
diff --git a/lib/parameters.py b/lib/parameters.py
index bca189e..fc6512f 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -38,7 +38,9 @@ def distinct_param_values(param_tuples):
     return distinct_values
 
 
-def param_to_ndarray(param_tuples, with_nan=True, categorial_to_scalar=False):
+def param_to_ndarray(
+    param_tuples, with_nan=True, categorial_to_scalar=False, ignore_indexes=list()
+):
     has_nan = dict()
     has_non_numeric = dict()
     distinct_values = dict()
@@ -71,6 +73,9 @@ def param_to_ndarray(param_tuples, with_nan=True, categorial_to_scalar=False):
         else:
             ignore_index[i] = False
 
+    for i in ignore_indexes:
+        ignore_index[i] = True
+
     ret_tuples = list()
     for param_tuple in param_tuples:
         ret_tuple = list()
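
The new `ignore_indexes` argument marks additional parameter columns as ignored, on top of those `param_to_ndarray` already skips because they are non-numeric or carry NaN. A simplified, self-contained sketch of the resulting column masking; `mask_columns` is a hypothetical stand-in, not dfatool code:

```python
import numpy as np

def mask_columns(param_tuples, ignore_indexes=list()):
    # Drop the columns whose indexes are listed in ignore_indexes,
    # analogous to setting ignore_index[i] = True above.
    return np.asarray(
        [[v for i, v in enumerate(t) if i not in ignore_indexes] for t in param_tuples]
    )

# Ignoring column 1 (e.g. a scalar parameter in DECART mode):
print(mask_columns([(1, 7, 0), (2, 9, 0), (3, 8, 1)], ignore_indexes=[1]))
# [[1 0]
#  [2 0]
#  [3 1]]
```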
@@ -928,6 +933,7 @@ class ModelAttribute:
         with_function_leaves=False,
         with_nonbinary_nodes=True,
         with_sklearn_cart=False,
+        with_sklearn_decart=False,
         with_xgboost=False,
         with_lmt=False,
         ignore_irrelevant_parameters=True,
@@ -943,6 +949,8 @@ class ModelAttribute:
         :param with_nonbinary_nodes: Allow non-binary nodes for enum and scalar parameters (i.e., nodes with more than two children)
         :param with_sklearn_cart: Use `sklearn.tree.DecisionTreeRegressor` CART implementation for tree generation. Does not support categorial (enum)
             and sparse parameters. Both are ignored during fitting. All other options are ignored as well.
+        :param with_sklearn_decart: Use `sklearn.tree.DecisionTreeRegressor` CART implementation in DECART mode for tree generation. CART limitations
+            apply; additionally, scalar parameters are ignored during fitting.
         :param loss_ignore_scalar: Ignore scalar parameters when computing the loss for split candidates. Only sensible if with_function_leaves is enabled.
 
         :param threshold: Return a StaticFunction leaf node if std(data) < threshold. Default 100.
@@ -953,16 +961,26 @@ class ModelAttribute:
int(os.getenv("DFATOOL_PARAM_CATEGORIAL_TO_SCALAR", "0"))
)
- if with_sklearn_cart:
+ if with_sklearn_cart or with_sklearn_decart:
from sklearn.tree import DecisionTreeRegressor
max_depth = int(os.getenv("DFATOOL_CART_MAX_DEPTH", "0"))
if max_depth == 0:
max_depth = None
cart = DecisionTreeRegressor(max_depth=max_depth)
- fit_parameters, category_to_index, ignore_index = param_to_ndarray(
- parameters, with_nan=False, categorial_to_scalar=categorial_to_scalar
- )
+ if with_sklearn_cart:
+ fit_parameters, category_to_index, ignore_index = param_to_ndarray(
+ parameters,
+ with_nan=False,
+ categorial_to_scalar=categorial_to_scalar,
+ )
+ elif with_sklearn_decart:
+ fit_parameters, category_to_index, ignore_index = param_to_ndarray(
+ parameters,
+ with_nan=False,
+ categorial_to_scalar=categorial_to_scalar,
+ ignore_indexes=self.scalar_param_indexes,
+ )
if fit_parameters.shape[1] == 0:
logger.warning(
f"Cannot generate CART for {self.name} {self.attr} due to lack of parameters: parameter shape is {np.array(parameters).shape}, fit_parameter shape is {fit_parameters.shape}"