From c29a59c2b0ca67f0169beef9312b3cfff19416d5 Mon Sep 17 00:00:00 2001
From: Daniel Friesel
Date: Wed, 9 Feb 2022 16:25:23 +0100
Subject: allow dtree relevant parameter detection to be disabled

---
 README.md         |  1 +
 lib/model.py      | 22 +++++++++++++++++++---
 lib/parameters.py | 18 ++++++++++++------
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4d7354e..87ddc85 100644
--- a/README.md
+++ b/README.md
@@ -43,5 +43,6 @@ The following variables may be set to alter the behaviour of dfatool components
 | `DFATOOL_FIT_LINEAR_ONLY` | **0**, 1 | Only consider linear functions (a + bx) in regression analysis. Useful for comparison with Linear Model Trees / M5. |
 | `DFATOOL_REGRESSION_SAFE_FUNCTIONS` | **0**, 1 | Use safe functions only (e.g. 1/x returnning 1 for x==0) |
 | `DFATOOL_DTREE_NONBINARY_NODES` | 0, **1** | Enable non-binary nodes (i.e., nodes with more than two children corresponding to enum variables) in decision trees |
+| `DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS` | 0, **1** | Ignore parameters deemed irrelevant by stddev heuristic during regression tree generation |
 | `DFATOOL_DTREE_LOSS_IGNORE_SCALAR` | **0**, 1 | Ignore scalar parameters when computing the loss for split node candidates. Instead of computing the loss of a single partition for each `x_i == j`, compute the loss of partitions for `x_i == j` in which non-scalar parameters vary and scalar parameters are constant. This way, scalar parameters do not affect the decision about which non-scalar parameter to use for splitting. |
 | `DFATOOL_PARAM_CATEGORIAL_TO_SCALAR` | **0**, 1 | Some models (e.g. sklearn CART, XGBoost) do not support categorial parameters. Ignore them (0) or convert them to scalar indexes (1). |
diff --git a/lib/model.py b/lib/model.py
index 3b1279f..227a323 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -162,11 +162,14 @@ class AnalyticModel:
             )
             with_lmt = bool(int(os.getenv("DFATOOL_DTREE_LMT", "0")))
             with_xgboost = bool(int(os.getenv("DFATOOL_USE_XGBOOST", "0")))
+            ignore_irrelevant_parameters = bool(
+                int(os.getenv("DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS", "1"))
+            )
             loss_ignore_scalar = bool(
                 int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
             )
             logger.debug(
-                f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
+                f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, ignore_irrelevant_parameters={ignore_irrelevant_parameters}, loss_ignore_scalar={loss_ignore_scalar})"
             )
             self.build_dtree(
                 name,
@@ -177,6 +180,7 @@ class AnalyticModel:
                 with_sklearn_cart=with_sklearn_cart,
                 with_lmt=with_lmt,
                 with_xgboost=with_xgboost,
+                ignore_irrelevant_parameters=ignore_irrelevant_parameters,
                 loss_ignore_scalar=loss_ignore_scalar,
             )
         self.fit_done = True
@@ -330,6 +334,11 @@ class AnalyticModel:
                     )
                     with_lmt = bool(int(os.getenv("DFATOOL_DTREE_LMT", "0")))
                     with_xgboost = bool(int(os.getenv("DFATOOL_USE_XGBOOST", "0")))
+                    ignore_irrelevant_parameters = bool(
+                        int(
+                            os.getenv("DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS", "1")
+                        )
+                    )
                     loss_ignore_scalar = bool(
                         int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
                     )
@@ -341,7 +350,7 @@ class AnalyticModel:
                     ):
                         threshold = self.dtree_max_std[name][attr]
                     logger.debug(
-                        f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
+                        f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, ignore_irrelevant_parameters={ignore_irrelevant_parameters}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
                     )
                     self.build_dtree(
                         name,
@@ -352,6 +361,7 @@ class AnalyticModel:
                         with_sklearn_cart=with_sklearn_cart,
                         with_lmt=with_lmt,
                         with_xgboost=with_xgboost,
+                        ignore_irrelevant_parameters=ignore_irrelevant_parameters,
                         loss_ignore_scalar=loss_ignore_scalar,
                     )
                 else:
@@ -433,6 +443,7 @@ class AnalyticModel:
         with_sklearn_cart=False,
         with_lmt=False,
         with_xgboost=False,
+        ignore_irrelevant_parameters=True,
         loss_ignore_scalar=False,
     ):
 
@@ -457,6 +468,7 @@ class AnalyticModel:
                 with_sklearn_cart=with_sklearn_cart,
                 with_lmt=with_lmt,
                 with_xgboost=with_xgboost,
+                ignore_irrelevant_parameters=ignore_irrelevant_parameters,
                 loss_ignore_scalar=loss_ignore_scalar,
                 threshold=threshold,
             )
@@ -759,11 +771,14 @@ class PTAModel(AnalyticModel):
             )
             with_lmt = bool(int(os.getenv("DFATOOL_DTREE_LMT", "0")))
             with_xgboost = bool(int(os.getenv("DFATOOL_USE_XGBOOST", "0")))
+            ignore_irrelevant_parameters = bool(
+                int(os.getenv("DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS", "1"))
+            )
             loss_ignore_scalar = bool(
                 int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
             )
             logger.debug(
-                f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
+                f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, ignore_irrelevant_parameters={ignore_irrelevant_parameters}, loss_ignore_scalar={loss_ignore_scalar})"
             )
             self.build_dtree(
                 name,
@@ -774,6 +789,7 @@ class PTAModel(AnalyticModel):
                 with_sklearn_cart=with_sklearn_cart,
                 with_lmt=with_lmt,
                 with_xgboost=with_xgboost,
+                ignore_irrelevant_parameters=ignore_irrelevant_parameters,
                 loss_ignore_scalar=loss_ignore_scalar,
             )
         self.fit_done = True
diff --git a/lib/parameters.py b/lib/parameters.py
index a615e5f..50a7ae8 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -866,6 +866,7 @@ class ModelAttribute:
         with_sklearn_cart=False,
         with_xgboost=False,
         with_lmt=False,
+        ignore_irrelevant_parameters=True,
         loss_ignore_scalar=False,
         threshold=100,
     ):
@@ -980,6 +981,7 @@ class ModelAttribute:
             data,
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            ignore_irrelevant_parameters=ignore_irrelevant_parameters,
             loss_ignore_scalar=loss_ignore_scalar,
             threshold=threshold,
         )
@@ -990,6 +992,7 @@ class ModelAttribute:
         data,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        ignore_irrelevant_parameters=True,
         loss_ignore_scalar=False,
         threshold=100,
         level=0,
@@ -1050,12 +1053,13 @@ class ModelAttribute:
                     ffs_feasible = True
                 continue
 
-            std_by_param = _mean_std_by_param(
-                by_param, distinct_values_by_param_index, param_index
-            )
-            if not _depends_on_param(None, std_by_param, std_lut):
-                loss.append(np.inf)
-                continue
+            if ignore_irrelevant_parameters:
+                std_by_param = _mean_std_by_param(
+                    by_param, distinct_values_by_param_index, param_index
+                )
+                if not _depends_on_param(None, std_by_param, std_lut):
+                    loss.append(np.inf)
+                    continue
 
             child_indexes = list()
             for value in unique_values:
@@ -1141,6 +1145,8 @@ class ModelAttribute:
                     child_data,
                     with_function_leaves=with_function_leaves,
                     with_nonbinary_nodes=with_nonbinary_nodes,
+                    ignore_irrelevant_parameters=ignore_irrelevant_parameters,
+                    loss_ignore_scalar=loss_ignore_scalar,
                     threshold=threshold,
                     level=level + 1,
                 )
--
cgit v1.2.3
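
Editor's note: for orientation, here is a condensed sketch (not part of the commit) of the behaviour the patch introduces. `relevance_check` and `partition_loss` are hypothetical stand-ins for the `_mean_std_by_param`/`_depends_on_param` check and the per-partition loss computation; only the gating pattern itself is taken from the diff — see `ModelAttribute.build_dtree` in `lib/parameters.py` for the actual implementation.

```python
import os

import numpy as np

# The new toggle, read from the environment during model fitting.
# Default "1": keep the stddev-based relevance heuristic enabled.
ignore_irrelevant_parameters = bool(
    int(os.getenv("DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS", "1"))
)


def split_loss(param_index, relevance_check, partition_loss):
    """Loss of splitting the regression tree on param_index.

    relevance_check and partition_loss are hypothetical stand-ins
    for dfatool's actual heuristics, used here only to illustrate
    the control flow added by this commit.
    """
    if ignore_irrelevant_parameters and not relevance_check(param_index):
        # The heuristic deems the parameter irrelevant: assign it
        # infinite loss so it is never chosen as a split candidate.
        return np.inf
    return partition_loss(param_index)
```

With `DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS=0` in the environment, the check is skipped entirely and every parameter competes on its actual partition loss. Note also that the final hunk makes the recursive `build_dtree` call forward `loss_ignore_scalar`, so both flags now propagate into subtrees.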