summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBirte Kristina Friesel <birte.friesel@uos.de>2024-03-07 10:52:34 +0100
committerBirte Kristina Friesel <birte.friesel@uos.de>2024-03-07 10:52:34 +0100
commit92b4dd6e05df3b2805570fa1f86c35c33f147bec (patch)
tree96bddb10dc889888269e5dc808319720c74d30a3
parent9754b3a46dad43211539a3dbfbc7c5095bdf30f5 (diff)
DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS=1 → DFATOOL_RMT_RELEVANCE_METHOD=std_by_param
-rw-r--r--README.md2
-rw-r--r--doc/modeling-method.md1
-rw-r--r--lib/parameters.py33
3 files changed, 13 insertions, 23 deletions
diff --git a/README.md b/README.md
index fcc8eba..aafdc51 100644
--- a/README.md
+++ b/README.md
@@ -145,7 +145,7 @@ The following variables may be set to alter the behaviour of dfatool components.
| `DFATOOL_KCONF_IGNORE_STRING` | 0, **1** | Ignore string configuration options. These often hold compiler paths and other not really helpful information. |
| `DFATOOL_REGRESSION_SAFE_FUNCTIONS` | **0**, 1 | Use safe functions only (e.g. 1/x returning 1 for x==0) |
| `DFATOOL_RMT_NONBINARY_NODES` | 0, **1** | Enable non-binary nodes (i.e., nodes with more than two children corresponding to enum variables) in decision trees |
-| `DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS` | **0**, 1 | Ignore parameters deemed irrelevant by stddev heuristic during regression tree generation. Use with caution. |
+| `DFATOOL_RMT_RELEVANCE_METHOD` | **none**, std\_by\_param | Ignore parameters deemed irrelevant by the specified heuristic during regression tree generation. Use with caution. |
| `DFATOOL_PARAM_RELEVANCE_THRESHOLD` | 0 .. **0.5** .. 1 | Threshold for relevant parameter detection: parameter *i* is relevant if mean standard deviation (data partitioned by all parameters) / mean standard deviation (data partitioned by all parameters but *i*) is less than threshold |
| `DFATOOL_RMT_LOSS_IGNORE_SCALAR` | **0**, 1 | Ignore scalar parameters when computing the loss for split node candidates. Instead of computing the loss of a single partition for each `x_i == j`, compute the loss of partitions for `x_i == j` in which non-scalar parameters vary and scalar parameters are constant. This way, scalar parameters do not affect the decision about which non-scalar parameter to use for splitting. |
| `DFATOOL_PARAM_CATEGORICAL_TO_SCALAR` | **0**, 1 | Some models (e.g. FOL, sklearn CART, XGBoost) do not support categorical parameters. Ignore them (0) or convert them to scalar indexes (1). Conversion uses lexical order. |
diff --git a/doc/modeling-method.md b/doc/modeling-method.md
index 98d8fcf..585a8ea 100644
--- a/doc/modeling-method.md
+++ b/doc/modeling-method.md
@@ -43,7 +43,6 @@ All of these are valid regression model trees.
### Related Options
* `--force-tree` builds a tree structure even if dfatool's heuristic indicates that no non-integer parameter affects the modeled performance attribute.
-* `DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS=0` disables the relevant parameter detection heuristic when building the tree structure. By default, irrelevant parameters cannot end up as decision nodes.
* `DFATOOL_RMT_SUBMODEL=fol` makes RMT only consider linear functions (a + bx) in regression analysis. Useful for comparison with LMT / M5.
* `DFATOOL_PARAM_CATEGORICAL_TO_SCALAR=1`
* `DFATOOL_ULS_SKIP_CODEPENDENT_CHECK=1`
diff --git a/lib/parameters.py b/lib/parameters.py
index 8c7c9cb..a154918 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -15,6 +15,10 @@ from .utils import soft_cast_int, soft_cast_float
logger = logging.getLogger(__name__)
+dfatool_fol_relevance_method = os.getenv("DFATOOL_FOL_RELEVANCE_METHOD", None)
+dfatool_symreg_relevance_method = os.getenv("DFATOOL_SYMREG_RELEVANCE_METHOD", None)
+dfatool_rmt_relevance_method = os.getenv("DFATOOL_RMT_RELEVANCE_METHOD", None)
+
def distinct_param_values(param_tuples):
"""
@@ -912,11 +916,8 @@ class ModelAttribute:
return False
def build_fol(self):
- ignore_irrelevant = bool(
- int(os.getenv("DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS", "0"))
- )
ignore_param_indexes = list()
- if ignore_irrelevant:
+ if dfatool_fol_relevance_method == "std_by_param":
for param_index, param in enumerate(self.param_names):
if not self.stats.depends_on_param(param):
ignore_param_indexes.append(param_index)
@@ -964,11 +965,8 @@ class ModelAttribute:
return False
def build_symreg(self):
- ignore_irrelevant = bool(
- int(os.getenv("DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS", "0"))
- )
ignore_param_indexes = list()
- if ignore_irrelevant:
+ if dfatool_symreg_relevance_method == "std_by_param":
for param_index, param in enumerate(self.param_names):
if not self.stats.depends_on_param(param):
ignore_param_indexes.append(param_index)
@@ -1031,7 +1029,6 @@ class ModelAttribute:
with_function_leaves=None,
with_nonbinary_nodes=None,
with_gplearn_symreg=None,
- ignore_irrelevant_parameters=None,
loss_ignore_scalar=None,
threshold=100,
):
@@ -1059,10 +1056,6 @@ class ModelAttribute:
)
if with_gplearn_symreg is None:
with_gplearn_symreg = bool(int(os.getenv("DFATOOL_USE_SYMREG", "0")))
- if ignore_irrelevant_parameters is None:
- ignore_irrelevant_parameters = bool(
- int(os.getenv("DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS", "0"))
- )
if loss_ignore_scalar is None:
loss_ignore_scalar = bool(
int(os.getenv("DFATOOL_RMT_LOSS_IGNORE_SCALAR", "0"))
@@ -1084,7 +1077,6 @@ class ModelAttribute:
self.data,
with_function_leaves=with_function_leaves,
with_nonbinary_nodes=with_nonbinary_nodes,
- ignore_irrelevant_parameters=ignore_irrelevant_parameters,
loss_ignore_scalar=loss_ignore_scalar,
submodel=os.getenv("DFATOOL_RMT_SUBMODEL", "uls"),
threshold=threshold,
@@ -1097,7 +1089,6 @@ class ModelAttribute:
data,
with_function_leaves=False,
with_nonbinary_nodes=True,
- ignore_irrelevant_parameters=True,
loss_ignore_scalar=False,
submodel="uls",
threshold=100,
@@ -1127,11 +1118,12 @@ class ModelAttribute:
loss = list()
ffs_feasible = False
- if ignore_irrelevant_parameters:
- by_param = partition_by_param(data, parameters)
- distinct_values_by_param_index = distinct_param_values(parameters)
- std_lut = np.mean([np.std(v) for v in by_param.values()])
+ if dfatool_rmt_relevance_method:
irrelevant_params = list()
+ if dfatool_rmt_relevance_method == "std_by_param":
+ by_param = partition_by_param(data, parameters)
+ distinct_values_by_param_index = distinct_param_values(parameters)
+ std_lut = np.mean([np.std(v) for v in by_param.values()])
if loss_ignore_scalar:
ffs_eligible_params = list()
@@ -1182,7 +1174,7 @@ class ModelAttribute:
loss.append(np.inf)
continue
- if ignore_irrelevant_parameters:
+ if dfatool_rmt_relevance_method == "std_by_param":
std_by_param = _mean_std_by_params(
by_param,
distinct_values_by_param_index,
@@ -1335,7 +1327,6 @@ class ModelAttribute:
child_data,
with_function_leaves=with_function_leaves,
with_nonbinary_nodes=with_nonbinary_nodes,
- ignore_irrelevant_parameters=ignore_irrelevant_parameters,
loss_ignore_scalar=loss_ignore_scalar,
submodel=submodel,
threshold=threshold,