diff options
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | doc/modeling-method.md | 1 | ||||
-rw-r--r-- | lib/parameters.py | 33 |
3 files changed, 13 insertions, 23 deletions
@@ -145,7 +145,7 @@ The following variables may be set to alter the behaviour of dfatool components. | `DFATOOL_KCONF_IGNORE_STRING` | 0, **1** | Ignore string configuration options. These often hold compiler paths and other not really helpful information. | | `DFATOOL_REGRESSION_SAFE_FUNCTIONS` | **0**, 1 | Use safe functions only (e.g. 1/x returning 1 for x==0) | | `DFATOOL_RMT_NONBINARY_NODES` | 0, **1** | Enable non-binary nodes (i.e., nodes with more than two children corresponding to enum variables) in decision trees | -| `DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS` | **0**, 1 | Ignore parameters deemed irrelevant by stddev heuristic during regression tree generation. Use with caution. | +| `DFATOOL_RMT_RELEVANCE_METHOD` | **none**, std\_by\_param | Ignore parameters deemed irrelevant by the specified heuristic during regression tree generation. Use with caution. | | `DFATOOL_PARAM_RELEVANCE_THRESHOLD` | 0 .. **0.5** .. 1 | Threshold for relevant parameter detection: parameter *i* is relevant if mean standard deviation (data partitioned by all parameters) / mean standard deviation (data partitioned by all parameters but *i*) is less than threshold | | `DFATOOL_RMT_LOSS_IGNORE_SCALAR` | **0**, 1 | Ignore scalar parameters when computing the loss for split node candidates. Instead of computing the loss of a single partition for each `x_i == j`, compute the loss of partitions for `x_i == j` in which non-scalar parameters vary and scalar parameters are constant. This way, scalar parameters do not affect the decision about which non-scalar parameter to use for splitting. | | `DFATOOL_PARAM_CATEGORICAL_TO_SCALAR` | **0**, 1 | Some models (e.g. FOL, sklearn CART, XGBoost) do not support categorical parameters. Ignore them (0) or convert them to scalar indexes (1). Conversion uses lexical order. 
| diff --git a/doc/modeling-method.md b/doc/modeling-method.md index 98d8fcf..585a8ea 100644 --- a/doc/modeling-method.md +++ b/doc/modeling-method.md @@ -43,7 +43,6 @@ All of these are valid regression model trees. ### Related Options * `--force-tree` builds a tree structure even if dfatool's heuristic indicates that no non-integer parameter affects the modeled performance attribute. -* `DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS=0` disables the relevant parameter detection heuristic when building the tree structure. By default, irrelevant parameters cannot end up as decision nodes. * `DFATOOL_RMT_SUBMODEL=fol` makes RMT only consider linear functions (a + bx) in regression analysis. Useful for comparison with LMT / M5. * `DFATOOL_PARAM_CATEGORICAL_TO_SCALAR=1` * `DFATOOL_ULS_SKIP_CODEPENDENT_CHECK=1` diff --git a/lib/parameters.py b/lib/parameters.py index 8c7c9cb..a154918 100644 --- a/lib/parameters.py +++ b/lib/parameters.py @@ -15,6 +15,10 @@ from .utils import soft_cast_int, soft_cast_float logger = logging.getLogger(__name__) +dfatool_fol_relevance_method = os.getenv("DFATOOL_FOL_RELEVANCE_METHOD", None) +dfatool_symreg_relevance_method = os.getenv("DFATOOL_SYMREG_RELEVANCE_METHOD", None) +dfatool_rmt_relevance_method = os.getenv("DFATOOL_RMT_RELEVANCE_METHOD", None) + def distinct_param_values(param_tuples): """ @@ -912,11 +916,8 @@ class ModelAttribute: return False def build_fol(self): - ignore_irrelevant = bool( - int(os.getenv("DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS", "0")) - ) ignore_param_indexes = list() - if ignore_irrelevant: + if dfatool_fol_relevance_method == "std_by_param": for param_index, param in enumerate(self.param_names): if not self.stats.depends_on_param(param): ignore_param_indexes.append(param_index) @@ -964,11 +965,8 @@ class ModelAttribute: return False def build_symreg(self): - ignore_irrelevant = bool( - int(os.getenv("DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS", "0")) - ) ignore_param_indexes = list() - if ignore_irrelevant: + if 
dfatool_symreg_relevance_method == "std_by_param": for param_index, param in enumerate(self.param_names): if not self.stats.depends_on_param(param): ignore_param_indexes.append(param_index) @@ -1031,7 +1029,6 @@ class ModelAttribute: with_function_leaves=None, with_nonbinary_nodes=None, with_gplearn_symreg=None, - ignore_irrelevant_parameters=None, loss_ignore_scalar=None, threshold=100, ): @@ -1059,10 +1056,6 @@ class ModelAttribute: ) if with_gplearn_symreg is None: with_gplearn_symreg = bool(int(os.getenv("DFATOOL_USE_SYMREG", "0"))) - if ignore_irrelevant_parameters is None: - ignore_irrelevant_parameters = bool( - int(os.getenv("DFATOOL_RMT_IGNORE_IRRELEVANT_PARAMS", "0")) - ) if loss_ignore_scalar is None: loss_ignore_scalar = bool( int(os.getenv("DFATOOL_RMT_LOSS_IGNORE_SCALAR", "0")) @@ -1084,7 +1077,6 @@ class ModelAttribute: self.data, with_function_leaves=with_function_leaves, with_nonbinary_nodes=with_nonbinary_nodes, - ignore_irrelevant_parameters=ignore_irrelevant_parameters, loss_ignore_scalar=loss_ignore_scalar, submodel=os.getenv("DFATOOL_RMT_SUBMODEL", "uls"), threshold=threshold, @@ -1097,7 +1089,6 @@ class ModelAttribute: data, with_function_leaves=False, with_nonbinary_nodes=True, - ignore_irrelevant_parameters=True, loss_ignore_scalar=False, submodel="uls", threshold=100, @@ -1127,11 +1118,12 @@ class ModelAttribute: loss = list() ffs_feasible = False - if ignore_irrelevant_parameters: - by_param = partition_by_param(data, parameters) - distinct_values_by_param_index = distinct_param_values(parameters) - std_lut = np.mean([np.std(v) for v in by_param.values()]) + if dfatool_rmt_relevance_method: irrelevant_params = list() + if dfatool_rmt_relevance_method == "std_by_param": + by_param = partition_by_param(data, parameters) + distinct_values_by_param_index = distinct_param_values(parameters) + std_lut = np.mean([np.std(v) for v in by_param.values()]) if loss_ignore_scalar: ffs_eligible_params = list() @@ -1182,7 +1174,7 @@ class ModelAttribute: 
loss.append(np.inf) continue - if ignore_irrelevant_parameters: + if dfatool_rmt_relevance_method == "std_by_param": std_by_param = _mean_std_by_params( by_param, distinct_values_by_param_index, @@ -1335,7 +1327,6 @@ class ModelAttribute: child_data, with_function_leaves=with_function_leaves, with_nonbinary_nodes=with_nonbinary_nodes, - ignore_irrelevant_parameters=ignore_irrelevant_parameters, loss_ignore_scalar=loss_ignore_scalar, submodel=submodel, threshold=threshold, |