From c29a59c2b0ca67f0169beef9312b3cfff19416d5 Mon Sep 17 00:00:00 2001
From: Daniel Friesel
Date: Wed, 9 Feb 2022 16:25:23 +0100
Subject: allow dtree relevant parameter detection to be disabled

---
 README.md         |  1 +
 lib/model.py      | 22 +++++++++++++++++++---
 lib/parameters.py | 18 ++++++++++++------
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4d7354e..87ddc85 100644
--- a/README.md
+++ b/README.md
@@ -43,5 +43,6 @@ The following variables may be set to alter the behaviour of dfatool components
 | `DFATOOL_FIT_LINEAR_ONLY` | **0**, 1 | Only consider linear functions (a + bx) in regression analysis. Useful for comparison with Linear Model Trees / M5. |
 | `DFATOOL_REGRESSION_SAFE_FUNCTIONS` | **0**, 1 | Use safe functions only (e.g. 1/x returnning 1 for x==0) |
 | `DFATOOL_DTREE_NONBINARY_NODES` | 0, **1** | Enable non-binary nodes (i.e., nodes with more than two children corresponding to enum variables) in decision trees |
+| `DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS` | 0, **1** | Ignore parameters deemed irrelevant by stddev heuristic during regression tree generation |
 | `DFATOOL_DTREE_LOSS_IGNORE_SCALAR` | **0**, 1 | Ignore scalar parameters when computing the loss for split node candidates. Instead of computing the loss of a single partition for each `x_i == j`, compute the loss of partitions for `x_i == j` in which non-scalar parameters vary and scalar parameters are constant. This way, scalar parameters do not affect the decision about which non-scalar parameter to use for splitting. |
 | `DFATOOL_PARAM_CATEGORIAL_TO_SCALAR` | **0**, 1 | Some models (e.g. sklearn CART, XGBoost) do not support categorial parameters. Ignore them (0) or convert them to scalar indexes (1). |
diff --git a/lib/model.py b/lib/model.py
index 3b1279f..227a323 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -162,11 +162,14 @@ class AnalyticModel:
             )
             with_lmt = bool(int(os.getenv("DFATOOL_DTREE_LMT", "0")))
             with_xgboost = bool(int(os.getenv("DFATOOL_USE_XGBOOST", "0")))
+            ignore_irrelevant_parameters = bool(
+                int(os.getenv("DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS", "1"))
+            )
             loss_ignore_scalar = bool(
                 int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
             )
             logger.debug(
-                f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
+                f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, ignore_irrelevant_parameters={ignore_irrelevant_parameters}, loss_ignore_scalar={loss_ignore_scalar})"
             )
             self.build_dtree(
                 name,
@@ -177,6 +180,7 @@ class AnalyticModel:
                 with_sklearn_cart=with_sklearn_cart,
                 with_lmt=with_lmt,
                 with_xgboost=with_xgboost,
+                ignore_irrelevant_parameters=ignore_irrelevant_parameters,
                 loss_ignore_scalar=loss_ignore_scalar,
             )
         self.fit_done = True
@@ -330,6 +334,11 @@ class AnalyticModel:
                     )
                     with_lmt = bool(int(os.getenv("DFATOOL_DTREE_LMT", "0")))
                     with_xgboost = bool(int(os.getenv("DFATOOL_USE_XGBOOST", "0")))
+                    ignore_irrelevant_parameters = bool(
+                        int(
+                            os.getenv("DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS", "1")
+                        )
+                    )
                     loss_ignore_scalar = bool(
                         int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
                     )
@@ -341,7 +350,7 @@ class AnalyticModel:
                     ):
                         threshold = self.dtree_max_std[name][attr]
                     logger.debug(
-                        f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
+                        f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, ignore_irrelevant_parameters={ignore_irrelevant_parameters}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
                     )
                     self.build_dtree(
                         name,
@@ -352,6 +361,7 @@ class AnalyticModel:
                         with_sklearn_cart=with_sklearn_cart,
                         with_lmt=with_lmt,
                         with_xgboost=with_xgboost,
+                        ignore_irrelevant_parameters=ignore_irrelevant_parameters,
                         loss_ignore_scalar=loss_ignore_scalar,
                     )
                 else:
@@ -433,6 +443,7 @@ class AnalyticModel:
         with_sklearn_cart=False,
         with_lmt=False,
         with_xgboost=False,
+        ignore_irrelevant_parameters=True,
         loss_ignore_scalar=False,
     ):
 
@@ -457,6 +468,7 @@ class AnalyticModel:
                 with_sklearn_cart=with_sklearn_cart,
                 with_lmt=with_lmt,
                 with_xgboost=with_xgboost,
+                ignore_irrelevant_parameters=ignore_irrelevant_parameters,
                 loss_ignore_scalar=loss_ignore_scalar,
                 threshold=threshold,
             )
@@ -759,11 +771,14 @@ class PTAModel(AnalyticModel):
             )
             with_lmt = bool(int(os.getenv("DFATOOL_DTREE_LMT", "0")))
             with_xgboost = bool(int(os.getenv("DFATOOL_USE_XGBOOST", "0")))
+            ignore_irrelevant_parameters = bool(
+                int(os.getenv("DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS", "1"))
+            )
             loss_ignore_scalar = bool(
                 int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
             )
             logger.debug(
-                f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
+                f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, ignore_irrelevant_parameters={ignore_irrelevant_parameters}, loss_ignore_scalar={loss_ignore_scalar})"
             )
             self.build_dtree(
                 name,
@@ -774,6 +789,7 @@ class PTAModel(AnalyticModel):
                 with_sklearn_cart=with_sklearn_cart,
                 with_lmt=with_lmt,
                 with_xgboost=with_xgboost,
+                ignore_irrelevant_parameters=ignore_irrelevant_parameters,
                 loss_ignore_scalar=loss_ignore_scalar,
             )
         self.fit_done = True
diff --git a/lib/parameters.py b/lib/parameters.py
index a615e5f..50a7ae8 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -866,6 +866,7 @@ class ModelAttribute:
         with_sklearn_cart=False,
         with_xgboost=False,
         with_lmt=False,
+        ignore_irrelevant_parameters=True,
         loss_ignore_scalar=False,
         threshold=100,
     ):
@@ -980,6 +981,7 @@ class ModelAttribute:
             data,
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            ignore_irrelevant_parameters=ignore_irrelevant_parameters,
             loss_ignore_scalar=loss_ignore_scalar,
             threshold=threshold,
         )
@@ -990,6 +992,7 @@ class ModelAttribute:
         data,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        ignore_irrelevant_parameters=True,
         loss_ignore_scalar=False,
         threshold=100,
         level=0,
@@ -1050,12 +1053,13 @@ class ModelAttribute:
                     ffs_feasible = True
                 continue
 
-            std_by_param = _mean_std_by_param(
-                by_param, distinct_values_by_param_index, param_index
-            )
-            if not _depends_on_param(None, std_by_param, std_lut):
-                loss.append(np.inf)
-                continue
+            if ignore_irrelevant_parameters:
+                std_by_param = _mean_std_by_param(
+                    by_param, distinct_values_by_param_index, param_index
+                )
+                if not _depends_on_param(None, std_by_param, std_lut):
+                    loss.append(np.inf)
+                    continue
 
             child_indexes = list()
             for value in unique_values:
@@ -1141,6 +1145,8 @@ class ModelAttribute:
                     child_data,
                     with_function_leaves=with_function_leaves,
                     with_nonbinary_nodes=with_nonbinary_nodes,
+                    ignore_irrelevant_parameters=ignore_irrelevant_parameters,
+                    loss_ignore_scalar=loss_ignore_scalar,
                     threshold=threshold,
                     level=level + 1,
                 )
--
cgit v1.2.3
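
Editor's note: for orientation, here is a condensed sketch (not part of the commit) of the behaviour the patch introduces. `relevance_check` and `partition_loss` are hypothetical stand-ins for the `_mean_std_by_param`/`_depends_on_param` check and the per-partition loss computation; only the gating pattern itself is taken from the diff — see `ModelAttribute.build_dtree` in `lib/parameters.py` for the actual implementation.

```python
import os

import numpy as np

# The new toggle, read from the environment during model fitting.
# Default "1": keep the stddev-based relevance heuristic enabled.
ignore_irrelevant_parameters = bool(
    int(os.getenv("DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS", "1"))
)


def split_loss(param_index, relevance_check, partition_loss):
    """Loss of splitting the regression tree on param_index.

    relevance_check and partition_loss are hypothetical stand-ins
    for dfatool's actual heuristics, used here only to illustrate
    the control flow added by this commit.
    """
    if ignore_irrelevant_parameters and not relevance_check(param_index):
        # The heuristic deems the parameter irrelevant: assign it
        # infinite loss so it is never chosen as a split candidate.
        return np.inf
    return partition_loss(param_index)
```

With `DFATOOL_DTREE_IGNORE_IRRELEVANT_PARAMS=0` in the environment, the check is skipped entirely and every parameter competes on its actual partition loss. Note also that the final hunk makes the recursive `build_dtree` call forward `loss_ignore_scalar`, so both flags now propagate into subtrees.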