From c3043d8537e4dceb303929582dab92a6024924ce Mon Sep 17 00:00:00 2001 From: Birte Kristina Friesel Date: Fri, 12 Jan 2024 09:24:23 +0100 Subject: Expose DFATOOL_ULS_MIN_DISTINCT_VALUES training hyper-parameter --- README.md | 1 + lib/parameters.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6da4fcc..d168510 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,7 @@ The following variables may be set to alter the behaviour of dfatool components. | `DFATOOL_DTREE_LMT` | **0**, 1 | Use [Linear Model Tree](https://github.com/cerlymarco/linear-tree) algorithm for regression tree generation. Uses binary nodes and linear functions. Overrides `FUNCTION_LEAVES` (=0) and `NONBINARY_NODES` (=0). | | `DFATOOL_CART_MAX_DEPTH` | **0** .. *n* | maximum depth for sklearn CART. Default (0): unlimited. | | `DFATOOL_ULS_ERROR_METRIC` | **rmsd**, mae, p50, p90 | Error metric to use when selecting best-fitting function during unsupervised least squares (ULS) regression. Least squares regression itself minimizes root mean square deviation (rmsd), hence rmsd is the default. | +| `DFATOOL_ULS_MIN_DISTINCT_VALUES` | 2 .. **3** .. *n* | Minimum number of unique values a parameter must take to be eligible for ULS | | `DFATOOL_USE_XGBOOST` | **0**, 1 | Use Extreme Gradient Boosting algorithm for decision forest generation. | | `DFATOOL_XGB_N_ESTIMATORS` | 1 .. **100** .. *n* | Number of estimators (i.e., trees) for XGBoost. Mandatory. | | `DFATOOL_XGB_MAX_DEPTH` | 2 .. **10** .. *n* | Maximum XGBoost tree depth. XGBoost default: 6 | diff --git a/lib/parameters.py b/lib/parameters.py index 74be565..3173784 100644 --- a/lib/parameters.py +++ b/lib/parameters.py @@ -604,7 +604,11 @@ class ModelAttribute: # There must be at least 3 distinct data values (≠ None) if an analytic model # is to be fitted. For 2 (or fewer) values, decision trees are better. 
- self.min_values_for_analytic_model = 3 + # Exceptions such as DFATOOL_FIT_LINEAR_ONLY=1 (2 values sufficient) + # can be handled via DFATOOL_ULS_MIN_DISTINCT_VALUES + self.min_values_for_analytic_model = int( + os.getenv("DFATOOL_ULS_MIN_DISTINCT_VALUES", "3") + ) def __repr__(self): mean = np.mean(self.data) -- cgit v1.2.3