-rw-r--r--   README.md           14
-rw-r--r--   lib/functions.py     7
-rw-r--r--   lib/parameters.py   70
3 files changed, 75 insertions(+), 16 deletions(-)
diff --git a/README.md b/README.md
index 76074cb..53e1eeb 100644
--- a/README.md
+++ b/README.md
@@ -127,13 +127,13 @@ The following variables may be set to alter the behaviour of dfatool components.
| `DFATOOL_ULS_MIN_DISTINCT_VALUES` | 2 .. **3** .. *n* | Minimum number of unique values a parameter must take to be eligible for ULS |
| `DFATOOL_ULS_SKIP_CODEPENDENT_CHECK` | **0**, 1 | Do not detect and remove co-dependent features in ULS. |
| `DFATOOL_USE_XGBOOST` | **0**, 1 | Use Extreme Gradient Boosting algorithm for decision forest generation. |
-| `DFATOOL_XGB_N_ESTIMATORS` | 1 .. **100** .. *n* | Number of estimators (i.e., trees) for XGBoost. Mandatory. |
-| `DFATOOL_XGB_MAX_DEPTH` | 2 .. **10** .. *n* | Maximum XGBoost tree depth. XGBoost default: 6 |
-| `DFATOOL_XGB_SUBSAMPLE` | 0 .. **0.7** .. 1 | XGBoost subsampling ratio. XGBoost default: 1 |
-| `DFATOOL_XGB_ETA` | 0 .. **0.3** .. 1 | XGBoost learning rate (shrinkage). XGboost default: 0.3 |
-| `DFATOOL_XGB_GAMMA` | 0 .. **0.01** .. *n* | XGBoost minimum loss reduction required to to make a further partition on a leaf node. XGBoost default: 0 |
-| `DFATOOL_XGB_REG_ALPHA` | 0 .. **0.0006** .. *n* | XGBoost L1 regularization term on weights. |
-| `DFATOOL_XGB_REG_LAMBDA` | 0 .. **1** .. *n* | XGBoost L2 regularization term on weight. |
+| `DFATOOL_XGB_N_ESTIMATORS` | 1 .. **100** .. *n* | Number of estimators (i.e., trees) for XGBoost. |
+| `DFATOOL_XGB_MAX_DEPTH` | 2 .. **6** .. *n* | Maximum XGBoost tree depth. |
+| `DFATOOL_XGB_SUBSAMPLE` | 0.0 .. **1.0** | XGBoost subsampling ratio. |
+| `DFATOOL_XGB_ETA` | 0 .. **0.3** .. 1 | XGBoost learning rate (shrinkage). |
+| `DFATOOL_XGB_GAMMA` | **0.0** .. *n* | XGBoost minimum loss reduction required to make a further partition on a leaf node. |
+| `DFATOOL_XGB_REG_ALPHA` | **0.0** .. *n* | XGBoost L1 regularization term on weights. |
+| `DFATOOL_XGB_REG_LAMBDA` | 0 .. **1** .. *n* | XGBoost L2 regularization term on weights. |
| `OMP_NUM_THREADS` | *number of CPU cores* | Maximum number of threads used per XGBoost learner. A limit of 4 threads appears to be ideal. Note that dfatool may spawn several XGBoost instances at the same time. |
| `DFATOOL_KCONF_IGNORE_NUMERIC` | **0**, 1 | Ignore numeric (int/hex) configuration options. Useful for comparison with CART/DECART. |
| `DFATOOL_KCONF_IGNORE_STRING` | 0, **1** | Ignore string configuration options. These often hold compiler paths and other not really helpful information. |
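With the new defaults, leaving a `DFATOOL_XGB_*` variable unset keeps XGBoost's own default from the table above. As a minimal, illustrative sketch (variable names and defaults are taken from the table; the rest of the dfatool invocation is omitted), selected hyper-parameters can be overridden from Python before model generation:

    import os

    # Enable XGBoost-based decision forests and override a few
    # hyper-parameters; unset variables keep the XGBoost defaults.
    os.environ["DFATOOL_USE_XGBOOST"] = "1"
    os.environ["DFATOOL_XGB_MAX_DEPTH"] = "8"    # default: 6
    os.environ["DFATOOL_XGB_SUBSAMPLE"] = "0.8"  # default: 1.0
    os.environ["DFATOOL_XGB_ETA"] = "0.1"        # default: 0.3

Equivalently, the variables can be exported in the shell before running dfatool.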
diff --git a/lib/functions.py b/lib/functions.py
index 32f777b..d18477e 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -763,7 +763,12 @@ class XGBoostFunction(SKLearnRegressionFunction):
def hyper_to_dref(self):
return {
"xgb/n estimators": self.regressor.n_estimators,
- "xgb/max depth": self.regressor.max_depth,
+ "xgb/max depth": self.regressor.max_depth == 0
+ and "infty"
+ or self.regressor.max_depth,
+ "xgb/max leaves": self.regressor.max_leaves == 0
+ and "infty"
+ or self.regressor.max_leaves,
"xgb/subsample": self.regressor.subsample,
"xgb/eta": self.regressor.learning_rate,
"xgb/gamma": self.regressor.gamma,
diff --git a/lib/parameters.py b/lib/parameters.py
index 8627b72..86f2338 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -11,6 +11,7 @@ from .paramfit import ParamFit
from .utils import remove_indexes_from_tuple, is_numeric
from .utils import filter_aggregate_by_param, partition_by_param
from .utils import param_to_ndarray
+from .utils import soft_cast_int, soft_cast_float
logger = logging.getLogger(__name__)
@@ -1066,19 +1067,72 @@ class ModelAttribute:
if with_xgboost:
import xgboost
- # TODO retrieve parameters from env
# <https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn>
+ # <https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster>
# n_estimators := number of trees in forest
# max_depth := maximum tree depth
# eta <=> learning_rate
+
+ # n_estimators : Optional[int]
+ # Number of gradient boosted trees. Equivalent to number of boosting
+ # rounds.
+ # xgboost/sklearn.py: DEFAULT_N_ESTIMATORS = 100
+ n_estimators = int(os.getenv("DFATOOL_XGB_N_ESTIMATORS", "100"))
+
+ # max_depth : Optional[int] [default=6]
+ # Maximum tree depth for base learners.
+ # Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 indicates no limit on depth. Beware
+ # that XGBoost aggressively consumes memory when training a deep tree. The exact tree method requires a non-zero value.
+ # range: [0,∞]
+ max_depth = int(os.getenv("DFATOOL_XGB_MAX_DEPTH", "6"))
+
+ # max_leaves : [default=0]
+ # Maximum number of leaves; 0 indicates no limit.
+ # Maximum number of nodes to be added. Not used by exact tree method.
+ max_leaves = int(os.getenv("DFATOOL_XGB_MAX_LEAVES", "0"))
+
+ # learning_rate : Optional[float] [default=0.3]
+ # Boosting learning rate (xgb's "eta")
+ # Step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and eta
+ # shrinks the feature weights to make the boosting process more conservative.
+ # range: [0,1]
+ learning_rate = float(os.getenv("DFATOOL_XGB_ETA", "0.3"))
+
+ # gamma : Optional[float] [default=0]
+ # (min_split_loss) Minimum loss reduction required to make a further partition on a
+ # leaf node of the tree.
+ # Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
+ # range: [0,∞]
+ gamma = float(os.getenv("DFATOOL_XGB_GAMMA", "0"))
+
+ # subsample : Optional[float] [default=1]
+ # Subsample ratio of the training instance.
+ # Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing
+ # trees, and this will prevent overfitting. Subsampling will occur once in every boosting iteration.
+ # range: (0,1]
+ subsample = float(os.getenv("DFATOOL_XGB_SUBSAMPLE", "1"))
+
+ # reg_alpha : Optional[float] [default=0]
+ # L1 regularization term on weights (xgb's alpha).
+ # L1 regularization term on weights. Increasing this value will make model more conservative.
+ # range: [0, ∞]
+ reg_alpha = float(os.getenv("DFATOOL_XGB_REG_ALPHA", "0"))
+
+ # reg_lambda : Optional[float] [default=1]
+ # L2 regularization term on weights (xgb's lambda).
+ # L2 regularization term on weights. Increasing this value will make model more conservative.
+ # range: [0, ∞]
+ reg_lambda = float(os.getenv("DFATOOL_XGB_REG_LAMBDA", "1"))
+
xgb = xgboost.XGBRegressor(
- n_estimators=int(os.getenv("DFATOOL_XGB_N_ESTIMATORS", "100")),
- max_depth=int(os.getenv("DFATOOL_XGB_MAX_DEPTH", "10")),
- subsample=float(os.getenv("DFATOOL_XGB_SUBSAMPLE", "0.7")),
- learning_rate=float(os.getenv("DFATOOL_XGB_ETA", "0.3")),
- gamma=float(os.getenv("DFATOOL_XGB_GAMMA", "0.01")),
- reg_alpha=float(os.getenv("DFATOOL_XGB_REG_ALPHA", "0.0006")),
- reg_lambda=float(os.getenv("DFATOOL_XGB_REG_LAMBDA", "1")),
+ n_estimators=n_estimators,
+ max_depth=max_depth,
+ max_leaves=max_leaves,
+ subsample=subsample,
+ learning_rate=learning_rate,
+ gamma=gamma,
+ reg_alpha=reg_alpha,
+ reg_lambda=reg_lambda,
)
fit_parameters, category_to_index, ignore_index = param_to_ndarray(
parameters, with_nan=False, categorial_to_scalar=categorial_to_scalar
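For illustration (not part of the commit): with none of the `DFATOOL_XGB_*` variables set, the fallback strings above parse to XGBoost's documented defaults, so the constructed regressor is configured like a default `xgboost.XGBRegressor()`. A small check of the parsing step, assuming a clean environment:

    import os

    # Unset variables resolve to the documented XGBoost defaults.
    assert int(os.getenv("DFATOOL_XGB_N_ESTIMATORS", "100")) == 100
    assert int(os.getenv("DFATOOL_XGB_MAX_DEPTH", "6")) == 6
    assert int(os.getenv("DFATOOL_XGB_MAX_LEAVES", "0")) == 0
    assert float(os.getenv("DFATOOL_XGB_ETA", "0.3")) == 0.3
    assert float(os.getenv("DFATOOL_XGB_SUBSAMPLE", "1")) == 1.0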