-rw-r--r--  README.md         | 14
-rw-r--r--  lib/functions.py  |  7
-rw-r--r--  lib/parameters.py | 70
3 files changed, 75 insertions, 16 deletions
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -127,13 +127,13 @@ The following variables may be set to alter the behaviour of dfatool components.
 | `DFATOOL_ULS_MIN_DISTINCT_VALUES` | 2 .. **3** .. *n* | Minimum number of unique values a parameter must take to be eligible for ULS |
 | `DFATOOL_ULS_SKIP_CODEPENDENT_CHECK` | **0**, 1 | Do not detect and remove co-dependent features in ULS. |
 | `DFATOOL_USE_XGBOOST` | **0**, 1 | Use Extreme Gradient Boosting algorithm for decision forest generation. |
-| `DFATOOL_XGB_N_ESTIMATORS` | 1 .. **100** .. *n* | Number of estimators (i.e., trees) for XGBoost. Mandatory. |
-| `DFATOOL_XGB_MAX_DEPTH` | 2 .. **10** .. *n* | Maximum XGBoost tree depth. XGBoost default: 6 |
-| `DFATOOL_XGB_SUBSAMPLE` | 0 .. **0.7** .. 1 | XGBoost subsampling ratio. XGBoost default: 1 |
-| `DFATOOL_XGB_ETA` | 0 .. **0.3** .. 1 | XGBoost learning rate (shrinkage). XGboost default: 0.3 |
-| `DFATOOL_XGB_GAMMA` | 0 .. **0.01** .. *n* | XGBoost minimum loss reduction required to to make a further partition on a leaf node. XGBoost default: 0 |
-| `DFATOOL_XGB_REG_ALPHA` | 0 .. **0.0006** .. *n* | XGBoost L1 regularization term on weights. |
-| `DFATOOL_XGB_REG_LAMBDA` | 0 .. **1** .. *n* | XGBoost L2 regularization term on weight. |
+| `DFATOOL_XGB_N_ESTIMATORS` | 1 .. **100** .. *n* | Number of estimators (i.e., trees) for XGBoost. |
+| `DFATOOL_XGB_MAX_DEPTH` | 2 .. **6** .. *n* | Maximum XGBoost tree depth. |
+| `DFATOOL_XGB_SUBSAMPLE` | 0.0 .. **1.0** | XGBoost subsampling ratio. |
+| `DFATOOL_XGB_ETA` | 0 .. **0.3** .. 1 | XGBoost learning rate (shrinkage). |
+| `DFATOOL_XGB_GAMMA` | **0.0** .. *n* | XGBoost minimum loss reduction required to make a further partition on a leaf node. |
+| `DFATOOL_XGB_REG_ALPHA` | **0.0** .. *n* | XGBoost L1 regularization term on weights. |
+| `DFATOOL_XGB_REG_LAMBDA` | 0 .. **1** .. *n* | XGBoost L2 regularization term on weights. |
 | `OMP_NUM_THREADS` | *number of CPU cores* | Maximum number of threads used per XGBoost learner. A limit of 4 threads appears to be ideal. Note that dfatool may spawn several XGBoost instances at the same time. |
 | `DFATOOL_KCONF_IGNORE_NUMERIC` | **0**, 1 | Ignore numeric (int/hex) configuration options. Useful for comparison with CART/DECART. |
 | `DFATOOL_KCONF_IGNORE_STRING` | 0, **1** | Ignore string configuration options. These often hold compiler paths and other not really helpful information. |
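As a usage note (not part of the commit): these settings are plain environment variables which dfatool reads via `os.getenv()` during model generation, as the `lib/parameters.py` hunk further below shows. A minimal sketch of enabling XGBoost and overriding two of its knobs from Python, assuming the variables are set before dfatool builds its models; the chosen values are illustrative only:

```python
import os

# Sketch: select the XGBoost model generator and tweak two hyperparameters.
# The values below are examples, not recommendations from this commit.
os.environ["DFATOOL_USE_XGBOOST"] = "1"    # enable Extreme Gradient Boosting
os.environ["DFATOOL_XGB_MAX_DEPTH"] = "8"  # deeper trees than the default of 6
os.environ["DFATOOL_XGB_ETA"] = "0.1"      # lower learning rate than the default 0.3
os.environ["OMP_NUM_THREADS"] = "4"        # cap threads per XGBoost learner

# ... then run dfatool's model generation as usual; the os.getenv() calls
# in lib/parameters.py (see the hunk below) pick these values up.
```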
diff --git a/lib/functions.py b/lib/functions.py
index 32f777b..d18477e 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -763,7 +763,12 @@ class XGBoostFunction(SKLearnRegressionFunction):
     def hyper_to_dref(self):
         return {
             "xgb/n estimators": self.regressor.n_estimators,
-            "xgb/max depth": self.regressor.max_depth,
+            "xgb/max depth": self.regressor.max_depth == 0
+            and "infty"
+            or self.regressor.max_depth,
+            "xgb/max leaves": self.regressor.max_leaves == 0
+            and "infty"
+            or self.regressor.max_leaves,
             "xgb/subsample": self.regressor.subsample,
             "xgb/eta": self.regressor.learning_rate,
             "xgb/gamma": self.regressor.gamma,
diff --git a/lib/parameters.py b/lib/parameters.py
index 8627b72..86f2338 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -11,6 +11,7 @@ from .paramfit import ParamFit
 from .utils import remove_indexes_from_tuple, is_numeric
 from .utils import filter_aggregate_by_param, partition_by_param
 from .utils import param_to_ndarray
+from .utils import soft_cast_int, soft_cast_float
 
 logger = logging.getLogger(__name__)
 
@@ -1066,19 +1067,72 @@ class ModelAttribute:
         if with_xgboost:
             import xgboost
 
-            # TODO retrieve parameters from env
             # <https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn>
+            # <https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster>
             # n_estimators := number of trees in forest
             # max_depth := maximum tree depth
             # eta <=> learning_rate
+
+            # n_estimators : Optional[int]
+            # Number of gradient boosted trees. Equivalent to number of boosting
+            # rounds.
+            # xgboost/sklearn.py: DEFAULT_N_ESTIMATORS = 100
+            n_estimators = int(os.getenv("DFATOOL_XGB_N_ESTIMATORS", "100"))
+
+            # max_depth : Optional[int] [default=6]
+            # Maximum tree depth for base learners.
+            # Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 indicates no limit on depth. Beware
+            # that XGBoost aggressively consumes memory when training a deep tree. exact tree method requires non-zero value.
+            # range: [0,∞]
+            max_depth = int(os.getenv("DFATOOL_XGB_MAX_DEPTH", "6"))
+
+            # max_leaves : [default=0]
+            # Maximum number of leaves; 0 indicates no limit.
+            # Maximum number of nodes to be added. Not used by exact tree method.
+            max_leaves = int(os.getenv("DFATOOL_XGB_MAX_LEAVES", "0"))
+
+            # learning_rate : Optional[float] [default=0.3]
+            # Boosting learning rate (xgb's "eta")
+            # Step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features, and eta
+            # shrinks the feature weights to make the boosting process more conservative.
+            # range: [0,1]
+            learning_rate = float(os.getenv("DFATOOL_XGB_ETA", "0.3"))
+
+            # gamma : Optional[float] [default=0]
+            # (min_split_loss) Minimum loss reduction required to make a further partition on a
+            # leaf node of the tree.
+            # Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
+            # range: [0,∞]
+            gamma = float(os.getenv("DFATOOL_XGB_GAMMA", "0"))
+
+            # subsample : Optional[float] [default=1]
+            # Subsample ratio of the training instance.
+            # Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing
+            # trees. and this will prevent overfitting. Subsampling will occur once in every boosting iteration.
+            # range: (0,1]
+            subsample = float(os.getenv("DFATOOL_XGB_SUBSAMPLE", "1"))
+
+            # reg_alpha : Optional[float] [default=0]
+            # L1 regularization term on weights (xgb's alpha).
+            # L1 regularization term on weights. Increasing this value will make model more conservative.
+            # range: [0, ∞]
+            reg_alpha = float(os.getenv("DFATOOL_XGB_REG_ALPHA", "0"))
+
+            # reg_lambda : Optional[float] [default=1]
+            # L2 regularization term on weights (xgb's lambda).
+            # L2 regularization term on weights. Increasing this value will make model more conservative.
+            # range: [0, ∞]
+            reg_lambda = float(os.getenv("DFATOOL_XGB_REG_LAMBDA", "1"))
+
             xgb = xgboost.XGBRegressor(
-                n_estimators=int(os.getenv("DFATOOL_XGB_N_ESTIMATORS", "100")),
-                max_depth=int(os.getenv("DFATOOL_XGB_MAX_DEPTH", "10")),
-                subsample=float(os.getenv("DFATOOL_XGB_SUBSAMPLE", "0.7")),
-                learning_rate=float(os.getenv("DFATOOL_XGB_ETA", "0.3")),
-                gamma=float(os.getenv("DFATOOL_XGB_GAMMA", "0.01")),
-                reg_alpha=float(os.getenv("DFATOOL_XGB_REG_ALPHA", "0.0006")),
-                reg_lambda=float(os.getenv("DFATOOL_XGB_REG_LAMBDA", "1")),
+                n_estimators=n_estimators,
+                max_depth=max_depth,
+                max_leaves=max_leaves,
+                subsample=subsample,
+                learning_rate=learning_rate,
+                gamma=gamma,
+                reg_alpha=reg_alpha,
+                reg_lambda=reg_lambda,
             )
             fit_parameters, category_to_index, ignore_index = param_to_ndarray(
                 parameters, with_nan=False, categorial_to_scalar=categorial_to_scalar
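A side note on the `hyper_to_dref()` change in `lib/functions.py` above: it relies on Python's old `cond and a or b` idiom, which only behaves like a conditional expression here because the string `"infty"` is truthy. A self-contained sketch of the behaviour; `FakeRegressor` is made up for illustration, dfatool wraps a real `xgboost.XGBRegressor`:

```python
# `x == 0 and "infty" or x` maps 0 (XGBoost's "no limit" sentinel for
# max_depth / max_leaves) to the string "infty" and passes any other
# value through -- equivalent to `"infty" if x == 0 else x`.
class FakeRegressor:
    def __init__(self, max_depth=6, max_leaves=0):
        self.max_depth = max_depth
        self.max_leaves = max_leaves


def dref_max_depth(regressor):
    return regressor.max_depth == 0 and "infty" or regressor.max_depth


assert dref_max_depth(FakeRegressor(max_depth=0)) == "infty"  # unlimited depth
assert dref_max_depth(FakeRegressor(max_depth=6)) == 6        # finite depth unchanged
```

The same pattern covers `max_leaves`, whose default of 0 likewise means "no limit".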