author | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-02-21 09:46:33 +0100
---|---|---
committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-02-21 09:46:33 +0100
commit | 3d19e24370798f37d8119b4366b8486d67ed3110 (patch) |
tree | 2dd3374ca8fb3ce69174813ef46123c4b22a935e /lib/functions.py |
parent | ddf6e00b3b16a07b994107a79450909f43588445 (diff) |
Move CART/LMT/XGB fitting into the respective modules
Diffstat (limited to 'lib/functions.py')
-rw-r--r-- | lib/functions.py | 246
1 file changed, 236 insertions, 10 deletions
diff --git a/lib/functions.py b/lib/functions.py
index d0ef7e2..501970e 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -590,7 +590,7 @@ class SKLearnRegressionFunction(ModelFunction):
     always_predictable = True
     has_eval_arr = True
 
-    def __init__(self, value, regressor, categorical_to_index, ignore_index, **kwargs):
+    def __init__(self, value, **kwargs):
         # Needed for JSON export
         self.param_names = kwargs.pop("param_names")
         self.arg_count = kwargs.pop("arg_count")
@@ -600,10 +600,12 @@ class SKLearnRegressionFunction(ModelFunction):
 
         super().__init__(value, **kwargs)
 
-        self.regressor = regressor
-        self.categorical_to_index = categorical_to_index
-        self.ignore_index = ignore_index
+        self.categorical_to_scalar = bool(
+            int(os.getenv("DFATOOL_PARAM_CATEGORICAL_TO_SCALAR", "0"))
+        )
+        self.fit_success = None
 
+    def _build_feature_names(self):
         # SKLearnRegressionFunction descendants use self.param_names \ self.ignore_index as features.
         # Thus, model feature indexes ≠ self.param_names indexes.
         # self.feature_names accounts for this and allows mapping feature indexes back to parameter names / parameter indexes.
@@ -629,13 +631,11 @@ class SKLearnRegressionFunction(ModelFunction):
             )
         )
 
-    def is_predictable(self, param_list=None):
-        """
-        Return whether the model function can be evaluated on the given parameter values.
+    def fit(self, param_values, data, ignore_param_indexes=None):
+        raise NotImplementedError
 
-        For a StaticFunction, this is always the case (i.e., this function always returns true).
-        """
-        return True
+    def is_predictable(self, param_list=None):
+        return self.fit_success
 
     def eval(self, param_list=None):
         """
@@ -714,6 +714,52 @@ class SKLearnRegressionFunction(ModelFunction):
 
 
 class CARTFunction(SKLearnRegressionFunction):
+    def __init__(self, value, decart=False, **kwargs):
+        self.decart = decart
+        super().__init__(value, **kwargs)
+
+    def fit(self, param_values, data, scalar_param_indexes=None):
+
+        max_depth = int(os.getenv("DFATOOL_CART_MAX_DEPTH", "0"))
+        if max_depth == 0:
+            max_depth = None
+
+        if self.decart:
+            fit_parameters, self.categorical_to_index, self.ignore_index = (
+                param_to_ndarray(
+                    param_values,
+                    with_nan=False,
+                    categorical_to_scalar=self.categorical_to_scalar,
+                    ignore_indexes=scalar_param_indexes,
+                )
+            )
+        else:
+            fit_parameters, self.categorical_to_index, self.ignore_index = (
+                param_to_ndarray(
+                    param_values,
+                    with_nan=False,
+                    categorical_to_scalar=self.categorical_to_scalar,
+                )
+            )
+
+        if fit_parameters.shape[1] == 0:
+            logger.warning(
+                f"Cannot generate CART due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape is {fit_parameters.shape}"
+            )
+            self.fit_success = False
+            return self
+
+        logger.debug("Fitting sklearn CART ...")
+        from sklearn.tree import DecisionTreeRegressor
+
+        self.regressor = DecisionTreeRegressor(max_depth=max_depth)
+        self.regressor.fit(fit_parameters, data)
+        logger.debug("Fitted sklearn CART")
+
+        self.fit_success = True
+        self._build_feature_names()
+        return self
+
     def get_number_of_nodes(self):
         return self.regressor.tree_.node_count
 
@@ -800,6 +846,91 @@ class CARTFunction(SKLearnRegressionFunction):
 
 
 class LMTFunction(SKLearnRegressionFunction):
+
+    def fit(self, param_values, data):
+        # max_depth : int, default=5
+        #     The maximum depth of the tree considering only the splitting nodes.
+        #     A higher value implies a higher training time.
+        max_depth = int(os.getenv("DFATOOL_LMT_MAX_DEPTH", "5"))
+
+        # min_samples_split : int or float, default=6
+        #     The minimum number of samples required to split an internal node.
+        #     The minimum valid number of samples in each node is 6.
+        #     A lower value implies a higher training time.
+        #     - If int, then consider `min_samples_split` as the minimum number.
+        #     - If float, then `min_samples_split` is a fraction and
+        #       `ceil(min_samples_split * n_samples)` are the minimum
+        #       number of samples for each split.
+        if "." in os.getenv("DFATOOL_LMT_MIN_SAMPLES_SPLIT", ""):
+            min_samples_split = float(os.getenv("DFATOOL_LMT_MIN_SAMPLES_SPLIT"))
+        else:
+            min_samples_split = int(os.getenv("DFATOOL_LMT_MIN_SAMPLES_SPLIT", "6"))
+
+        # min_samples_leaf : int or float, default=0.1
+        #     The minimum number of samples required to be at a leaf node.
+        #     A split point at any depth will only be considered if it leaves at
+        #     least `min_samples_leaf` training samples in each of the left and
+        #     right branches.
+        #     The minimum valid number of samples in each leaf is 3.
+        #     A lower value implies a higher training time.
+        #     - If int, then consider `min_samples_leaf` as the minimum number.
+        #     - If float, then `min_samples_leaf` is a fraction and
+        #       `ceil(min_samples_leaf * n_samples)` are the minimum
+        #       number of samples for each node.
+        if "." in os.getenv("DFATOOL_LMT_MIN_SAMPLES_LEAF", "0.1"):
+            min_samples_leaf = float(os.getenv("DFATOOL_LMT_MIN_SAMPLES_LEAF", "0.1"))
+        else:
+            min_samples_leaf = int(os.getenv("DFATOOL_LMT_MIN_SAMPLES_LEAF"))
+
+        # max_bins : int, default=25
+        #     The maximum number of bins to use to search the optimal split in each
+        #     feature. Features with a small number of unique values may use less than
+        #     ``max_bins`` bins. Must be lower than 120 and larger than 10.
+        #     A higher value implies a higher training time.
+        max_bins = int(os.getenv("DFATOOL_LMT_MAX_BINS", "120"))
+
+        # criterion : {"mse", "rmse", "mae", "poisson"}, default="mse"
+        #     The function to measure the quality of a split. "poisson"
+        #     requires ``y >= 0``.
+        criterion = os.getenv("DFATOOL_LMT_CRITERION", "mse")
+
+        from sklearn.linear_model import LinearRegression
+        from dfatool.lineartree import LinearTreeRegressor
+
+        lmt = LinearTreeRegressor(
+            base_estimator=LinearRegression(),
+            max_depth=max_depth,
+            min_samples_split=min_samples_split,
+            min_samples_leaf=min_samples_leaf,
+            max_bins=max_bins,
+            criterion=criterion,
+        )
+        fit_parameters, self.categorical_to_index, self.ignore_index = param_to_ndarray(
+            param_values,
+            with_nan=False,
+            categorical_to_scalar=self.categorical_to_scalar,
+        )
+        if fit_parameters.shape[1] == 0:
+            logger.warning(
+                f"Cannot generate LMT due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape is {fit_parameters.shape}"
+            )
+            self.fit_success = False
+            return self
+
+        logger.debug("Fitting LMT ...")
+        try:
+            lmt.fit(fit_parameters, data)
+        except np.linalg.LinAlgError as e:
+            logger.error(f"LMT generation failed: {e}")
+            self.fit_success = False
+            return
+        logger.debug("Fitted LMT")
+
+        self.regressor = lmt
+        self.fit_success = True
+        self._build_feature_names()
+        return self
+
     def get_number_of_nodes(self):
         return self.regressor.node_count
 
@@ -861,6 +992,101 @@ class LMTFunction(SKLearnRegressionFunction):
 
 
 class XGBoostFunction(SKLearnRegressionFunction):
+
+    def fit(self, param_values, data):
+
+        # <https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn>
+        # <https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster>
+        # n_estimators := number of trees in forest
+        # max_depth := maximum tree depth
+        # eta <=> learning_rate
+
+        # n_estimators : Optional[int]
+        #     Number of gradient boosted trees. Equivalent to number of boosting
+        #     rounds.
+        # xgboost/sklearn.py: DEFAULT_N_ESTIMATORS = 100
+        n_estimators = int(os.getenv("DFATOOL_XGB_N_ESTIMATORS", "100"))
+
+        # max_depth : Optional[int] [default=6]
+        #     Maximum tree depth for base learners.
+        # Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 indicates no limit on depth. Beware
+        # that XGBoost aggressively consumes memory when training a deep tree. exact tree method requires non-zero value.
+        # range: [0,∞]
+        max_depth = int(os.getenv("DFATOOL_XGB_MAX_DEPTH", "6"))
+
+        # max_leaves : [default=0]
+        #     Maximum number of leaves; 0 indicates no limit.
+        # Maximum number of nodes to be added. Not used by exact tree method.
+        max_leaves = int(os.getenv("DFATOOL_XGB_MAX_LEAVES", "0"))
+
+        # learning_rate : Optional[float] [default=0.3]
+        #     Boosting learning rate (xgb's "eta")
+        # Step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features, and eta
+        # shrinks the feature weights to make the boosting process more conservative.
+        # range: [0,1]
+        learning_rate = float(os.getenv("DFATOOL_XGB_ETA", "0.3"))
+
+        # gamma : Optional[float] [default=0]
+        #     (min_split_loss) Minimum loss reduction required to make a further partition on a
+        #     leaf node of the tree.
+        # Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
+        # range: [0,∞]
+        gamma = float(os.getenv("DFATOOL_XGB_GAMMA", "0"))
+
+        # subsample : Optional[float] [default=1]
+        #     Subsample ratio of the training instance.
+        # Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing
+        # trees. and this will prevent overfitting. Subsampling will occur once in every boosting iteration.
+        # range: (0,1]
+        subsample = float(os.getenv("DFATOOL_XGB_SUBSAMPLE", "1"))
+
+        # reg_alpha : Optional[float] [default=0]
+        #     L1 regularization term on weights (xgb's alpha).
+        # L1 regularization term on weights. Increasing this value will make model more conservative.
+        # range: [0, ∞]
+        reg_alpha = float(os.getenv("DFATOOL_XGB_REG_ALPHA", "0"))
+
+        # reg_lambda : Optional[float] [default=1]
+        #     L2 regularization term on weights (xgb's lambda).
+        # L2 regularization term on weights. Increasing this value will make model more conservative.
+        # range: [0, ∞]
+        reg_lambda = float(os.getenv("DFATOOL_XGB_REG_LAMBDA", "1"))
+
+        fit_parameters, self.categorical_to_index, self.ignore_index = param_to_ndarray(
+            param_values,
+            with_nan=False,
+            categorical_to_scalar=self.categorical_to_scalar,
+        )
+        if fit_parameters.shape[1] == 0:
+            logger.warning(
+                f"Cannot run XGBoost due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape is {fit_parameters.shape}"
+            )
+            self.fit_success = False
+            return self
+
+        import xgboost
+
+        xgb = xgboost.XGBRegressor(
+            n_estimators=n_estimators,
+            max_depth=max_depth,
+            max_leaves=max_leaves,
+            subsample=subsample,
+            learning_rate=learning_rate,
+            gamma=gamma,
+            reg_alpha=reg_alpha,
+            reg_lambda=reg_lambda,
+        )
+        xgb.fit(fit_parameters, np.reshape(data, (-1, 1)))
+        self.fit_success = True
+        self.regressor = xgb
+        self._build_feature_names()
+
+        if output_filename := os.getenv("DFATOOL_XGB_DUMP_MODEL", None):
+            xgb.get_booster().dump_model(
+                output_filename, dump_format="json", with_stats=True
+            )
+        return self
+
     def to_json(self, internal=False, **kwargs):
         import json
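
Taken together, the commit moves SKLearnRegressionFunction and its descendants from being constructed around an already-fitted regressor to a two-step construct-then-fit interface: the constructor now only stores configuration and sets `fit_success = None`, while the new per-class `fit()` methods build the regressor, set `fit_success`, and populate the feature names. The sketch below illustrates how a caller might use this interface; the import path, parameter names, values, and measurements are invented for illustration and are not part of the commit, and the base class may expect further constructor kwargs not shown here.

```python
# Hypothetical usage sketch of the fit()-based interface introduced by this commit.
# Parameter names, values, and measurements are made-up example data; only the
# __init__ / fit() / is_predictable() / eval() calls mirror the diff above.
from dfatool.functions import CARTFunction

# One parameter tuple per observation, plus the corresponding measurements.
param_values = [[3.3, 10], [3.3, 20], [1.8, 10], [1.8, 20]]
data = [90.0, 110.0, 70.0, 95.0]

cart = CARTFunction(
    100.0,  # static fallback value
    param_names=["voltage", "txpower"],
    arg_count=0,
)
cart.fit(param_values, data)

# fit() records success in fit_success instead of raising, so callers are
# expected to check is_predictable() before evaluating the model.
if cart.is_predictable():
    print(cart.eval([3.3, 20]))
```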