Diffstat (limited to 'ext/lightgbm/sklearn.py')
-rw-r--r--  ext/lightgbm/sklearn.py  1370
1 files changed, 1370 insertions, 0 deletions
diff --git a/ext/lightgbm/sklearn.py b/ext/lightgbm/sklearn.py
new file mode 100644
index 0000000..120a666
--- /dev/null
+++ b/ext/lightgbm/sklearn.py
@@ -0,0 +1,1370 @@
+# coding: utf-8
+"""Scikit-learn wrapper interface for LightGBM."""
+import copy
+from inspect import signature
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import scipy.sparse
+
+from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
+ _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
+ _LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning)
+from .callback import _EvalResultDict, record_evaluation
+from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
+ _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
+ _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
+ dt_DataTable, np_random_Generator, pd_DataFrame)
+from .engine import train
+
+__all__ = [
+ 'LGBMClassifier',
+ 'LGBMModel',
+ 'LGBMRanker',
+ 'LGBMRegressor',
+]
+
+_LGBM_ScikitMatrixLike = Union[
+ dt_DataTable,
+ List[Union[List[float], List[int]]],
+ np.ndarray,
+ pd_DataFrame,
+ scipy.sparse.spmatrix
+]
+_LGBM_ScikitCustomObjectiveFunction = Union[
+ # f(labels, preds)
+ Callable[
+ [Optional[np.ndarray], np.ndarray],
+ Tuple[np.ndarray, np.ndarray]
+ ],
+ # f(labels, preds, weights)
+ Callable[
+ [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
+ Tuple[np.ndarray, np.ndarray]
+ ],
+ # f(labels, preds, weights, group)
+ Callable[
+ [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
+ Tuple[np.ndarray, np.ndarray]
+ ],
+]
+_LGBM_ScikitCustomEvalFunction = Union[
+ # f(labels, preds)
+ Callable[
+ [Optional[np.ndarray], np.ndarray],
+ _LGBM_EvalFunctionResultType
+ ],
+ Callable[
+ [Optional[np.ndarray], np.ndarray],
+ List[_LGBM_EvalFunctionResultType]
+ ],
+ # f(labels, preds, weights)
+ Callable[
+ [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
+ _LGBM_EvalFunctionResultType
+ ],
+ Callable[
+ [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
+ List[_LGBM_EvalFunctionResultType]
+ ],
+ # f(labels, preds, weights, group)
+ Callable[
+ [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
+ _LGBM_EvalFunctionResultType
+ ],
+ Callable[
+ [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
+ List[_LGBM_EvalFunctionResultType]
+ ]
+]
+_LGBM_ScikitEvalMetricType = Union[
+ str,
+ _LGBM_ScikitCustomEvalFunction,
+ List[Union[str, _LGBM_ScikitCustomEvalFunction]]
+]
+_LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType]
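+
+# Illustrative note (not part of the library): ``_LGBM_ScikitMatrixLike`` covers every
+# input format the estimators accept for ``X``, e.g. (hypothetical data)
+#
+#     X_np = np.array([[1.0, 2.0], [3.0, 4.0]])    # numpy array
+#     X_sp = scipy.sparse.csr_matrix(X_np)          # scipy sparse matrix
+#     X_ls = [[1.0, 2.0], [3.0, 4.0]]               # list of lists of float
+#
+# and a ``_LGBM_ScikitValidSet`` is simply an ``(X, y)`` tuple such as ``(X_np, [0, 1])``.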
+
+
+def _get_group_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarray]:
+ group = dataset.get_group()
+ error_msg = (
+ "Estimators in lightgbm.sklearn should only retrieve query groups from a constructed Dataset. "
+ "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues."
+ )
+ assert (group is None or isinstance(group, np.ndarray)), error_msg
+ return group
+
+
+def _get_label_from_constructed_dataset(dataset: Dataset) -> np.ndarray:
+ label = dataset.get_label()
+ error_msg = (
+ "Estimators in lightgbm.sklearn should only retrieve labels from a constructed Dataset. "
+ "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues."
+ )
+ assert isinstance(label, np.ndarray), error_msg
+ return label
+
+
+def _get_weight_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarray]:
+ weight = dataset.get_weight()
+ error_msg = (
+ "Estimators in lightgbm.sklearn should only retrieve weights from a constructed Dataset. "
+ "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues."
+ )
+ assert (weight is None or isinstance(weight, np.ndarray)), error_msg
+ return weight
+
+
+class _ObjectiveFunctionWrapper:
+ """Proxy class for objective function."""
+
+ def __init__(self, func: _LGBM_ScikitCustomObjectiveFunction):
+ """Construct a proxy class.
+
+        This class wraps an objective function so that it matches the signature ``new_func(preds, dataset)``
+        expected by ``lightgbm.engine.train``.
+
+ Parameters
+ ----------
+ func : callable
+            Expects a callable with the following signatures:
+ ``func(y_true, y_pred)``,
+ ``func(y_true, y_pred, weight)``
+ or ``func(y_true, y_pred, weight, group)``
+ and returns (grad, hess):
+
+ y_true : numpy 1-D array of shape = [n_samples]
+ The target values.
+ y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The predicted values.
+ Predicted values are returned before any transformation,
+ e.g. they are raw margin instead of probability of positive class for binary task.
+ weight : numpy 1-D array of shape = [n_samples]
+ The weight of samples. Weights should be non-negative.
+ group : numpy 1-D array
+ Group/query data.
+ Only used in the learning-to-rank task.
+ sum(group) = n_samples.
+ For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+ where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+            grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The value of the first order derivative (gradient) of the loss
+ with respect to the elements of y_pred for each sample point.
+ hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The value of the second order derivative (Hessian) of the loss
+ with respect to the elements of y_pred for each sample point.
+
+ .. note::
+
+ For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
+ and grad and hess should be returned in the same format.
+ """
+ self.func = func
+
+ def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np.ndarray]:
+ """Call passed function with appropriate arguments.
+
+ Parameters
+ ----------
+ preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The predicted values.
+ dataset : Dataset
+ The training dataset.
+
+ Returns
+ -------
+ grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The value of the first order derivative (gradient) of the loss
+ with respect to the elements of preds for each sample point.
+ hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The value of the second order derivative (Hessian) of the loss
+ with respect to the elements of preds for each sample point.
+ """
+ labels = _get_label_from_constructed_dataset(dataset)
+ argc = len(signature(self.func).parameters)
+ if argc == 2:
+ grad, hess = self.func(labels, preds) # type: ignore[call-arg]
+ return grad, hess
+
+ weight = _get_weight_from_constructed_dataset(dataset)
+ if argc == 3:
+ grad, hess = self.func(labels, preds, weight) # type: ignore[call-arg]
+ return grad, hess
+
+ if argc == 4:
+ group = _get_group_from_constructed_dataset(dataset)
+ return self.func(labels, preds, weight, group) # type: ignore[call-arg]
+
+ raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}")
+
+
+class _EvalFunctionWrapper:
+ """Proxy class for evaluation function."""
+
+ def __init__(self, func: _LGBM_ScikitCustomEvalFunction):
+ """Construct a proxy class.
+
+        This class wraps an evaluation function so that it matches the signature ``new_func(preds, dataset)``
+        expected by ``lightgbm.engine.train``.
+
+ Parameters
+ ----------
+ func : callable
+            Expects a callable with the following signatures:
+ ``func(y_true, y_pred)``,
+ ``func(y_true, y_pred, weight)``
+ or ``func(y_true, y_pred, weight, group)``
+ and returns (eval_name, eval_result, is_higher_better) or
+ list of (eval_name, eval_result, is_higher_better):
+
+ y_true : numpy 1-D array of shape = [n_samples]
+ The target values.
+            y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The predicted values.
+ In case of custom ``objective``, predicted values are returned before any transformation,
+ e.g. they are raw margin instead of probability of positive class for binary task in this case.
+ weight : numpy 1-D array of shape = [n_samples]
+ The weight of samples. Weights should be non-negative.
+ group : numpy 1-D array
+ Group/query data.
+ Only used in the learning-to-rank task.
+ sum(group) = n_samples.
+ For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+ where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+ eval_name : str
+ The name of evaluation function (without whitespace).
+ eval_result : float
+ The eval result.
+ is_higher_better : bool
+ Is eval result higher better, e.g. AUC is ``is_higher_better``.
+ """
+ self.func = func
+
+ def __call__(
+ self,
+ preds: np.ndarray,
+ dataset: Dataset
+ ) -> Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]:
+ """Call passed function with appropriate arguments.
+
+ Parameters
+ ----------
+ preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The predicted values.
+ dataset : Dataset
+ The training dataset.
+
+ Returns
+ -------
+ eval_name : str
+ The name of evaluation function (without whitespace).
+ eval_result : float
+ The eval result.
+ is_higher_better : bool
+ Is eval result higher better, e.g. AUC is ``is_higher_better``.
+ """
+ labels = _get_label_from_constructed_dataset(dataset)
+ argc = len(signature(self.func).parameters)
+ if argc == 2:
+ return self.func(labels, preds) # type: ignore[call-arg]
+
+ weight = _get_weight_from_constructed_dataset(dataset)
+ if argc == 3:
+ return self.func(labels, preds, weight) # type: ignore[call-arg]
+
+ if argc == 4:
+ group = _get_group_from_constructed_dataset(dataset)
+ return self.func(labels, preds, weight, group) # type: ignore[call-arg]
+
+ raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}")
+
+
+# documentation templates for LGBMModel methods are shared between the classes in
+# this module and those in the ``dask`` module
+
+_lgbmmodel_doc_fit = (
+ """
+ Build a gradient boosting model from the training set (X, y).
+
+ Parameters
+ ----------
+ X : {X_shape}
+ Input feature matrix.
+ y : {y_shape}
+ The target values (class labels in classification, real numbers in regression).
+ sample_weight : {sample_weight_shape}
+ Weights of training data. Weights should be non-negative.
+ init_score : {init_score_shape}
+ Init score of training data.
+ group : {group_shape}
+ Group/query data.
+ Only used in the learning-to-rank task.
+ sum(group) = n_samples.
+ For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+ where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+ eval_set : list or None, optional (default=None)
+ A list of (X, y) tuple pairs to use as validation sets.
+ eval_names : list of str, or None, optional (default=None)
+ Names of eval_set.
+ eval_sample_weight : {eval_sample_weight_shape}
+ Weights of eval data. Weights should be non-negative.
+ eval_class_weight : list or None, optional (default=None)
+ Class weights of eval data.
+ eval_init_score : {eval_init_score_shape}
+ Init score of eval data.
+ eval_group : {eval_group_shape}
+ Group data of eval data.
+ eval_metric : str, callable, list or None, optional (default=None)
+ If str, it should be a built-in evaluation metric to use.
+ If callable, it should be a custom evaluation metric, see note below for more details.
+ If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both.
+ In either case, the ``metric`` from the model parameters will be evaluated and used as well.
+ Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker.
+ feature_name : list of str, or 'auto', optional (default='auto')
+ Feature names.
+ If 'auto' and data is pandas DataFrame, data columns names are used.
+ categorical_feature : list of str or int, or 'auto', optional (default='auto')
+ Categorical features.
+ If list of int, interpreted as indices.
+ If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
+ If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
+ All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
+ Large values could be memory consuming. Consider using consecutive integers starting from zero.
+ All negative values in categorical features will be treated as missing values.
+ The output cannot be monotonically constrained with respect to a categorical feature.
+ Floating point numbers in categorical features will be rounded towards 0.
+ callbacks : list of callable, or None, optional (default=None)
+ List of callback functions that are applied at each iteration.
+ See Callbacks in Python API for more information.
+ init_model : str, pathlib.Path, Booster, LGBMModel or None, optional (default=None)
+ Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training.
+
+ Returns
+ -------
+ self : LGBMModel
+ Returns self.
+ """
+)
+
+_lgbmmodel_doc_custom_eval_note = """
+ Note
+ ----
+    Custom eval function expects a callable with the following signatures:
+ ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
+ ``func(y_true, y_pred, weight, group)``
+ and returns (eval_name, eval_result, is_higher_better) or
+ list of (eval_name, eval_result, is_higher_better):
+
+ y_true : numpy 1-D array of shape = [n_samples]
+ The target values.
+ y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The predicted values.
+ In case of custom ``objective``, predicted values are returned before any transformation,
+ e.g. they are raw margin instead of probability of positive class for binary task in this case.
+ weight : numpy 1-D array of shape = [n_samples]
+ The weight of samples. Weights should be non-negative.
+ group : numpy 1-D array
+ Group/query data.
+ Only used in the learning-to-rank task.
+ sum(group) = n_samples.
+ For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+ where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+ eval_name : str
+ The name of evaluation function (without whitespace).
+ eval_result : float
+ The eval result.
+ is_higher_better : bool
+ Is eval result higher better, e.g. AUC is ``is_higher_better``.
+"""
+
+_lgbmmodel_doc_predict = (
+ """
+ {description}
+
+ Parameters
+ ----------
+ X : {X_shape}
+ Input features matrix.
+ raw_score : bool, optional (default=False)
+ Whether to predict raw scores.
+ start_iteration : int, optional (default=0)
+ Start index of the iteration to predict.
+ If <= 0, starts from the first iteration.
+ num_iteration : int or None, optional (default=None)
+ Total number of iterations used in the prediction.
+ If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
+ otherwise, all iterations from ``start_iteration`` are used (no limits).
+ If <= 0, all iterations from ``start_iteration`` are used (no limits).
+ pred_leaf : bool, optional (default=False)
+ Whether to predict leaf index.
+ pred_contrib : bool, optional (default=False)
+ Whether to predict feature contributions.
+
+ .. note::
+
+ If you want to get more explanations for your model's predictions using SHAP values,
+ like SHAP interaction values,
+ you can install the shap package (https://github.com/slundberg/shap).
+ Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
+ column, where the last column is the expected value.
+
+ validate_features : bool, optional (default=False)
+ If True, ensure that the features used to predict match the ones used to train.
+ Used only if data is pandas DataFrame.
+ **kwargs
+ Other parameters for the prediction.
+
+ Returns
+ -------
+ {output_name} : {predicted_result_shape}
+ The predicted values.
+ X_leaves : {X_leaves_shape}
+ If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
+ X_SHAP_values : {X_SHAP_values_shape}
+ If ``pred_contrib=True``, the feature contributions for each sample.
+ """
+)
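+
+# Illustrative note (not part of the library): for a fitted regressor ``reg`` trained on
+# 10 features (hypothetical), the flags documented above change the output shape:
+#
+#     reg.predict(X)                      # predicted values, shape = [n_samples]
+#     reg.predict(X, pred_leaf=True)      # leaf indices, shape = [n_samples, n_trees]
+#     reg.predict(X, pred_contrib=True)   # feature contributions, shape = [n_samples, 11]
+#                                         # (n_features + 1; last column is the expected value)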
+
+
+class LGBMModel(_LGBMModelBase):
+ """Implementation of the scikit-learn API for LightGBM."""
+
+ def __init__(
+ self,
+ boosting_type: str = 'gbdt',
+ num_leaves: int = 31,
+ max_depth: int = -1,
+ learning_rate: float = 0.1,
+ n_estimators: int = 100,
+ subsample_for_bin: int = 200000,
+ objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
+ class_weight: Optional[Union[Dict, str]] = None,
+ min_split_gain: float = 0.,
+ min_child_weight: float = 1e-3,
+ min_child_samples: int = 20,
+ subsample: float = 1.,
+ subsample_freq: int = 0,
+ colsample_bytree: float = 1.,
+ reg_alpha: float = 0.,
+ reg_lambda: float = 0.,
+ random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None,
+ n_jobs: Optional[int] = None,
+ importance_type: str = 'split',
+ **kwargs
+ ):
+ r"""Construct a gradient boosting model.
+
+ Parameters
+ ----------
+ boosting_type : str, optional (default='gbdt')
+ 'gbdt', traditional Gradient Boosting Decision Tree.
+ 'dart', Dropouts meet Multiple Additive Regression Trees.
+ 'rf', Random Forest.
+ num_leaves : int, optional (default=31)
+ Maximum tree leaves for base learners.
+ max_depth : int, optional (default=-1)
+ Maximum tree depth for base learners, <=0 means no limit.
+ learning_rate : float, optional (default=0.1)
+ Boosting learning rate.
+ You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
+ in training using ``reset_parameter`` callback.
+            Note that this will ignore the ``learning_rate`` argument in training.
+ n_estimators : int, optional (default=100)
+ Number of boosted trees to fit.
+ subsample_for_bin : int, optional (default=200000)
+ Number of samples for constructing bins.
+ objective : str, callable or None, optional (default=None)
+ Specify the learning task and the corresponding learning objective or
+ a custom objective function to be used (see note below).
+ Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
+ class_weight : dict, 'balanced' or None, optional (default=None)
+ Weights associated with classes in the form ``{class_label: weight}``.
+ Use this parameter only for multi-class classification task;
+ for binary classification task you may use ``is_unbalance`` or ``scale_pos_weight`` parameters.
+            Note that the usage of all these parameters will result in poor estimates of the individual class probabilities.
+ You may want to consider performing probability calibration
+ (https://scikit-learn.org/stable/modules/calibration.html) of your model.
+ The 'balanced' mode uses the values of y to automatically adjust weights
+ inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.
+ If None, all classes are supposed to have weight one.
+            Note that these weights will be multiplied with ``sample_weight`` (passed through the ``fit`` method)
+ if ``sample_weight`` is specified.
+ min_split_gain : float, optional (default=0.)
+ Minimum loss reduction required to make a further partition on a leaf node of the tree.
+ min_child_weight : float, optional (default=1e-3)
+ Minimum sum of instance weight (Hessian) needed in a child (leaf).
+ min_child_samples : int, optional (default=20)
+ Minimum number of data needed in a child (leaf).
+ subsample : float, optional (default=1.)
+ Subsample ratio of the training instance.
+ subsample_freq : int, optional (default=0)
+            Frequency of subsample, <=0 means disabled.
+ colsample_bytree : float, optional (default=1.)
+ Subsample ratio of columns when constructing each tree.
+ reg_alpha : float, optional (default=0.)
+ L1 regularization term on weights.
+ reg_lambda : float, optional (default=0.)
+ L2 regularization term on weights.
+ random_state : int, RandomState object or None, optional (default=None)
+ Random number seed.
+ If int, this number is used to seed the C++ code.
+ If RandomState or Generator object (numpy), a random integer is picked based on its state to seed the C++ code.
+ If None, default seeds in C++ code are used.
+ n_jobs : int or None, optional (default=None)
+ Number of parallel threads to use for training (can be changed at prediction time by
+ passing it as an extra keyword argument).
+
+ For better performance, it is recommended to set this to the number of physical cores
+ in the CPU.
+
+ Negative integers are interpreted as following joblib's formula (n_cpus + 1 + n_jobs), just like
+            scikit-learn (so e.g. -1 means using all threads). A value of zero corresponds to the default number of
+ threads configured for OpenMP in the system. A value of ``None`` (the default) corresponds
+ to using the number of physical cores in the system (its correct detection requires
+ either the ``joblib`` or the ``psutil`` util libraries to be installed).
+
+ .. versionchanged:: 4.0.0
+
+ importance_type : str, optional (default='split')
+ The type of feature importance to be filled into ``feature_importances_``.
+ If 'split', result contains numbers of times the feature is used in a model.
+ If 'gain', result contains total gains of splits which use the feature.
+ **kwargs
+ Other parameters for the model.
+ Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
+
+ .. warning::
+
+ \*\*kwargs is not supported in sklearn, it may cause unexpected issues.
+
+ Note
+ ----
+ A custom objective function can be provided for the ``objective`` parameter.
+ In this case, it should have the signature
+ ``objective(y_true, y_pred) -> grad, hess``,
+ ``objective(y_true, y_pred, weight) -> grad, hess``
+ or ``objective(y_true, y_pred, weight, group) -> grad, hess``:
+
+ y_true : numpy 1-D array of shape = [n_samples]
+ The target values.
+ y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The predicted values.
+ Predicted values are returned before any transformation,
+ e.g. they are raw margin instead of probability of positive class for binary task.
+ weight : numpy 1-D array of shape = [n_samples]
+ The weight of samples. Weights should be non-negative.
+ group : numpy 1-D array
+ Group/query data.
+ Only used in the learning-to-rank task.
+ sum(group) = n_samples.
+ For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+ where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+ grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The value of the first order derivative (gradient) of the loss
+ with respect to the elements of y_pred for each sample point.
+ hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+ The value of the second order derivative (Hessian) of the loss
+ with respect to the elements of y_pred for each sample point.
+
+ For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
+ and grad and hess should be returned in the same format.
+ """
+ if not SKLEARN_INSTALLED:
+ raise LightGBMError('scikit-learn is required for lightgbm.sklearn. '
+ 'You must install scikit-learn and restart your session to use this module.')
+
+ self.boosting_type = boosting_type
+ self.objective = objective
+ self.num_leaves = num_leaves
+ self.max_depth = max_depth
+ self.learning_rate = learning_rate
+ self.n_estimators = n_estimators
+ self.subsample_for_bin = subsample_for_bin
+ self.min_split_gain = min_split_gain
+ self.min_child_weight = min_child_weight
+ self.min_child_samples = min_child_samples
+ self.subsample = subsample
+ self.subsample_freq = subsample_freq
+ self.colsample_bytree = colsample_bytree
+ self.reg_alpha = reg_alpha
+ self.reg_lambda = reg_lambda
+ self.random_state = random_state
+ self.n_jobs = n_jobs
+ self.importance_type = importance_type
+ self._Booster: Optional[Booster] = None
+ self._evals_result: _EvalResultDict = {}
+ self._best_score: _LGBM_BoosterBestScoreType = {}
+ self._best_iteration: int = -1
+ self._other_params: Dict[str, Any] = {}
+ self._objective = objective
+ self.class_weight = class_weight
+ self._class_weight: Optional[Union[Dict, str]] = None
+ self._class_map: Optional[Dict[int, int]] = None
+ self._n_features: int = -1
+ self._n_features_in: int = -1
+ self._classes: Optional[np.ndarray] = None
+ self._n_classes: int = -1
+ self.set_params(**kwargs)
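+
+    # Illustrative note (not part of the library): keyword arguments that are not explicit
+    # constructor parameters are stored in ``_other_params`` via ``set_params`` and passed
+    # through to LightGBM unchanged, e.g. (hypothetical values)
+    #
+    #     LGBMModel(num_leaves=63, min_data_in_bin=5)
+    #
+    # forwards ``min_data_in_bin=5`` to the underlying Booster configuration at fit time.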
+
+ def _more_tags(self) -> Dict[str, Any]:
+ return {
+ 'allow_nan': True,
+ 'X_types': ['2darray', 'sparse', '1dlabels'],
+ '_xfail_checks': {
+ 'check_no_attributes_set_in_init':
+ 'scikit-learn incorrectly asserts that private attributes '
+ 'cannot be set in __init__: '
+ '(see https://github.com/microsoft/LightGBM/issues/2628)'
+ }
+ }
+
+ def __sklearn_is_fitted__(self) -> bool:
+ return getattr(self, "fitted_", False)
+
+ def get_params(self, deep: bool = True) -> Dict[str, Any]:
+ """Get parameters for this estimator.
+
+ Parameters
+ ----------
+ deep : bool, optional (default=True)
+ If True, will return the parameters for this estimator and
+ contained subobjects that are estimators.
+
+ Returns
+ -------
+ params : dict
+ Parameter names mapped to their values.
+ """
+ params = super().get_params(deep=deep)
+ params.update(self._other_params)
+ return params
+
+ def set_params(self, **params: Any) -> "LGBMModel":
+ """Set the parameters of this estimator.
+
+ Parameters
+ ----------
+ **params
+ Parameter names with their new values.
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
+ for key, value in params.items():
+ setattr(self, key, value)
+ if hasattr(self, f"_{key}"):
+ setattr(self, f"_{key}", value)
+ self._other_params[key] = value
+ return self
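+
+    # Illustrative note (not part of the library): parameters outside the sklearn signature
+    # are remembered in ``_other_params`` so that ``get_params`` round-trips them,
+    # e.g. (hypothetical values)
+    #
+    #     est = LGBMRegressor().set_params(feature_fraction=0.8)
+    #     est.get_params()['feature_fraction']   # 0.8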
+
+ def _process_params(self, stage: str) -> Dict[str, Any]:
+ """Process the parameters of this estimator based on its type, parameter aliases, etc.
+
+ Parameters
+ ----------
+ stage : str
+ Name of the stage (can be ``fit`` or ``predict``) this method is called from.
+
+ Returns
+ -------
+ processed_params : dict
+ Processed parameter names mapped to their values.
+ """
+ assert stage in {"fit", "predict"}
+ params = self.get_params()
+
+ params.pop('objective', None)
+ for alias in _ConfigAliases.get('objective'):
+ if alias in params:
+ obj = params.pop(alias)
+ _log_warning(f"Found '{alias}' in params. Will use it instead of 'objective' argument")
+ if stage == "fit":
+ self._objective = obj
+ if stage == "fit":
+ if self._objective is None:
+ if isinstance(self, LGBMRegressor):
+ self._objective = "regression"
+ elif isinstance(self, LGBMClassifier):
+ if self._n_classes > 2:
+ self._objective = "multiclass"
+ else:
+ self._objective = "binary"
+ elif isinstance(self, LGBMRanker):
+ self._objective = "lambdarank"
+ else:
+ raise ValueError("Unknown LGBMModel type.")
+ if callable(self._objective):
+ if stage == "fit":
+ params['objective'] = _ObjectiveFunctionWrapper(self._objective)
+ else:
+ params['objective'] = 'None'
+ else:
+ params['objective'] = self._objective
+
+ params.pop('importance_type', None)
+ params.pop('n_estimators', None)
+ params.pop('class_weight', None)
+
+ if isinstance(params['random_state'], np.random.RandomState):
+ params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max)
+ elif isinstance(params['random_state'], np_random_Generator):
+ params['random_state'] = int(
+ params['random_state'].integers(np.iinfo(np.int32).max)
+ )
+ if self._n_classes > 2:
+ for alias in _ConfigAliases.get('num_class'):
+ params.pop(alias, None)
+ params['num_class'] = self._n_classes
+ if hasattr(self, '_eval_at'):
+ eval_at = self._eval_at
+ for alias in _ConfigAliases.get('eval_at'):
+ if alias in params:
+ _log_warning(f"Found '{alias}' in params. Will use it instead of 'eval_at' argument")
+ eval_at = params.pop(alias)
+ params['eval_at'] = eval_at
+
+ # register default metric for consistency with callable eval_metric case
+ original_metric = self._objective if isinstance(self._objective, str) else None
+ if original_metric is None:
+ # try to deduce from class instance
+ if isinstance(self, LGBMRegressor):
+ original_metric = "l2"
+ elif isinstance(self, LGBMClassifier):
+ original_metric = "multi_logloss" if self._n_classes > 2 else "binary_logloss"
+ elif isinstance(self, LGBMRanker):
+ original_metric = "ndcg"
+
+ # overwrite default metric by explicitly set metric
+ params = _choose_param_value("metric", params, original_metric)
+
+ # use joblib conventions for negative n_jobs, just like scikit-learn
+ # at predict time, this is handled later due to the order of parameter updates
+ if stage == "fit":
+ params = _choose_param_value("num_threads", params, self.n_jobs)
+ params["num_threads"] = self._process_n_jobs(params["num_threads"])
+
+ return params
+
+ def _process_n_jobs(self, n_jobs: Optional[int]) -> int:
+ """Convert special values of n_jobs to their actual values according to the formulas that apply.
+
+ Parameters
+ ----------
+ n_jobs : int or None
+ The original value of n_jobs, potentially having special values such as 'None' or
+ negative integers.
+
+ Returns
+ -------
+ n_jobs : int
+ The value of n_jobs with special values converted to actual number of threads.
+ """
+ if n_jobs is None:
+ n_jobs = _LGBMCpuCount(only_physical_cores=True)
+ elif n_jobs < 0:
+ n_jobs = max(_LGBMCpuCount(only_physical_cores=False) + 1 + n_jobs, 1)
+ return n_jobs
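+
+    # Illustrative note (not part of the library): on a machine with 8 logical cores
+    # (hypothetical), ``_process_n_jobs(-1)`` returns 8 and ``_process_n_jobs(-2)`` returns 7
+    # (joblib's n_cpus + 1 + n_jobs formula), ``_process_n_jobs(None)`` returns the physical
+    # core count, and non-negative values are returned unchanged.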
+
+ def fit(
+ self,
+ X: _LGBM_ScikitMatrixLike,
+ y: _LGBM_LabelType,
+ sample_weight: Optional[_LGBM_WeightType] = None,
+ init_score: Optional[_LGBM_InitScoreType] = None,
+ group: Optional[_LGBM_GroupType] = None,
+ eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
+ eval_names: Optional[List[str]] = None,
+ eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+ eval_class_weight: Optional[List[float]] = None,
+ eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
+ eval_group: Optional[List[_LGBM_GroupType]] = None,
+ eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
+ feature_name: _LGBM_FeatureNameConfiguration = 'auto',
+ categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
+ callbacks: Optional[List[Callable]] = None,
+ init_model: Optional[Union[str, Path, Booster, "LGBMModel"]] = None
+ ) -> "LGBMModel":
+ """Docstring is set after definition, using a template."""
+ params = self._process_params(stage="fit")
+
+ # Do not modify original args in fit function
+ # Refer to https://github.com/microsoft/LightGBM/pull/2619
+ eval_metric_list: List[Union[str, _LGBM_ScikitCustomEvalFunction]]
+ if eval_metric is None:
+ eval_metric_list = []
+ elif isinstance(eval_metric, list):
+ eval_metric_list = copy.deepcopy(eval_metric)
+ else:
+ eval_metric_list = [copy.deepcopy(eval_metric)]
+
+ # Separate built-in from callable evaluation metrics
+ eval_metrics_callable = [_EvalFunctionWrapper(f) for f in eval_metric_list if callable(f)]
+ eval_metrics_builtin = [m for m in eval_metric_list if isinstance(m, str)]
+
+ # concatenate metric from params (or default if not provided in params) and eval_metric
+ params['metric'] = [params['metric']] if isinstance(params['metric'], (str, type(None))) else params['metric']
+ params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric']
+ params['metric'] = [metric for metric in params['metric'] if metric is not None]
+
+ if not isinstance(X, (pd_DataFrame, dt_DataTable)):
+ _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
+ if sample_weight is not None:
+ sample_weight = _LGBMCheckSampleWeight(sample_weight, _X)
+ else:
+ _X, _y = X, y
+
+ if self._class_weight is None:
+ self._class_weight = self.class_weight
+ if self._class_weight is not None:
+ class_sample_weight = _LGBMComputeSampleWeight(self._class_weight, y)
+ if sample_weight is None or len(sample_weight) == 0:
+ sample_weight = class_sample_weight
+ else:
+ sample_weight = np.multiply(sample_weight, class_sample_weight)
+
+ self._n_features = _X.shape[1]
+ # copy for consistency
+ self._n_features_in = self._n_features
+
+ train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group,
+ init_score=init_score, categorical_feature=categorical_feature,
+ params=params)
+
+ valid_sets: List[Dataset] = []
+ if eval_set is not None:
+
+ def _get_meta_data(collection, name, i):
+ if collection is None:
+ return None
+ elif isinstance(collection, list):
+ return collection[i] if len(collection) > i else None
+ elif isinstance(collection, dict):
+ return collection.get(i, None)
+ else:
+ raise TypeError(f"{name} should be dict or list")
+
+ if isinstance(eval_set, tuple):
+ eval_set = [eval_set]
+ for i, valid_data in enumerate(eval_set):
+                # reuse the training Dataset when a validation set is the training data itself, to reduce cost
+ if valid_data[0] is X and valid_data[1] is y:
+ valid_set = train_set
+ else:
+ valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
+ valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
+ if valid_class_weight is not None:
+ if isinstance(valid_class_weight, dict) and self._class_map is not None:
+ valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
+ valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
+ if valid_weight is None or len(valid_weight) == 0:
+ valid_weight = valid_class_sample_weight
+ else:
+ valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
+ valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
+ valid_group = _get_meta_data(eval_group, 'eval_group', i)
+ valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
+ group=valid_group, init_score=valid_init_score,
+ categorical_feature='auto', params=params)
+
+ valid_sets.append(valid_set)
+
+ if isinstance(init_model, LGBMModel):
+ init_model = init_model.booster_
+
+ if callbacks is None:
+ callbacks = []
+ else:
+ callbacks = copy.copy(callbacks) # don't use deepcopy here to allow non-serializable objects
+
+ evals_result: _EvalResultDict = {}
+ callbacks.append(record_evaluation(evals_result))
+
+ self._Booster = train(
+ params=params,
+ train_set=train_set,
+ num_boost_round=self.n_estimators,
+ valid_sets=valid_sets,
+ valid_names=eval_names,
+ feval=eval_metrics_callable, # type: ignore[arg-type]
+ init_model=init_model,
+ feature_name=feature_name,
+ callbacks=callbacks
+ )
+
+ self._evals_result = evals_result
+ self._best_iteration = self._Booster.best_iteration
+ self._best_score = self._Booster.best_score
+
+ self.fitted_ = True
+
+ # free dataset
+ self._Booster.free_dataset()
+ del train_set, valid_sets
+ return self
+
+ fit.__doc__ = _lgbmmodel_doc_fit.format(
+ X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+ y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
+ sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
+ init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
+ group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
+ eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)",
+ eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)",
+ eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)"
+ ) + "\n\n" + _lgbmmodel_doc_custom_eval_note
+
+ def predict(
+ self,
+ X: _LGBM_ScikitMatrixLike,
+ raw_score: bool = False,
+ start_iteration: int = 0,
+ num_iteration: Optional[int] = None,
+ pred_leaf: bool = False,
+ pred_contrib: bool = False,
+ validate_features: bool = False,
+ **kwargs: Any
+ ):
+ """Docstring is set after definition, using a template."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
+ if not isinstance(X, (pd_DataFrame, dt_DataTable)):
+ X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
+ n_features = X.shape[1]
+ if self._n_features != n_features:
+ raise ValueError("Number of features of the model must "
+ f"match the input. Model n_features_ is {self._n_features} and "
+ f"input n_features is {n_features}")
+        # retrieve original params that may be used in both training and prediction
+ # and then overwrite them (considering aliases) with params that were passed directly in prediction
+ predict_params = self._process_params(stage="predict")
+ for alias in _ConfigAliases.get_by_alias(
+ "data",
+ "X",
+ "raw_score",
+ "start_iteration",
+ "num_iteration",
+ "pred_leaf",
+ "pred_contrib",
+ *kwargs.keys()
+ ):
+ predict_params.pop(alias, None)
+ predict_params.update(kwargs)
+
+ # number of threads can have values with special meaning which is only applied
+ # in the scikit-learn interface, these should not reach the c++ side as-is
+ predict_params = _choose_param_value("num_threads", predict_params, self.n_jobs)
+ predict_params["num_threads"] = self._process_n_jobs(predict_params["num_threads"])
+
+ return self._Booster.predict( # type: ignore[union-attr]
+ X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration,
+ pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features,
+ **predict_params
+ )
+
+ predict.__doc__ = _lgbmmodel_doc_predict.format(
+ description="Return the predicted value for each sample.",
+ X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+ output_name="predicted_result",
+ predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
+ X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+ X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
+ )
+
+ @property
+ def n_features_(self) -> int:
+ """:obj:`int`: The number of features of fitted model."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.')
+ return self._n_features
+
+ @property
+ def n_features_in_(self) -> int:
+ """:obj:`int`: The number of features of fitted model."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.')
+ return self._n_features_in
+
+ @property
+ def best_score_(self) -> _LGBM_BoosterBestScoreType:
+ """:obj:`dict`: The best score of fitted model."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No best_score found. Need to call fit beforehand.')
+ return self._best_score
+
+ @property
+ def best_iteration_(self) -> int:
+ """:obj:`int`: The best iteration of fitted model if ``early_stopping()`` callback has been specified."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No best_iteration found. Need to call fit with early_stopping callback beforehand.')
+ return self._best_iteration
+
+ @property
+ def objective_(self) -> Union[str, _LGBM_ScikitCustomObjectiveFunction]:
+ """:obj:`str` or :obj:`callable`: The concrete objective used while fitting this model."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No objective found. Need to call fit beforehand.')
+ return self._objective # type: ignore[return-value]
+
+ @property
+ def n_estimators_(self) -> int:
+ """:obj:`int`: True number of boosting iterations performed.
+
+ This might be less than parameter ``n_estimators`` if early stopping was enabled or
+ if boosting stopped early due to limits on complexity like ``min_gain_to_split``.
+
+ .. versionadded:: 4.0.0
+ """
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No n_estimators found. Need to call fit beforehand.')
+ return self._Booster.current_iteration() # type: ignore
+
+ @property
+ def n_iter_(self) -> int:
+ """:obj:`int`: True number of boosting iterations performed.
+
+ This might be less than parameter ``n_estimators`` if early stopping was enabled or
+ if boosting stopped early due to limits on complexity like ``min_gain_to_split``.
+
+ .. versionadded:: 4.0.0
+ """
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No n_iter found. Need to call fit beforehand.')
+ return self._Booster.current_iteration() # type: ignore
+
+ @property
+ def booster_(self) -> Booster:
+ """Booster: The underlying Booster of this model."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No booster found. Need to call fit beforehand.')
+ return self._Booster # type: ignore[return-value]
+
+ @property
+ def evals_result_(self) -> _EvalResultDict:
+ """:obj:`dict`: The evaluation results if validation sets have been specified."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No results found. Need to call fit with eval_set beforehand.')
+ return self._evals_result
+
+ @property
+ def feature_importances_(self) -> np.ndarray:
+ """:obj:`array` of shape = [n_features]: The feature importances (the higher, the more important).
+
+ .. note::
+
+ ``importance_type`` attribute is passed to the function
+ to configure the type of importance values to be extracted.
+ """
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')
+ return self._Booster.feature_importance(importance_type=self.importance_type) # type: ignore[union-attr]
+
+ @property
+ def feature_name_(self) -> List[str]:
+ """:obj:`list` of shape = [n_features]: The names of features."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No feature_name found. Need to call fit beforehand.')
+ return self._Booster.feature_name() # type: ignore[union-attr]
+
+
+class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
+ """LightGBM regressor."""
+
+ def fit( # type: ignore[override]
+ self,
+ X: _LGBM_ScikitMatrixLike,
+ y: _LGBM_LabelType,
+ sample_weight: Optional[_LGBM_WeightType] = None,
+ init_score: Optional[_LGBM_InitScoreType] = None,
+ eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
+ eval_names: Optional[List[str]] = None,
+ eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+ eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
+ eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
+ feature_name: _LGBM_FeatureNameConfiguration = 'auto',
+ categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
+ callbacks: Optional[List[Callable]] = None,
+ init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
+ ) -> "LGBMRegressor":
+ """Docstring is inherited from the LGBMModel."""
+ super().fit(
+ X,
+ y,
+ sample_weight=sample_weight,
+ init_score=init_score,
+ eval_set=eval_set,
+ eval_names=eval_names,
+ eval_sample_weight=eval_sample_weight,
+ eval_init_score=eval_init_score,
+ eval_metric=eval_metric,
+ feature_name=feature_name,
+ categorical_feature=categorical_feature,
+ callbacks=callbacks,
+ init_model=init_model
+ )
+ return self
+
+ _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRegressor") # type: ignore
+ _base_doc = (_base_doc[:_base_doc.find('group :')] # type: ignore
+ + _base_doc[_base_doc.find('eval_set :'):]) # type: ignore
+ _base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')]
+ + _base_doc[_base_doc.find('eval_init_score :'):])
+ fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')]
+ + _base_doc[_base_doc.find('eval_metric :'):])
+
+
+class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
+ """LightGBM classifier."""
+
+ def fit( # type: ignore[override]
+ self,
+ X: _LGBM_ScikitMatrixLike,
+ y: _LGBM_LabelType,
+ sample_weight: Optional[_LGBM_WeightType] = None,
+ init_score: Optional[_LGBM_InitScoreType] = None,
+ eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
+ eval_names: Optional[List[str]] = None,
+ eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+ eval_class_weight: Optional[List[float]] = None,
+ eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
+ eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
+ feature_name: _LGBM_FeatureNameConfiguration = 'auto',
+ categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
+ callbacks: Optional[List[Callable]] = None,
+ init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
+ ) -> "LGBMClassifier":
+ """Docstring is inherited from the LGBMModel."""
+ _LGBMAssertAllFinite(y)
+ _LGBMCheckClassificationTargets(y)
+ self._le = _LGBMLabelEncoder().fit(y)
+ _y = self._le.transform(y)
+ self._class_map = dict(zip(self._le.classes_, self._le.transform(self._le.classes_)))
+ if isinstance(self.class_weight, dict):
+ self._class_weight = {self._class_map[k]: v for k, v in self.class_weight.items()}
+
+ self._classes = self._le.classes_
+ self._n_classes = len(self._classes) # type: ignore[arg-type]
+ if self.objective is None:
+ self._objective = None
+
+ # adjust eval metrics to match whether binary or multiclass
+ # classification is being performed
+ if not callable(eval_metric):
+ if isinstance(eval_metric, list):
+ eval_metric_list = eval_metric
+ elif isinstance(eval_metric, str):
+ eval_metric_list = [eval_metric]
+ else:
+ eval_metric_list = []
+ if self._n_classes > 2:
+ for index, metric in enumerate(eval_metric_list):
+ if metric in {'logloss', 'binary_logloss'}:
+ eval_metric_list[index] = "multi_logloss"
+ elif metric in {'error', 'binary_error'}:
+ eval_metric_list[index] = "multi_error"
+ else:
+ for index, metric in enumerate(eval_metric_list):
+ if metric in {'logloss', 'multi_logloss'}:
+ eval_metric_list[index] = 'binary_logloss'
+ elif metric in {'error', 'multi_error'}:
+ eval_metric_list[index] = 'binary_error'
+ eval_metric = eval_metric_list
+
+ # do not modify args, as it causes errors in model selection tools
+ valid_sets: Optional[List[_LGBM_ScikitValidSet]] = None
+ if eval_set is not None:
+ if isinstance(eval_set, tuple):
+ eval_set = [eval_set]
+ valid_sets = []
+ for valid_x, valid_y in eval_set:
+ if valid_x is X and valid_y is y:
+ valid_sets.append((valid_x, _y))
+ else:
+ valid_sets.append((valid_x, self._le.transform(valid_y)))
+
+ super().fit(
+ X,
+ _y,
+ sample_weight=sample_weight,
+ init_score=init_score,
+ eval_set=valid_sets,
+ eval_names=eval_names,
+ eval_sample_weight=eval_sample_weight,
+ eval_class_weight=eval_class_weight,
+ eval_init_score=eval_init_score,
+ eval_metric=eval_metric,
+ feature_name=feature_name,
+ categorical_feature=categorical_feature,
+ callbacks=callbacks,
+ init_model=init_model
+ )
+ return self
+
+ _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMClassifier") # type: ignore
+ _base_doc = (_base_doc[:_base_doc.find('group :')] # type: ignore
+ + _base_doc[_base_doc.find('eval_set :'):]) # type: ignore
+ fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')]
+ + _base_doc[_base_doc.find('eval_metric :'):])
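+
+    # Illustrative note (not part of the library): labels may be arbitrary values; they are
+    # encoded with a label encoder before training and decoded again in ``predict``,
+    # e.g. (hypothetical data)
+    #
+    #     clf = LGBMClassifier().fit(X, ['cat', 'dog', 'dog', 'cat'] * 25)
+    #     clf.classes_         # array(['cat', 'dog'], ...)
+    #     clf.predict(X)[:2]   # decoded back to 'cat' / 'dog' labels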
+
+ def predict(
+ self,
+ X: _LGBM_ScikitMatrixLike,
+ raw_score: bool = False,
+ start_iteration: int = 0,
+ num_iteration: Optional[int] = None,
+ pred_leaf: bool = False,
+ pred_contrib: bool = False,
+ validate_features: bool = False,
+ **kwargs: Any
+ ):
+ """Docstring is inherited from the LGBMModel."""
+ result = self.predict_proba(
+ X=X,
+ raw_score=raw_score,
+ start_iteration=start_iteration,
+ num_iteration=num_iteration,
+ pred_leaf=pred_leaf,
+ pred_contrib=pred_contrib,
+ validate_features=validate_features,
+ **kwargs
+ )
+ if callable(self._objective) or raw_score or pred_leaf or pred_contrib:
+ return result
+ else:
+ class_index = np.argmax(result, axis=1)
+ return self._le.inverse_transform(class_index)
+
+ predict.__doc__ = LGBMModel.predict.__doc__
+
+ def predict_proba(
+ self,
+ X: _LGBM_ScikitMatrixLike,
+ raw_score: bool = False,
+ start_iteration: int = 0,
+ num_iteration: Optional[int] = None,
+ pred_leaf: bool = False,
+ pred_contrib: bool = False,
+ validate_features: bool = False,
+ **kwargs: Any
+ ):
+ """Docstring is set after definition, using a template."""
+ result = super().predict(
+ X=X,
+ raw_score=raw_score,
+ start_iteration=start_iteration,
+ num_iteration=num_iteration,
+ pred_leaf=pred_leaf,
+ pred_contrib=pred_contrib,
+ validate_features=validate_features,
+ **kwargs
+ )
+ if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib):
+ _log_warning("Cannot compute class probabilities or labels "
+ "due to the usage of customized objective function.\n"
+ "Returning raw scores instead.")
+ return result
+ elif self._n_classes > 2 or raw_score or pred_leaf or pred_contrib: # type: ignore [operator]
+ return result
+ else:
+ return np.vstack((1. - result, result)).transpose()
+
+ predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
+ description="Return the predicted probability for each class for each sample.",
+ X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+ output_name="predicted_probability",
+ predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
+ X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+ X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
+ )
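+
+    # Illustrative note (not part of the library): in the binary case the Booster returns only
+    # the probability of the positive class, so ``predict_proba`` stacks it with its complement
+    # to produce the sklearn-style [n_samples, 2] output, e.g. (hypothetical values)
+    #
+    #     clf.predict_proba(X)[0]   # array([0.27, 0.73])  -> P(class 0), P(class 1)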
+
+ @property
+ def classes_(self) -> np.ndarray:
+ """:obj:`array` of shape = [n_classes]: The class label array."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
+ return self._classes # type: ignore[return-value]
+
+ @property
+ def n_classes_(self) -> int:
+ """:obj:`int`: The number of classes."""
+ if not self.__sklearn_is_fitted__():
+ raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
+ return self._n_classes
+
+
+class LGBMRanker(LGBMModel):
+ """LightGBM ranker.
+
+ .. warning::
+
+ scikit-learn doesn't support ranking applications yet,
+ therefore this class is not really compatible with the sklearn ecosystem.
+        Please use this class mainly for training and applying ranking models in the common scikit-learn way.
+ """
+
+ def fit( # type: ignore[override]
+ self,
+ X: _LGBM_ScikitMatrixLike,
+ y: _LGBM_LabelType,
+ sample_weight: Optional[_LGBM_WeightType] = None,
+ init_score: Optional[_LGBM_InitScoreType] = None,
+ group: Optional[_LGBM_GroupType] = None,
+ eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
+ eval_names: Optional[List[str]] = None,
+ eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+ eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
+ eval_group: Optional[List[_LGBM_GroupType]] = None,
+ eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
+ eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5),
+ feature_name: _LGBM_FeatureNameConfiguration = 'auto',
+ categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
+ callbacks: Optional[List[Callable]] = None,
+ init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
+ ) -> "LGBMRanker":
+ """Docstring is inherited from the LGBMModel."""
+ # check group data
+ if group is None:
+ raise ValueError("Should set group for ranking task")
+
+ if eval_set is not None:
+ if eval_group is None:
+ raise ValueError("Eval_group cannot be None when eval_set is not None")
+ elif len(eval_group) != len(eval_set):
+ raise ValueError("Length of eval_group should be equal to eval_set")
+ elif (isinstance(eval_group, dict)
+ and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))
+ or isinstance(eval_group, list)
+ and any(group is None for group in eval_group)):
+ raise ValueError("Should set group for all eval datasets for ranking task; "
+ "if you use dict, the index should start from 0")
+
+ self._eval_at = eval_at
+ super().fit(
+ X,
+ y,
+ sample_weight=sample_weight,
+ init_score=init_score,
+ group=group,
+ eval_set=eval_set,
+ eval_names=eval_names,
+ eval_sample_weight=eval_sample_weight,
+ eval_init_score=eval_init_score,
+ eval_group=eval_group,
+ eval_metric=eval_metric,
+ feature_name=feature_name,
+ categorical_feature=categorical_feature,
+ callbacks=callbacks,
+ init_model=init_model
+ )
+ return self
+
+ _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRanker") # type: ignore
+ fit.__doc__ = (_base_doc[:_base_doc.find('eval_class_weight :')] # type: ignore
+ + _base_doc[_base_doc.find('eval_init_score :'):]) # type: ignore
+ _base_doc = fit.__doc__
+ _before_feature_name, _feature_name, _after_feature_name = _base_doc.partition('feature_name :')
+ fit.__doc__ = f"""{_before_feature_name}eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))
+ The evaluation positions of the specified metric.
+ {_feature_name}{_after_feature_name}"""
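+
+
+# Illustrative sketch (not part of the library): query groups are passed as group sizes that
+# sum to n_samples, both for the training data and for each validation set (hypothetical names).
+#
+#     ranker = LGBMRanker(n_estimators=100)
+#     ranker.fit(
+#         X_train, y_train,
+#         group=[10, 20, 40, 10, 10, 10],   # 100 training documents split into 6 queries
+#         eval_set=[(X_valid, y_valid)],
+#         eval_group=[[25, 25]],            # one list of group sizes per validation set
+#         eval_at=[1, 3, 5],                # report NDCG at positions 1, 3 and 5
+#     )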