author | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-03-06 08:05:12 +0100
committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-03-06 08:05:12 +0100
commit | 08636303e6bef99ea5f997a8ba4a41256aabee6b (patch)
tree | b8480c4d5289dedb06b3e3338953666900a849ee /ext
parent | 112ebbe21ff330c68faf883b125ba4932e007544 (diff)
import lightgbm
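For reference, a minimal smoke-test sketch of the package as vendored here. The `sys.path` handling is an assumption for illustration only; the commit itself just adds the files under `ext/`:

```python
# Hypothetical check that the vendored copy is importable and functional.
# Assumes the repository root is the current working directory.
import sys
sys.path.insert(0, "ext")

import numpy as np
import lightgbm  # resolves to ext/lightgbm

X = np.random.rand(100, 4)
y = np.random.rand(100)
dtrain = lightgbm.Dataset(X, label=y)
booster = lightgbm.train({"objective": "regression", "verbosity": -1},
                         dtrain, num_boost_round=5)
print(booster.predict(X[:3]))
```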
Diffstat (limited to 'ext')
-rw-r--r-- | ext/lightgbm/VERSION.txt | 1
-rw-r--r-- | ext/lightgbm/__init__.py | 36
-rw-r--r-- | ext/lightgbm/basic.py | 4952
-rw-r--r-- | ext/lightgbm/callback.py | 470
-rw-r--r-- | ext/lightgbm/compat.py | 269
-rw-r--r-- | ext/lightgbm/dask.py | 1671
-rw-r--r-- | ext/lightgbm/engine.py | 785
-rwxr-xr-x | ext/lightgbm/lib/lib_lightgbm.so | bin 0 -> 8098376 bytes
-rw-r--r-- | ext/lightgbm/libpath.py | 32
-rw-r--r-- | ext/lightgbm/plotting.py | 828
-rw-r--r-- | ext/lightgbm/py.typed | 0
-rw-r--r-- | ext/lightgbm/sklearn.py | 1370
12 files changed, 10414 insertions, 0 deletions
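The `__init__.py` in the diff below wraps its scikit-learn, plotting, and Dask integrations in `try`/`except ImportError`, so the base package imports even when those soft dependencies are missing. A short sketch of the resulting behaviour (assuming the vendored copy is importable as `lightgbm`):

```python
import lightgbm

# Core names from basic.py and engine.py are always exported.
assert hasattr(lightgbm, "Booster") and hasattr(lightgbm, "train")

# Optional APIs are simply absent when their dependency is not installed.
print("sklearn API:", hasattr(lightgbm, "LGBMClassifier"))    # needs scikit-learn
print("dask API:", hasattr(lightgbm, "DaskLGBMRegressor"))    # needs dask
```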
diff --git a/ext/lightgbm/VERSION.txt b/ext/lightgbm/VERSION.txt
new file mode 100644
index 0000000..8089590
--- /dev/null
+++ b/ext/lightgbm/VERSION.txt
@@ -0,0 +1 @@
+4.3.0
diff --git a/ext/lightgbm/__init__.py b/ext/lightgbm/__init__.py
new file mode 100644
index 0000000..0dc5b75
--- /dev/null
+++ b/ext/lightgbm/__init__.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+"""LightGBM, Light Gradient Boosting Machine.
+
+Contributors: https://github.com/microsoft/LightGBM/graphs/contributors.
+"""
+from pathlib import Path
+
+from .basic import Booster, Dataset, Sequence, register_logger
+from .callback import EarlyStopException, early_stopping, log_evaluation, record_evaluation, reset_parameter
+from .engine import CVBooster, cv, train
+
+try:
+    from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor
+except ImportError:
+    pass
+try:
+    from .plotting import create_tree_digraph, plot_importance, plot_metric, plot_split_value_histogram, plot_tree
+except ImportError:
+    pass
+try:
+    from .dask import DaskLGBMClassifier, DaskLGBMRanker, DaskLGBMRegressor
+except ImportError:
+    pass
+
+
+_version_path = Path(__file__).absolute().parent / 'VERSION.txt'
+if _version_path.is_file():
+    __version__ = _version_path.read_text(encoding='utf-8').strip()
+
+__all__ = ['Dataset', 'Booster', 'CVBooster', 'Sequence',
+           'register_logger',
+           'train', 'cv',
+           'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
+           'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker',
+           'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'EarlyStopException',
+           'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph']
diff --git a/ext/lightgbm/basic.py b/ext/lightgbm/basic.py
new file mode 100644
index 0000000..5c3a32a
--- /dev/null
+++ b/ext/lightgbm/basic.py
@@ -0,0 +1,4952 @@
+# coding: utf-8
+"""Wrapper for C API of LightGBM."""
+import abc
+import ctypes
+import inspect
+import json
+import warnings
+from collections import OrderedDict
+from copy import deepcopy
+from enum import Enum
+from functools import wraps
+from os import SEEK_END, environ
+from os.path import getsize
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
+
+import numpy as np
+import scipy.sparse
+
+from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat,
+                     dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table,
+                     pd_CategoricalDtype, pd_DataFrame, pd_Series)
+from .libpath import find_lib_path
+
+if TYPE_CHECKING:
+    from typing import Literal
+
+    # typing.TypeGuard was only introduced in Python 3.10
+    try:
+        from typing import TypeGuard
+    except ImportError:
+        from typing_extensions import TypeGuard
+
+
+__all__ = [
+    'Booster',
+    'Dataset',
+    'LGBMDeprecationWarning',
+    'LightGBMError',
+    'register_logger',
+    'Sequence',
+]
+
+_BoosterHandle = ctypes.c_void_p
+_DatasetHandle = ctypes.c_void_p
+_ctypes_int_ptr = Union[
+    "ctypes._Pointer[ctypes.c_int32]",
+    "ctypes._Pointer[ctypes.c_int64]"
+]
+_ctypes_int_array = Union[
+    "ctypes.Array[ctypes._Pointer[ctypes.c_int32]]",
+    "ctypes.Array[ctypes._Pointer[ctypes.c_int64]]"
+]
+_ctypes_float_ptr = Union[
+    "ctypes._Pointer[ctypes.c_float]",
+    "ctypes._Pointer[ctypes.c_double]"
+]
+_ctypes_float_array = Union[
+    "ctypes.Array[ctypes._Pointer[ctypes.c_float]]",
+    "ctypes.Array[ctypes._Pointer[ctypes.c_double]]"
+]
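Further below, `basic.py` defines `register_logger()`, which accepts any object exposing info- and warning-style methods. A hedged usage sketch with Python's standard `logging` module, whose `Logger` already provides the default `info`/`warning` method names expected by that API:

```python
import logging
import lightgbm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("lightgbm-vendored")  # name chosen for illustration

# Route all LightGBM log output through the standard logging machinery.
lightgbm.register_logger(logger)
```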
+_LGBM_EvalFunctionResultType = Tuple[str, float, bool] +_LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]] +_LGBM_BoosterEvalMethodResultType = Tuple[str, str, float, bool] +_LGBM_BoosterEvalMethodResultWithStandardDeviationType = Tuple[str, str, float, bool, float] +_LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], "Literal['auto']"] +_LGBM_FeatureNameConfiguration = Union[List[str], "Literal['auto']"] +_LGBM_GroupType = Union[ + List[float], + List[int], + np.ndarray, + pd_Series, + pa_Array, + pa_ChunkedArray, +] +_LGBM_PositionType = Union[ + np.ndarray, + pd_Series +] +_LGBM_InitScoreType = Union[ + List[float], + List[List[float]], + np.ndarray, + pd_Series, + pd_DataFrame, + pa_Table, + pa_Array, + pa_ChunkedArray, +] +_LGBM_TrainDataType = Union[ + str, + Path, + np.ndarray, + pd_DataFrame, + dt_DataTable, + scipy.sparse.spmatrix, + "Sequence", + List["Sequence"], + List[np.ndarray], + pa_Table +] +_LGBM_LabelType = Union[ + List[float], + List[int], + np.ndarray, + pd_Series, + pd_DataFrame, + pa_Array, + pa_ChunkedArray, +] +_LGBM_PredictDataType = Union[ + str, + Path, + np.ndarray, + pd_DataFrame, + dt_DataTable, + scipy.sparse.spmatrix, + pa_Table, +] +_LGBM_WeightType = Union[ + List[float], + List[int], + np.ndarray, + pd_Series, + pa_Array, + pa_ChunkedArray, +] +ZERO_THRESHOLD = 1e-35 + + +def _is_zero(x: float) -> bool: + return -ZERO_THRESHOLD <= x <= ZERO_THRESHOLD + + +def _get_sample_count(total_nrow: int, params: str) -> int: + sample_cnt = ctypes.c_int(0) + _safe_call(_LIB.LGBM_GetSampleCount( + ctypes.c_int32(total_nrow), + _c_str(params), + ctypes.byref(sample_cnt), + )) + return sample_cnt.value + + +class _MissingType(Enum): + NONE = 'None' + NAN = 'NaN' + ZERO = 'Zero' + + +class _DummyLogger: + def info(self, msg: str) -> None: + print(msg) # noqa: T201 + + def warning(self, msg: str) -> None: + warnings.warn(msg, stacklevel=3) + + +_LOGGER: Any = _DummyLogger() +_INFO_METHOD_NAME = "info" +_WARNING_METHOD_NAME = "warning" + + +def _has_method(logger: Any, method_name: str) -> bool: + return callable(getattr(logger, method_name, None)) + + +def register_logger( + logger: Any, info_method_name: str = "info", warning_method_name: str = "warning" +) -> None: + """Register custom logger. + + Parameters + ---------- + logger : Any + Custom logger. + info_method_name : str, optional (default="info") + Method used to log info messages. + warning_method_name : str, optional (default="warning") + Method used to log warning messages. 
+ """ + if not _has_method(logger, info_method_name) or not _has_method(logger, warning_method_name): + raise TypeError( + f"Logger must provide '{info_method_name}' and '{warning_method_name}' method" + ) + + global _LOGGER, _INFO_METHOD_NAME, _WARNING_METHOD_NAME + _LOGGER = logger + _INFO_METHOD_NAME = info_method_name + _WARNING_METHOD_NAME = warning_method_name + + +def _normalize_native_string(func: Callable[[str], None]) -> Callable[[str], None]: + """Join log messages from native library which come by chunks.""" + msg_normalized: List[str] = [] + + @wraps(func) + def wrapper(msg: str) -> None: + nonlocal msg_normalized + if msg.strip() == '': + msg = ''.join(msg_normalized) + msg_normalized = [] + return func(msg) + else: + msg_normalized.append(msg) + + return wrapper + + +def _log_info(msg: str) -> None: + getattr(_LOGGER, _INFO_METHOD_NAME)(msg) + + +def _log_warning(msg: str) -> None: + getattr(_LOGGER, _WARNING_METHOD_NAME)(msg) + + +@_normalize_native_string +def _log_native(msg: str) -> None: + getattr(_LOGGER, _INFO_METHOD_NAME)(msg) + + +def _log_callback(msg: bytes) -> None: + """Redirect logs from native library into Python.""" + _log_native(str(msg.decode('utf-8'))) + + +def _load_lib() -> ctypes.CDLL: + """Load LightGBM library.""" + lib_path = find_lib_path() + lib = ctypes.cdll.LoadLibrary(lib_path[0]) + lib.LGBM_GetLastError.restype = ctypes.c_char_p + callback = ctypes.CFUNCTYPE(None, ctypes.c_char_p) + lib.callback = callback(_log_callback) # type: ignore[attr-defined] + if lib.LGBM_RegisterLogCallback(lib.callback) != 0: + raise LightGBMError(lib.LGBM_GetLastError().decode('utf-8')) + return lib + + +# we don't need lib_lightgbm while building docs +_LIB: ctypes.CDLL +if environ.get('LIGHTGBM_BUILD_DOC', False): + from unittest.mock import Mock # isort: skip + _LIB = Mock(ctypes.CDLL) # type: ignore +else: + _LIB = _load_lib() + + +_NUMERIC_TYPES = (int, float, bool) +_ArrayLike = Union[List, np.ndarray, pd_Series] + + +def _safe_call(ret: int) -> None: + """Check the return value from C API call. + + Parameters + ---------- + ret : int + The return value from C API calls. 
+ """ + if ret != 0: + raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8')) + + +def _is_numeric(obj: Any) -> bool: + """Check whether object is a number or not, include numpy number, etc.""" + try: + float(obj) + return True + except (TypeError, ValueError): + # TypeError: obj is not a string or a number + # ValueError: invalid literal + return False + + +def _is_numpy_1d_array(data: Any) -> bool: + """Check whether data is a numpy 1-D array.""" + return isinstance(data, np.ndarray) and len(data.shape) == 1 + + +def _is_numpy_column_array(data: Any) -> bool: + """Check whether data is a column numpy array.""" + if not isinstance(data, np.ndarray): + return False + shape = data.shape + return len(shape) == 2 and shape[1] == 1 + + +def _cast_numpy_array_to_dtype(array: np.ndarray, dtype: "np.typing.DTypeLike") -> np.ndarray: + """Cast numpy array to given dtype.""" + if array.dtype == dtype: + return array + return array.astype(dtype=dtype, copy=False) + + +def _is_1d_list(data: Any) -> bool: + """Check whether data is a 1-D list.""" + return isinstance(data, list) and (not data or _is_numeric(data[0])) + + +def _is_list_of_numpy_arrays(data: Any) -> "TypeGuard[List[np.ndarray]]": + return ( + isinstance(data, list) + and all(isinstance(x, np.ndarray) for x in data) + ) + + +def _is_list_of_sequences(data: Any) -> "TypeGuard[List[Sequence]]": + return ( + isinstance(data, list) + and all(isinstance(x, Sequence) for x in data) + ) + + +def _is_1d_collection(data: Any) -> bool: + """Check whether data is a 1-D collection.""" + return ( + _is_numpy_1d_array(data) + or _is_numpy_column_array(data) + or _is_1d_list(data) + or isinstance(data, pd_Series) + ) + + +def _list_to_1d_numpy( + data: Any, + dtype: "np.typing.DTypeLike", + name: str +) -> np.ndarray: + """Convert data to numpy 1-D array.""" + if _is_numpy_1d_array(data): + return _cast_numpy_array_to_dtype(data, dtype) + elif _is_numpy_column_array(data): + _log_warning('Converting column-vector to 1d array') + array = data.ravel() + return _cast_numpy_array_to_dtype(array, dtype) + elif _is_1d_list(data): + return np.array(data, dtype=dtype, copy=False) + elif isinstance(data, pd_Series): + _check_for_bad_pandas_dtypes(data.to_frame().dtypes) + return np.array(data, dtype=dtype, copy=False) # SparseArray should be supported as well + else: + raise TypeError(f"Wrong type({type(data).__name__}) for {name}.\n" + "It should be list, numpy 1-D array or pandas Series") + + +def _is_numpy_2d_array(data: Any) -> bool: + """Check whether data is a numpy 2-D array.""" + return isinstance(data, np.ndarray) and len(data.shape) == 2 and data.shape[1] > 1 + + +def _is_2d_list(data: Any) -> bool: + """Check whether data is a 2-D list.""" + return isinstance(data, list) and len(data) > 0 and _is_1d_list(data[0]) + + +def _is_2d_collection(data: Any) -> bool: + """Check whether data is a 2-D collection.""" + return ( + _is_numpy_2d_array(data) + or _is_2d_list(data) + or isinstance(data, pd_DataFrame) + ) + + +def _is_pyarrow_array(data: Any) -> bool: + """Check whether data is a PyArrow array.""" + return isinstance(data, (pa_Array, pa_ChunkedArray)) + + +def _is_pyarrow_table(data: Any) -> bool: + """Check whether data is a PyArrow table.""" + return isinstance(data, pa_Table) + + +class _ArrowCArray: + """Simple wrapper around the C representation of an Arrow type.""" + + n_chunks: int + chunks: arrow_cffi.CData + schema: arrow_cffi.CData + + def __init__(self, n_chunks: int, chunks: arrow_cffi.CData, schema: arrow_cffi.CData): + 
self.n_chunks = n_chunks + self.chunks = chunks + self.schema = schema + + @property + def chunks_ptr(self) -> int: + """Returns the address of the pointer to the list of chunks making up the array.""" + return int(arrow_cffi.cast("uintptr_t", arrow_cffi.addressof(self.chunks[0]))) + + @property + def schema_ptr(self) -> int: + """Returns the address of the pointer to the schema of the array.""" + return int(arrow_cffi.cast("uintptr_t", self.schema)) + + +def _export_arrow_to_c(data: pa_Table) -> _ArrowCArray: + """Export an Arrow type to its C representation.""" + # Obtain objects to export + if isinstance(data, pa_Array): + export_objects = [data] + elif isinstance(data, pa_ChunkedArray): + export_objects = data.chunks + elif isinstance(data, pa_Table): + export_objects = data.to_batches() + else: + raise ValueError(f"data of type '{type(data)}' cannot be exported to Arrow") + + # Prepare export + chunks = arrow_cffi.new("struct ArrowArray[]", len(export_objects)) + schema = arrow_cffi.new("struct ArrowSchema*") + + # Export all objects + for i, obj in enumerate(export_objects): + chunk_ptr = int(arrow_cffi.cast("uintptr_t", arrow_cffi.addressof(chunks[i]))) + if i == 0: + schema_ptr = int(arrow_cffi.cast("uintptr_t", schema)) + obj._export_to_c(chunk_ptr, schema_ptr) + else: + obj._export_to_c(chunk_ptr) + + return _ArrowCArray(len(chunks), chunks, schema) + + + +def _data_to_2d_numpy( + data: Any, + dtype: "np.typing.DTypeLike", + name: str +) -> np.ndarray: + """Convert data to numpy 2-D array.""" + if _is_numpy_2d_array(data): + return _cast_numpy_array_to_dtype(data, dtype) + if _is_2d_list(data): + return np.array(data, dtype=dtype) + if isinstance(data, pd_DataFrame): + _check_for_bad_pandas_dtypes(data.dtypes) + return _cast_numpy_array_to_dtype(data.values, dtype) + raise TypeError(f"Wrong type({type(data).__name__}) for {name}.\n" + "It should be list of lists, numpy 2-D array or pandas DataFrame") + + +def _cfloat32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: + """Convert a ctypes float pointer array to a numpy array.""" + if isinstance(cptr, ctypes.POINTER(ctypes.c_float)): + return np.ctypeslib.as_array(cptr, shape=(length,)).copy() + else: + raise RuntimeError('Expected float pointer') + + +def _cfloat64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: + """Convert a ctypes double pointer array to a numpy array.""" + if isinstance(cptr, ctypes.POINTER(ctypes.c_double)): + return np.ctypeslib.as_array(cptr, shape=(length,)).copy() + else: + raise RuntimeError('Expected double pointer') + + +def _cint32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: + """Convert a ctypes int pointer array to a numpy array.""" + if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)): + return np.ctypeslib.as_array(cptr, shape=(length,)).copy() + else: + raise RuntimeError('Expected int32 pointer') + + +def _cint64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: + """Convert a ctypes int pointer array to a numpy array.""" + if isinstance(cptr, ctypes.POINTER(ctypes.c_int64)): + return np.ctypeslib.as_array(cptr, shape=(length,)).copy() + else: + raise RuntimeError('Expected int64 pointer') + + +def _c_str(string: str) -> ctypes.c_char_p: + """Convert a Python string to C string.""" + return ctypes.c_char_p(string.encode('utf-8')) + + +def _c_array(ctype: type, values: List[Any]) -> ctypes.Array: + """Convert a Python array to C array.""" + return (ctype * len(values))(*values) # type: ignore[operator] + 
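As a self-contained sketch of what the pointer converters above do: `np.ctypeslib.as_array()` views a C pointer of known length as a numpy array, and the `.copy()` detaches the result from the native buffer before LightGBM frees it. The buffer here is simulated with numpy rather than obtained from the C API:

```python
import ctypes
import numpy as np

src = np.array([0.5, 1.5, 2.5], dtype=np.float64)
# Borrow a double* into the buffer, as the LightGBM C API would return one.
cptr = src.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
# Same recipe as _cfloat64_array_to_numpy() above: view, then copy out.
out = np.ctypeslib.as_array(cptr, shape=(len(src),)).copy()
assert np.array_equal(out, src)
```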
+ +def _json_default_with_numpy(obj: Any) -> Any: + """Convert numpy classes to JSON serializable objects.""" + if isinstance(obj, (np.integer, np.floating, np.bool_)): + return obj.item() + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return obj + + +def _to_string(x: Union[int, float, str, List]) -> str: + if isinstance(x, list): + val_list = ",".join(str(val) for val in x) + return f"[{val_list}]" + else: + return str(x) + + +def _param_dict_to_str(data: Optional[Dict[str, Any]]) -> str: + """Convert Python dictionary to string, which is passed to C API.""" + if data is None or not data: + return "" + pairs = [] + for key, val in data.items(): + if isinstance(val, (list, tuple, set)) or _is_numpy_1d_array(val): + pairs.append(f"{key}={','.join(map(_to_string, val))}") + elif isinstance(val, (str, Path, _NUMERIC_TYPES)) or _is_numeric(val): + pairs.append(f"{key}={val}") + elif val is not None: + raise TypeError(f'Unknown type of parameter:{key}, got:{type(val).__name__}') + return ' '.join(pairs) + + +class _TempFile: + """Proxy class to workaround errors on Windows.""" + + def __enter__(self): + with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f: + self.name = f.name + self.path = Path(self.name) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.path.is_file(): + self.path.unlink() + + +class LightGBMError(Exception): + """Error thrown by LightGBM.""" + + pass + + +# DeprecationWarning is not shown by default, so let's create our own with higher level +class LGBMDeprecationWarning(UserWarning): + """Custom deprecation warning.""" + + pass + + +class _ConfigAliases: + # lazy evaluation to allow import without dynamic library, e.g., for docs generation + aliases = None + + @staticmethod + def _get_all_param_aliases() -> Dict[str, List[str]]: + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call(_LIB.LGBM_DumpParamAliases( + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call(_LIB.LGBM_DumpParamAliases( + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + return json.loads( + string_buffer.value.decode('utf-8'), + object_hook=lambda obj: {k: [k] + v for k, v in obj.items()} + ) + + @classmethod + def get(cls, *args) -> Set[str]: + if cls.aliases is None: + cls.aliases = cls._get_all_param_aliases() + ret = set() + for i in args: + ret.update(cls.get_sorted(i)) + return ret + + @classmethod + def get_sorted(cls, name: str) -> List[str]: + if cls.aliases is None: + cls.aliases = cls._get_all_param_aliases() + return cls.aliases.get(name, [name]) + + @classmethod + def get_by_alias(cls, *args) -> Set[str]: + if cls.aliases is None: + cls.aliases = cls._get_all_param_aliases() + ret = set(args) + for arg in args: + for aliases in cls.aliases.values(): + if arg in aliases: + ret.update(aliases) + break + return ret + + +def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_value: Any) -> Dict[str, Any]: + """Get a single parameter value, accounting for aliases. 
+ + Parameters + ---------- + main_param_name : str + Name of the main parameter to get a value for. One of the keys of ``_ConfigAliases``. + params : dict + Dictionary of LightGBM parameters. + default_value : Any + Default value to use for the parameter, if none is found in ``params``. + + Returns + ------- + params : dict + A ``params`` dict with exactly one value for ``main_param_name``, and all aliases ``main_param_name`` removed. + If both ``main_param_name`` and one or more aliases for it are found, the value of ``main_param_name`` will be preferred. + """ + # avoid side effects on passed-in parameters + params = deepcopy(params) + + aliases = _ConfigAliases.get_sorted(main_param_name) + aliases = [a for a in aliases if a != main_param_name] + + # if main_param_name was provided, keep that value and remove all aliases + if main_param_name in params.keys(): + for param in aliases: + params.pop(param, None) + return params + + # if main param name was not found, search for an alias + for param in aliases: + if param in params.keys(): + params[main_param_name] = params[param] + break + + if main_param_name in params.keys(): + for param in aliases: + params.pop(param, None) + return params + + # neither of main_param_name, aliases were found + params[main_param_name] = default_value + + return params + + +_MAX_INT32 = (1 << 31) - 1 + +"""Macro definition of data type in C API of LightGBM""" +_C_API_DTYPE_FLOAT32 = 0 +_C_API_DTYPE_FLOAT64 = 1 +_C_API_DTYPE_INT32 = 2 +_C_API_DTYPE_INT64 = 3 + +"""Matrix is row major in Python""" +_C_API_IS_ROW_MAJOR = 1 + +"""Macro definition of prediction type in C API of LightGBM""" +_C_API_PREDICT_NORMAL = 0 +_C_API_PREDICT_RAW_SCORE = 1 +_C_API_PREDICT_LEAF_INDEX = 2 +_C_API_PREDICT_CONTRIB = 3 + +"""Macro definition of sparse matrix type""" +_C_API_MATRIX_TYPE_CSR = 0 +_C_API_MATRIX_TYPE_CSC = 1 + +"""Macro definition of feature importance type""" +_C_API_FEATURE_IMPORTANCE_SPLIT = 0 +_C_API_FEATURE_IMPORTANCE_GAIN = 1 + +"""Data type of data field""" +_FIELD_TYPE_MAPPER = { + "label": _C_API_DTYPE_FLOAT32, + "weight": _C_API_DTYPE_FLOAT32, + "init_score": _C_API_DTYPE_FLOAT64, + "group": _C_API_DTYPE_INT32, + "position": _C_API_DTYPE_INT32 +} + +"""String name to int feature importance type mapper""" +_FEATURE_IMPORTANCE_TYPE_MAPPER = { + "split": _C_API_FEATURE_IMPORTANCE_SPLIT, + "gain": _C_API_FEATURE_IMPORTANCE_GAIN +} + + +def _convert_from_sliced_object(data: np.ndarray) -> np.ndarray: + """Fix the memory of multi-dimensional sliced object.""" + if isinstance(data, np.ndarray) and isinstance(data.base, np.ndarray): + if not data.flags.c_contiguous: + _log_warning("Usage of np.ndarray subset (sliced data) is not recommended " + "due to it will double the peak memory cost in LightGBM.") + return np.copy(data) + return data + + +def _c_float_array( + data: np.ndarray +) -> Tuple[_ctypes_float_ptr, int, np.ndarray]: + """Get pointer of float numpy array / list.""" + if _is_1d_list(data): + data = np.array(data, copy=False) + if _is_numpy_1d_array(data): + data = _convert_from_sliced_object(data) + assert data.flags.c_contiguous + ptr_data: _ctypes_float_ptr + if data.dtype == np.float32: + ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) + type_data = _C_API_DTYPE_FLOAT32 + elif data.dtype == np.float64: + ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) + type_data = _C_API_DTYPE_FLOAT64 + else: + raise TypeError(f"Expected np.float32 or np.float64, met type({data.dtype})") + else: + raise TypeError(f"Unknown 
type({type(data).__name__})") + return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed + + +def _c_int_array( + data: np.ndarray +) -> Tuple[_ctypes_int_ptr, int, np.ndarray]: + """Get pointer of int numpy array / list.""" + if _is_1d_list(data): + data = np.array(data, copy=False) + if _is_numpy_1d_array(data): + data = _convert_from_sliced_object(data) + assert data.flags.c_contiguous + ptr_data: _ctypes_int_ptr + if data.dtype == np.int32: + ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) + type_data = _C_API_DTYPE_INT32 + elif data.dtype == np.int64: + ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + type_data = _C_API_DTYPE_INT64 + else: + raise TypeError(f"Expected np.int32 or np.int64, met type({data.dtype})") + else: + raise TypeError(f"Unknown type({type(data).__name__})") + return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed + + +def _is_allowed_numpy_dtype(dtype: type) -> bool: + float128 = getattr(np, 'float128', type(None)) + return ( + issubclass(dtype, (np.integer, np.floating, np.bool_)) + and not issubclass(dtype, (np.timedelta64, float128)) + ) + + +def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: + bad_pandas_dtypes = [ + f'{column_name}: {pandas_dtype}' + for column_name, pandas_dtype in pandas_dtypes_series.items() + if not _is_allowed_numpy_dtype(pandas_dtype.type) + ] + if bad_pandas_dtypes: + raise ValueError('pandas dtypes must be int, float or bool.\n' + f'Fields with bad pandas dtypes: {", ".join(bad_pandas_dtypes)}') + + +def _pandas_to_numpy( + data: pd_DataFrame, + target_dtype: "np.typing.DTypeLike" +) -> np.ndarray: + _check_for_bad_pandas_dtypes(data.dtypes) + try: + # most common case (no nullable dtypes) + return data.to_numpy(dtype=target_dtype, copy=False) + except TypeError: + # 1.0 <= pd version < 1.1 and nullable dtypes, least common case + # raises error because array is casted to type(pd.NA) and there's no na_value argument + return data.astype(target_dtype, copy=False).values + except ValueError: + # data has nullable dtypes, but we can specify na_value argument and copy will be made + return data.to_numpy(dtype=target_dtype, na_value=np.nan) + + +def _data_from_pandas( + data: pd_DataFrame, + feature_name: _LGBM_FeatureNameConfiguration, + categorical_feature: _LGBM_CategoricalFeatureConfiguration, + pandas_categorical: Optional[List[List]] +) -> Tuple[np.ndarray, List[str], Union[List[str], List[int]], List[List]]: + if len(data.shape) != 2 or data.shape[0] < 1: + raise ValueError('Input data must be 2 dimensional and non empty.') + + # take shallow copy in case we modify categorical columns + # whole column modifications don't change the original df + data = data.copy(deep=False) + + # determine feature names + if feature_name == 'auto': + feature_name = [str(col) for col in data.columns] + + # determine categorical features + cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] + cat_cols_not_ordered: List[str] = [col for col in cat_cols if not data[col].cat.ordered] + if pandas_categorical is None: # train dataset + pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] + else: + if len(cat_cols) != len(pandas_categorical): + raise ValueError('train and valid dataset categorical_feature do not match.') + for col, category in zip(cat_cols, pandas_categorical): + if list(data[col].cat.categories) != list(category): + data[col] = 
data[col].cat.set_categories(category) + if len(cat_cols): # cat_cols is list + data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) + + # use cat cols from DataFrame + if categorical_feature == 'auto': + categorical_feature = cat_cols_not_ordered + + df_dtypes = [dtype.type for dtype in data.dtypes] + # so that the target dtype considers floats + df_dtypes.append(np.float32) + target_dtype = np.result_type(*df_dtypes) + + return ( + _pandas_to_numpy(data, target_dtype=target_dtype), + feature_name, + categorical_feature, + pandas_categorical + ) + + +def _dump_pandas_categorical( + pandas_categorical: Optional[List[List]], + file_name: Optional[Union[str, Path]] = None +) -> str: + categorical_json = json.dumps(pandas_categorical, default=_json_default_with_numpy) + pandas_str = f'\npandas_categorical:{categorical_json}\n' + if file_name is not None: + with open(file_name, 'a') as f: + f.write(pandas_str) + return pandas_str + + +def _load_pandas_categorical( + file_name: Optional[Union[str, Path]] = None, + model_str: Optional[str] = None +) -> Optional[List[List]]: + pandas_key = 'pandas_categorical:' + offset = -len(pandas_key) + if file_name is not None: + max_offset = -getsize(file_name) + with open(file_name, 'rb') as f: + while True: + if offset < max_offset: + offset = max_offset + f.seek(offset, SEEK_END) + lines = f.readlines() + if len(lines) >= 2: + break + offset *= 2 + last_line = lines[-1].decode('utf-8').strip() + if not last_line.startswith(pandas_key): + last_line = lines[-2].decode('utf-8').strip() + elif model_str is not None: + idx = model_str.rfind('\n', 0, offset) + last_line = model_str[idx:].strip() + if last_line.startswith(pandas_key): + return json.loads(last_line[len(pandas_key):]) + else: + return None + + +class Sequence(abc.ABC): + """ + Generic data access interface. + + Object should support the following operations: + + .. code-block:: + + # Get total row number. + >>> len(seq) + # Random access by row index. Used for data sampling. + >>> seq[10] + # Range data access. Used to read data in batch when constructing Dataset. + >>> seq[0:100] + # Optionally specify batch_size to control range data read size. + >>> seq.batch_size + + - With random access, **data sampling does not need to go through all data**. + - With range data access, there's **no need to read all data into memory thus reduce memory usage**. + + .. versionadded:: 3.3.0 + + Attributes + ---------- + batch_size : int + Default size of a batch. + """ + + batch_size = 4096 # Defaults to read 4K rows in each batch. + + @abc.abstractmethod + def __getitem__(self, idx: Union[int, slice, List[int]]) -> np.ndarray: + """Return data for given row index. + + A basic implementation should look like this: + + .. code-block:: python + + if isinstance(idx, numbers.Integral): + return self._get_one_line(idx) + elif isinstance(idx, slice): + return np.stack([self._get_one_line(i) for i in range(idx.start, idx.stop)]) + elif isinstance(idx, list): + # Only required if using ``Dataset.subset()``. + return np.array([self._get_one_line(i) for i in idx]) + else: + raise TypeError(f"Sequence index must be integer, slice or list, got {type(idx).__name__}") + + Parameters + ---------- + idx : int, slice[int], list[int] + Item index. + + Returns + ------- + result : numpy 1-D array or numpy 2-D array + 1-D array if idx is int, 2-D array if idx is slice or list. 
+ """ + raise NotImplementedError("Sub-classes of lightgbm.Sequence must implement __getitem__()") + + @abc.abstractmethod + def __len__(self) -> int: + """Return row count of this sequence.""" + raise NotImplementedError("Sub-classes of lightgbm.Sequence must implement __len__()") + + +class _InnerPredictor: + """_InnerPredictor of LightGBM. + + Not exposed to user. + Used only for prediction, usually used for continued training. + + .. note:: + + Can be converted from Booster, but cannot be converted to Booster. + """ + + def __init__( + self, + booster_handle: _BoosterHandle, + pandas_categorical: Optional[List[List]], + pred_parameter: Dict[str, Any], + manage_handle: bool + ): + """Initialize the _InnerPredictor. + + Parameters + ---------- + booster_handle : object + Handle of Booster. + pandas_categorical : list of list, or None + If provided, list of categories for ``pandas`` categorical columns. + Where the ``i``th element of the list contains the categories for the ``i``th categorical feature. + pred_parameter : dict + Other parameters for the prediction. + manage_handle : bool + If ``True``, free the corresponding Booster on the C++ side when this Python object is deleted. + """ + self._handle = booster_handle + self.__is_manage_handle = manage_handle + self.pandas_categorical = pandas_categorical + self.pred_parameter = _param_dict_to_str(pred_parameter) + + out_num_class = ctypes.c_int(0) + _safe_call( + _LIB.LGBM_BoosterGetNumClasses( + self._handle, + ctypes.byref(out_num_class) + ) + ) + self.num_class = out_num_class.value + + @classmethod + def from_booster( + cls, + booster: "Booster", + pred_parameter: Dict[str, Any] + ) -> "_InnerPredictor": + """Initialize an ``_InnerPredictor`` from a ``Booster``. + + Parameters + ---------- + booster : Booster + Booster. + pred_parameter : dict + Other parameters for the prediction. + """ + out_cur_iter = ctypes.c_int(0) + _safe_call( + _LIB.LGBM_BoosterGetCurrentIteration( + booster._handle, + ctypes.byref(out_cur_iter) + ) + ) + return cls( + booster_handle=booster._handle, + pandas_categorical=booster.pandas_categorical, + pred_parameter=pred_parameter, + manage_handle=False + ) + + @classmethod + def from_model_file( + cls, + model_file: Union[str, Path], + pred_parameter: Dict[str, Any] + ) -> "_InnerPredictor": + """Initialize an ``_InnerPredictor`` from a text file containing a LightGBM model. + + Parameters + ---------- + model_file : str or pathlib.Path + Path to the model file. + pred_parameter : dict + Other parameters for the prediction. 
+ """ + booster_handle = ctypes.c_void_p() + out_num_iterations = ctypes.c_int(0) + _safe_call( + _LIB.LGBM_BoosterCreateFromModelfile( + _c_str(str(model_file)), + ctypes.byref(out_num_iterations), + ctypes.byref(booster_handle) + ) + ) + return cls( + booster_handle=booster_handle, + pandas_categorical=_load_pandas_categorical(file_name=model_file), + pred_parameter=pred_parameter, + manage_handle=True + ) + + def __del__(self) -> None: + try: + if self.__is_manage_handle: + _safe_call(_LIB.LGBM_BoosterFree(self._handle)) + except AttributeError: + pass + + def __getstate__(self) -> Dict[str, Any]: + this = self.__dict__.copy() + this.pop('handle', None) + this.pop('_handle', None) + return this + + def predict( + self, + data: _LGBM_PredictDataType, + start_iteration: int = 0, + num_iteration: int = -1, + raw_score: bool = False, + pred_leaf: bool = False, + pred_contrib: bool = False, + data_has_header: bool = False, + validate_features: bool = False + ) -> Union[np.ndarray, scipy.sparse.spmatrix, List[scipy.sparse.spmatrix]]: + """Predict logic. + + Parameters + ---------- + data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse + Data source for prediction. + If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). + start_iteration : int, optional (default=0) + Start index of the iteration to predict. + num_iteration : int, optional (default=-1) + Iteration used for prediction. + raw_score : bool, optional (default=False) + Whether to predict raw scores. + pred_leaf : bool, optional (default=False) + Whether to predict leaf index. + pred_contrib : bool, optional (default=False) + Whether to predict feature contributions. + data_has_header : bool, optional (default=False) + Whether data has header. + Used only for txt data. + validate_features : bool, optional (default=False) + If True, ensure that the features used to predict match the ones used to train. + Used only if data is pandas DataFrame. + + .. versionadded:: 4.0.0 + + Returns + ------- + result : numpy array, scipy.sparse or list of scipy.sparse + Prediction result. + Can be sparse or a list of sparse objects (each element represents predictions for one class) for feature contributions (when ``pred_contrib=True``). 
+ """ + if isinstance(data, Dataset): + raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead") + elif isinstance(data, pd_DataFrame) and validate_features: + data_names = [str(x) for x in data.columns] + ptr_names = (ctypes.c_char_p * len(data_names))() + ptr_names[:] = [x.encode('utf-8') for x in data_names] + _safe_call( + _LIB.LGBM_BoosterValidateFeatureNames( + self._handle, + ptr_names, + ctypes.c_int(len(data_names)), + ) + ) + + if isinstance(data, pd_DataFrame): + data = _data_from_pandas( + data=data, + feature_name="auto", + categorical_feature="auto", + pandas_categorical=self.pandas_categorical + )[0] + + predict_type = _C_API_PREDICT_NORMAL + if raw_score: + predict_type = _C_API_PREDICT_RAW_SCORE + if pred_leaf: + predict_type = _C_API_PREDICT_LEAF_INDEX + if pred_contrib: + predict_type = _C_API_PREDICT_CONTRIB + + if isinstance(data, (str, Path)): + with _TempFile() as f: + _safe_call(_LIB.LGBM_BoosterPredictForFile( + self._handle, + _c_str(str(data)), + ctypes.c_int(data_has_header), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + _c_str(f.name))) + preds = np.loadtxt(f.name, dtype=np.float64) + nrow = preds.shape[0] + elif isinstance(data, scipy.sparse.csr_matrix): + preds, nrow = self.__pred_for_csr( + csr=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + elif isinstance(data, scipy.sparse.csc_matrix): + preds, nrow = self.__pred_for_csc( + csc=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + elif isinstance(data, np.ndarray): + preds, nrow = self.__pred_for_np2d( + mat=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + elif _is_pyarrow_table(data): + preds, nrow = self.__pred_for_pyarrow_table( + table=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + elif isinstance(data, list): + try: + data = np.array(data) + except BaseException as err: + raise ValueError('Cannot convert data list to numpy array.') from err + preds, nrow = self.__pred_for_np2d( + mat=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + elif isinstance(data, dt_DataTable): + preds, nrow = self.__pred_for_np2d( + mat=data.to_numpy(), + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + else: + try: + _log_warning('Converting data to scipy sparse matrix.') + csr = scipy.sparse.csr_matrix(data) + except BaseException as err: + raise TypeError(f'Cannot predict data for type {type(data).__name__}') from err + preds, nrow = self.__pred_for_csr( + csr=csr, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + if pred_leaf: + preds = preds.astype(np.int32) + is_sparse = isinstance(preds, scipy.sparse.spmatrix) or isinstance(preds, list) + if not is_sparse and preds.size != nrow: + if preds.size % nrow == 0: + preds = preds.reshape(nrow, -1) + else: + raise ValueError(f'Length of predict result ({preds.size}) cannot be divide nrow ({nrow})') + return preds + + def __get_num_preds( + self, + start_iteration: int, + num_iteration: int, + nrow: int, + predict_type: int + ) -> int: + """Get size of prediction result.""" + if nrow > _MAX_INT32: + raise LightGBMError('LightGBM cannot perform prediction for data ' + f'with number of rows 
greater than MAX_INT32 ({_MAX_INT32}).\n' + 'You can split your data into chunks ' + 'and then concatenate predictions for them') + n_preds = ctypes.c_int64(0) + _safe_call(_LIB.LGBM_BoosterCalcNumPredict( + self._handle, + ctypes.c_int(nrow), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + ctypes.byref(n_preds))) + return n_preds.value + + def __inner_predict_np2d( + self, + mat: np.ndarray, + start_iteration: int, + num_iteration: int, + predict_type: int, + preds: Optional[np.ndarray] + ) -> Tuple[np.ndarray, int]: + if mat.dtype == np.float32 or mat.dtype == np.float64: + data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False) + else: # change non-float data to float data, need to copy + data = np.array(mat.reshape(mat.size), dtype=np.float32) + ptr_data, type_ptr_data, _ = _c_float_array(data) + n_preds = self.__get_num_preds( + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=mat.shape[0], + predict_type=predict_type + ) + if preds is None: + preds = np.empty(n_preds, dtype=np.float64) + elif len(preds.shape) != 1 or len(preds) != n_preds: + raise ValueError("Wrong length of pre-allocated predict array") + out_num_preds = ctypes.c_int64(0) + _safe_call(_LIB.LGBM_BoosterPredictForMat( + self._handle, + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int32(mat.shape[0]), + ctypes.c_int32(mat.shape[1]), + ctypes.c_int(_C_API_IS_ROW_MAJOR), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if n_preds != out_num_preds.value: + raise ValueError("Wrong length for predict results") + return preds, mat.shape[0] + + def __pred_for_np2d( + self, + mat: np.ndarray, + start_iteration: int, + num_iteration: int, + predict_type: int + ) -> Tuple[np.ndarray, int]: + """Predict for a 2-D numpy matrix.""" + if len(mat.shape) != 2: + raise ValueError('Input numpy.ndarray or list must be 2 dimensional') + + nrow = mat.shape[0] + if nrow > _MAX_INT32: + sections = np.arange(start=_MAX_INT32, stop=nrow, step=_MAX_INT32) + # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal + n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])] + n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() + preds = np.empty(sum(n_preds), dtype=np.float64) + for chunk, (start_idx_pred, end_idx_pred) in zip(np.array_split(mat, sections), + zip(n_preds_sections, n_preds_sections[1:])): + # avoid memory consumption by arrays concatenation operations + self.__inner_predict_np2d( + mat=chunk, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, + preds=preds[start_idx_pred:end_idx_pred] + ) + return preds, nrow + else: + return self.__inner_predict_np2d( + mat=mat, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, + preds=None + ) + + def __create_sparse_native( + self, + cs: Union[scipy.sparse.csc_matrix, scipy.sparse.csr_matrix], + out_shape: np.ndarray, + out_ptr_indptr: "ctypes._Pointer", + out_ptr_indices: "ctypes._Pointer", + out_ptr_data: "ctypes._Pointer", + indptr_type: int, + data_type: int, + is_csr: bool + ) -> Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]]: + # create numpy array from output arrays + data_indices_len = 
out_shape[0] + indptr_len = out_shape[1] + if indptr_type == _C_API_DTYPE_INT32: + out_indptr = _cint32_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len) + elif indptr_type == _C_API_DTYPE_INT64: + out_indptr = _cint64_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len) + else: + raise TypeError("Expected int32 or int64 type for indptr") + if data_type == _C_API_DTYPE_FLOAT32: + out_data = _cfloat32_array_to_numpy(cptr=out_ptr_data, length=data_indices_len) + elif data_type == _C_API_DTYPE_FLOAT64: + out_data = _cfloat64_array_to_numpy(cptr=out_ptr_data, length=data_indices_len) + else: + raise TypeError("Expected float32 or float64 type for data") + out_indices = _cint32_array_to_numpy(cptr=out_ptr_indices, length=data_indices_len) + # break up indptr based on number of rows (note more than one matrix in multiclass case) + per_class_indptr_shape = cs.indptr.shape[0] + # for CSC there is extra column added + if not is_csr: + per_class_indptr_shape += 1 + out_indptr_arrays = np.split(out_indptr, out_indptr.shape[0] / per_class_indptr_shape) + # reformat output into a csr or csc matrix or list of csr or csc matrices + cs_output_matrices = [] + offset = 0 + for cs_indptr in out_indptr_arrays: + matrix_indptr_len = cs_indptr[cs_indptr.shape[0] - 1] + cs_indices = out_indices[offset + cs_indptr[0]:offset + matrix_indptr_len] + cs_data = out_data[offset + cs_indptr[0]:offset + matrix_indptr_len] + offset += matrix_indptr_len + # same shape as input csr or csc matrix except extra column for expected value + cs_shape = [cs.shape[0], cs.shape[1] + 1] + # note: make sure we copy data as it will be deallocated next + if is_csr: + cs_output_matrices.append(scipy.sparse.csr_matrix((cs_data, cs_indices, cs_indptr), cs_shape)) + else: + cs_output_matrices.append(scipy.sparse.csc_matrix((cs_data, cs_indices, cs_indptr), cs_shape)) + # free the temporary native indptr, indices, and data + _safe_call(_LIB.LGBM_BoosterFreePredictSparse(out_ptr_indptr, out_ptr_indices, out_ptr_data, + ctypes.c_int(indptr_type), ctypes.c_int(data_type))) + if len(cs_output_matrices) == 1: + return cs_output_matrices[0] + return cs_output_matrices + + def __inner_predict_csr( + self, + csr: scipy.sparse.csr_matrix, + start_iteration: int, + num_iteration: int, + predict_type: int, + preds: Optional[np.ndarray] + ) -> Tuple[np.ndarray, int]: + nrow = len(csr.indptr) - 1 + n_preds = self.__get_num_preds( + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=nrow, + predict_type=predict_type + ) + if preds is None: + preds = np.empty(n_preds, dtype=np.float64) + elif len(preds.shape) != 1 or len(preds) != n_preds: + raise ValueError("Wrong length of pre-allocated predict array") + out_num_preds = ctypes.c_int64(0) + + ptr_indptr, type_ptr_indptr, _ = _c_int_array(csr.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csr.data) + + assert csr.shape[1] <= _MAX_INT32 + csr_indices = csr.indices.astype(np.int32, copy=False) + + _safe_call(_LIB.LGBM_BoosterPredictForCSR( + self._handle, + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csr.indptr)), + ctypes.c_int64(len(csr.data)), + ctypes.c_int64(csr.shape[1]), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if n_preds != out_num_preds.value: + raise ValueError("Wrong 
length for predict results") + return preds, nrow + + def __inner_predict_csr_sparse( + self, + csr: scipy.sparse.csr_matrix, + start_iteration: int, + num_iteration: int, + predict_type: int + ) -> Tuple[Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]], int]: + ptr_indptr, type_ptr_indptr, __ = _c_int_array(csr.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csr.data) + csr_indices = csr.indices.astype(np.int32, copy=False) + matrix_type = _C_API_MATRIX_TYPE_CSR + out_ptr_indptr: _ctypes_int_ptr + if type_ptr_indptr == _C_API_DTYPE_INT32: + out_ptr_indptr = ctypes.POINTER(ctypes.c_int32)() + else: + out_ptr_indptr = ctypes.POINTER(ctypes.c_int64)() + out_ptr_indices = ctypes.POINTER(ctypes.c_int32)() + out_ptr_data: _ctypes_float_ptr + if type_ptr_data == _C_API_DTYPE_FLOAT32: + out_ptr_data = ctypes.POINTER(ctypes.c_float)() + else: + out_ptr_data = ctypes.POINTER(ctypes.c_double)() + out_shape = np.empty(2, dtype=np.int64) + _safe_call(_LIB.LGBM_BoosterPredictSparseOutput( + self._handle, + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csr.indptr)), + ctypes.c_int64(len(csr.data)), + ctypes.c_int64(csr.shape[1]), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.c_int(matrix_type), + out_shape.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), + ctypes.byref(out_ptr_indptr), + ctypes.byref(out_ptr_indices), + ctypes.byref(out_ptr_data))) + matrices = self.__create_sparse_native( + cs=csr, + out_shape=out_shape, + out_ptr_indptr=out_ptr_indptr, + out_ptr_indices=out_ptr_indices, + out_ptr_data=out_ptr_data, + indptr_type=type_ptr_indptr, + data_type=type_ptr_data, + is_csr=True + ) + nrow = len(csr.indptr) - 1 + return matrices, nrow + + def __pred_for_csr( + self, + csr: scipy.sparse.csr_matrix, + start_iteration: int, + num_iteration: int, + predict_type: int + ) -> Tuple[np.ndarray, int]: + """Predict for a CSR data.""" + if predict_type == _C_API_PREDICT_CONTRIB: + return self.__inner_predict_csr_sparse( + csr=csr, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + nrow = len(csr.indptr) - 1 + if nrow > _MAX_INT32: + sections = [0] + list(np.arange(start=_MAX_INT32, stop=nrow, step=_MAX_INT32)) + [nrow] + # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal + n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)] + n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() + preds = np.empty(sum(n_preds), dtype=np.float64) + for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip(zip(sections, sections[1:]), + zip(n_preds_sections, n_preds_sections[1:])): + # avoid memory consumption by arrays concatenation operations + self.__inner_predict_csr( + csr=csr[start_idx:end_idx], + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, + preds=preds[start_idx_pred:end_idx_pred] + ) + return preds, nrow + else: + return self.__inner_predict_csr( + csr=csr, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, + preds=None + ) + + def __inner_predict_sparse_csc( + self, + csc: scipy.sparse.csc_matrix, + start_iteration: int, + num_iteration: int, + predict_type: int + ): + ptr_indptr, type_ptr_indptr, __ = 
_c_int_array(csc.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csc.data) + csc_indices = csc.indices.astype(np.int32, copy=False) + matrix_type = _C_API_MATRIX_TYPE_CSC + out_ptr_indptr: _ctypes_int_ptr + if type_ptr_indptr == _C_API_DTYPE_INT32: + out_ptr_indptr = ctypes.POINTER(ctypes.c_int32)() + else: + out_ptr_indptr = ctypes.POINTER(ctypes.c_int64)() + out_ptr_indices = ctypes.POINTER(ctypes.c_int32)() + out_ptr_data: _ctypes_float_ptr + if type_ptr_data == _C_API_DTYPE_FLOAT32: + out_ptr_data = ctypes.POINTER(ctypes.c_float)() + else: + out_ptr_data = ctypes.POINTER(ctypes.c_double)() + out_shape = np.empty(2, dtype=np.int64) + _safe_call(_LIB.LGBM_BoosterPredictSparseOutput( + self._handle, + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csc.indptr)), + ctypes.c_int64(len(csc.data)), + ctypes.c_int64(csc.shape[0]), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.c_int(matrix_type), + out_shape.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), + ctypes.byref(out_ptr_indptr), + ctypes.byref(out_ptr_indices), + ctypes.byref(out_ptr_data))) + matrices = self.__create_sparse_native( + cs=csc, + out_shape=out_shape, + out_ptr_indptr=out_ptr_indptr, + out_ptr_indices=out_ptr_indices, + out_ptr_data=out_ptr_data, + indptr_type=type_ptr_indptr, + data_type=type_ptr_data, + is_csr=False + ) + nrow = csc.shape[0] + return matrices, nrow + + def __pred_for_csc( + self, + csc: scipy.sparse.csc_matrix, + start_iteration: int, + num_iteration: int, + predict_type: int + ) -> Tuple[np.ndarray, int]: + """Predict for a CSC data.""" + nrow = csc.shape[0] + if nrow > _MAX_INT32: + return self.__pred_for_csr( + csr=csc.tocsr(), + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + if predict_type == _C_API_PREDICT_CONTRIB: + return self.__inner_predict_sparse_csc( + csc=csc, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) + n_preds = self.__get_num_preds( + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=nrow, + predict_type=predict_type + ) + preds = np.empty(n_preds, dtype=np.float64) + out_num_preds = ctypes.c_int64(0) + + ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csc.data) + + assert csc.shape[0] <= _MAX_INT32 + csc_indices = csc.indices.astype(np.int32, copy=False) + + _safe_call(_LIB.LGBM_BoosterPredictForCSC( + self._handle, + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csc.indptr)), + ctypes.c_int64(len(csc.data)), + ctypes.c_int64(csc.shape[0]), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if n_preds != out_num_preds.value: + raise ValueError("Wrong length for predict results") + return preds, nrow + + def __pred_for_pyarrow_table( + self, + table: pa_Table, + start_iteration: int, + num_iteration: int, + predict_type: int + ) -> Tuple[np.ndarray, int]: + """Predict for a PyArrow table.""" + if not PYARROW_INSTALLED: + raise LightGBMError("Cannot predict from Arrow without `pyarrow` installed.") + + # Check 
that the input is valid: we only handle numbers (for now) + if not all(arrow_is_integer(t) or arrow_is_floating(t) for t in table.schema.types): + raise ValueError("Arrow table may only have integer or floating point datatypes") + + # Prepare prediction output array + n_preds = self.__get_num_preds( + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=table.num_rows, + predict_type=predict_type + ) + preds = np.empty(n_preds, dtype=np.float64) + out_num_preds = ctypes.c_int64(0) + + # Export Arrow table to C and run prediction + c_array = _export_arrow_to_c(table) + _safe_call(_LIB.LGBM_BoosterPredictForArrow( + self._handle, + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if n_preds != out_num_preds.value: + raise ValueError("Wrong length for predict results") + return preds, table.num_rows + + def current_iteration(self) -> int: + """Get the index of the current iteration. + + Returns + ------- + cur_iter : int + The index of the current iteration. + """ + out_cur_iter = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterGetCurrentIteration( + self._handle, + ctypes.byref(out_cur_iter))) + return out_cur_iter.value + + +class Dataset: + """Dataset in LightGBM.""" + + def __init__( + self, + data: _LGBM_TrainDataType, + label: Optional[_LGBM_LabelType] = None, + reference: Optional["Dataset"] = None, + weight: Optional[_LGBM_WeightType] = None, + group: Optional[_LGBM_GroupType] = None, + init_score: Optional[_LGBM_InitScoreType] = None, + feature_name: _LGBM_FeatureNameConfiguration = 'auto', + categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + params: Optional[Dict[str, Any]] = None, + free_raw_data: bool = True, + position: Optional[_LGBM_PositionType] = None, + ): + """Initialize Dataset. + + Parameters + ---------- + data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence, list of numpy array or pyarrow Table + Data source of Dataset. + If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file. + label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) + Label of the data. + reference : Dataset or None, optional (default=None) + If this is Dataset for validation, training data should be used as reference. + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) + Weight for each instance. Weights should be non-negative. + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. 
+ init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) + Init score for Dataset. + feature_name : list of str, or 'auto', optional (default="auto") + Feature names. + If 'auto' and data is pandas DataFrame or pyarrow Table, data columns names are used. + categorical_feature : list of str or int, or 'auto', optional (default="auto") + Categorical features. + If list of int, interpreted as indices. + If list of str, interpreted as feature names (need to specify ``feature_name`` as well). + If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. + All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). + Large values could be memory consuming. Consider using consecutive integers starting from zero. + All negative values in categorical features will be treated as missing values. + The output cannot be monotonically constrained with respect to a categorical feature. + Floating point numbers in categorical features will be rounded towards 0. + params : dict or None, optional (default=None) + Other parameters for Dataset. + free_raw_data : bool, optional (default=True) + If True, raw data is freed after constructing inner Dataset. + position : numpy 1-D array, pandas Series or None, optional (default=None) + Position of items used in unbiased learning-to-rank task. + """ + self._handle: Optional[_DatasetHandle] = None + self.data = data + self.label = label + self.reference = reference + self.weight = weight + self.group = group + self.position = position + self.init_score = init_score + self.feature_name: _LGBM_FeatureNameConfiguration = feature_name + self.categorical_feature: _LGBM_CategoricalFeatureConfiguration = categorical_feature + self.params = deepcopy(params) + self.free_raw_data = free_raw_data + self.used_indices: Optional[List[int]] = None + self._need_slice = True + self._predictor: Optional[_InnerPredictor] = None + self.pandas_categorical: Optional[List[List]] = None + self._params_back_up = None + self.version = 0 + self._start_row = 0 # Used when pushing rows one by one. + + def __del__(self) -> None: + try: + self._free_handle() + except AttributeError: + pass + + def _create_sample_indices(self, total_nrow: int) -> np.ndarray: + """Get an array of randomly chosen indices from this ``Dataset``. + + Indices are sampled without replacement. + + Parameters + ---------- + total_nrow : int + Total number of rows to sample from. + If this value is greater than the value of parameter ``bin_construct_sample_cnt``, only ``bin_construct_sample_cnt`` indices will be used. + If Dataset has multiple input data, this should be the sum of rows of every file. + + Returns + ------- + indices : numpy array + Indices for sampled data. + """ + param_str = _param_dict_to_str(self.get_params()) + sample_cnt = _get_sample_count(total_nrow, param_str) + indices = np.empty(sample_cnt, dtype=np.int32) + ptr_data, _, _ = _c_int_array(indices) + actual_sample_cnt = ctypes.c_int32(0) + + _safe_call(_LIB.LGBM_SampleIndices( + ctypes.c_int32(total_nrow), + _c_str(param_str), + ptr_data, + ctypes.byref(actual_sample_cnt), + )) + assert sample_cnt == actual_sample_cnt.value + return indices + + def _init_from_ref_dataset( + self, + total_nrow: int, + ref_dataset: _DatasetHandle + ) -> 'Dataset': + """Create dataset from a reference dataset. 
+ + Parameters + ---------- + total_nrow : int + Number of rows expected to add to dataset. + ref_dataset : object + Handle of reference dataset to extract metadata from. + + Returns + ------- + self : Dataset + Constructed Dataset object. + """ + self._handle = ctypes.c_void_p() + _safe_call(_LIB.LGBM_DatasetCreateByReference( + ref_dataset, + ctypes.c_int64(total_nrow), + ctypes.byref(self._handle), + )) + return self + + def _init_from_sample( + self, + sample_data: List[np.ndarray], + sample_indices: List[np.ndarray], + sample_cnt: int, + total_nrow: int, + ) -> "Dataset": + """Create Dataset from sampled data structures. + + Parameters + ---------- + sample_data : list of numpy array + Sample data for each column. + sample_indices : list of numpy array + Sample data row index for each column. + sample_cnt : int + Number of samples. + total_nrow : int + Total number of rows for all input files. + + Returns + ------- + self : Dataset + Constructed Dataset object. + """ + ncol = len(sample_indices) + assert len(sample_data) == ncol, "#sample data column != #column indices" + + for i in range(ncol): + if sample_data[i].dtype != np.double: + raise ValueError(f"sample_data[{i}] type {sample_data[i].dtype} is not double") + if sample_indices[i].dtype != np.int32: + raise ValueError(f"sample_indices[{i}] type {sample_indices[i].dtype} is not int32") + + # c type: double** + # each double* element points to start of each column of sample data. + sample_col_ptr: _ctypes_float_array = (ctypes.POINTER(ctypes.c_double) * ncol)() + # c type int** + # each int* points to start of indices for each column + indices_col_ptr: _ctypes_int_array = (ctypes.POINTER(ctypes.c_int32) * ncol)() + for i in range(ncol): + sample_col_ptr[i] = _c_float_array(sample_data[i])[0] + indices_col_ptr[i] = _c_int_array(sample_indices[i])[0] + + num_per_col = np.array([len(d) for d in sample_indices], dtype=np.int32) + num_per_col_ptr, _, _ = _c_int_array(num_per_col) + + self._handle = ctypes.c_void_p() + params_str = _param_dict_to_str(self.get_params()) + _safe_call(_LIB.LGBM_DatasetCreateFromSampledColumn( + ctypes.cast(sample_col_ptr, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))), + ctypes.cast(indices_col_ptr, ctypes.POINTER(ctypes.POINTER(ctypes.c_int32))), + ctypes.c_int32(ncol), + num_per_col_ptr, + ctypes.c_int32(sample_cnt), + ctypes.c_int32(total_nrow), + ctypes.c_int64(total_nrow), + _c_str(params_str), + ctypes.byref(self._handle), + )) + return self + + def _push_rows(self, data: np.ndarray) -> 'Dataset': + """Add rows to Dataset. + + Parameters + ---------- + data : numpy 1-D array + New data to add to the Dataset. + + Returns + ------- + self : Dataset + Dataset object. + """ + nrow, ncol = data.shape + data = data.reshape(data.size) + data_ptr, data_type, _ = _c_float_array(data) + + _safe_call(_LIB.LGBM_DatasetPushRows( + self._handle, + data_ptr, + data_type, + ctypes.c_int32(nrow), + ctypes.c_int32(ncol), + ctypes.c_int32(self._start_row), + )) + self._start_row += nrow + return self + + def get_params(self) -> Dict[str, Any]: + """Get the used parameters in the Dataset. + + Returns + ------- + params : dict + The used parameters in this Dataset object. 
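+
+        A minimal illustration (hypothetical values): constructing
+        ``Dataset(X, params={'max_bin': 63, 'verbose': -1})`` and calling
+        ``get_params()`` returns ``{'max_bin': 63}``, since ``verbose`` is not a
+        Dataset-level parameter and is filtered out.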
+ """ + if self.params is not None: + # no min_data, nthreads and verbose in this function + dataset_params = _ConfigAliases.get("bin_construct_sample_cnt", + "categorical_feature", + "data_random_seed", + "enable_bundle", + "feature_pre_filter", + "forcedbins_filename", + "group_column", + "header", + "ignore_column", + "is_enable_sparse", + "label_column", + "linear_tree", + "max_bin", + "max_bin_by_feature", + "min_data_in_bin", + "pre_partition", + "precise_float_parser", + "two_round", + "use_missing", + "weight_column", + "zero_as_missing") + return {k: v for k, v in self.params.items() if k in dataset_params} + else: + return {} + + def _free_handle(self) -> "Dataset": + if self._handle is not None: + _safe_call(_LIB.LGBM_DatasetFree(self._handle)) + self._handle = None + self._need_slice = True + if self.used_indices is not None: + self.data = None + return self + + def _set_init_score_by_predictor( + self, + predictor: Optional[_InnerPredictor], + data: _LGBM_TrainDataType, + used_indices: Optional[Union[List[int], np.ndarray]] + ) -> "Dataset": + data_has_header = False + if isinstance(data, (str, Path)) and self.params is not None: + # check data has header or not + data_has_header = any(self.params.get(alias, False) for alias in _ConfigAliases.get("header")) + num_data = self.num_data() + if predictor is not None: + init_score: Union[np.ndarray, scipy.sparse.spmatrix] = predictor.predict( + data=data, + raw_score=True, + data_has_header=data_has_header + ) + init_score = init_score.ravel() + if used_indices is not None: + assert not self._need_slice + if isinstance(data, (str, Path)): + sub_init_score = np.empty(num_data * predictor.num_class, dtype=np.float64) + assert num_data == len(used_indices) + for i in range(len(used_indices)): + for j in range(predictor.num_class): + sub_init_score[i * predictor.num_class + j] = init_score[used_indices[i] * predictor.num_class + j] + init_score = sub_init_score + if predictor.num_class > 1: + # need to regroup init_score + new_init_score = np.empty(init_score.size, dtype=np.float64) + for i in range(num_data): + for j in range(predictor.num_class): + new_init_score[j * num_data + i] = init_score[i * predictor.num_class + j] + init_score = new_init_score + elif self.init_score is not None: + init_score = np.full_like(self.init_score, fill_value=0.0, dtype=np.float64) + else: + return self + self.set_init_score(init_score) + return self + + def _lazy_init( + self, + data: Optional[_LGBM_TrainDataType], + label: Optional[_LGBM_LabelType], + reference: Optional["Dataset"], + weight: Optional[_LGBM_WeightType], + group: Optional[_LGBM_GroupType], + init_score: Optional[_LGBM_InitScoreType], + predictor: Optional[_InnerPredictor], + feature_name: _LGBM_FeatureNameConfiguration, + categorical_feature: _LGBM_CategoricalFeatureConfiguration, + params: Optional[Dict[str, Any]], + position: Optional[_LGBM_PositionType] + ) -> "Dataset": + if data is None: + self._handle = None + return self + if reference is not None: + self.pandas_categorical = reference.pandas_categorical + categorical_feature = reference.categorical_feature + if isinstance(data, pd_DataFrame): + data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas( + data=data, + feature_name=feature_name, + categorical_feature=categorical_feature, + pandas_categorical=self.pandas_categorical + ) + + # process for args + params = {} if params is None else params + args_names = inspect.signature(self.__class__._lazy_init).parameters.keys() + for key in 
params.keys(): + if key in args_names: + _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n' + f'Please use {key} argument of the Dataset constructor to pass this parameter.') + # get categorical features + if isinstance(categorical_feature, list): + categorical_indices = set() + feature_dict = {} + if isinstance(feature_name, list): + feature_dict = {name: i for i, name in enumerate(feature_name)} + for name in categorical_feature: + if isinstance(name, str) and name in feature_dict: + categorical_indices.add(feature_dict[name]) + elif isinstance(name, int): + categorical_indices.add(name) + else: + raise TypeError(f"Wrong type({type(name).__name__}) or unknown name({name}) in categorical_feature") + if categorical_indices: + for cat_alias in _ConfigAliases.get("categorical_feature"): + if cat_alias in params: + # If the params[cat_alias] is equal to categorical_indices, do not report the warning. + if not (isinstance(params[cat_alias], list) and set(params[cat_alias]) == categorical_indices): + _log_warning(f'{cat_alias} in param dict is overridden.') + params.pop(cat_alias, None) + params['categorical_column'] = sorted(categorical_indices) + + params_str = _param_dict_to_str(params) + self.params = params + # process for reference dataset + ref_dataset = None + if isinstance(reference, Dataset): + ref_dataset = reference.construct()._handle + elif reference is not None: + raise TypeError('Reference dataset should be None or dataset instance') + # start construct data + if isinstance(data, (str, Path)): + self._handle = ctypes.c_void_p() + _safe_call(_LIB.LGBM_DatasetCreateFromFile( + _c_str(str(data)), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle))) + elif isinstance(data, scipy.sparse.csr_matrix): + self.__init_from_csr(data, params_str, ref_dataset) + elif isinstance(data, scipy.sparse.csc_matrix): + self.__init_from_csc(data, params_str, ref_dataset) + elif isinstance(data, np.ndarray): + self.__init_from_np2d(data, params_str, ref_dataset) + elif _is_pyarrow_table(data): + self.__init_from_pyarrow_table(data, params_str, ref_dataset) + feature_name = data.column_names + elif isinstance(data, list) and len(data) > 0: + if _is_list_of_numpy_arrays(data): + self.__init_from_list_np2d(data, params_str, ref_dataset) + elif _is_list_of_sequences(data): + self.__init_from_seqs(data, ref_dataset) + else: + raise TypeError('Data list can only be of ndarray or Sequence') + elif isinstance(data, Sequence): + self.__init_from_seqs([data], ref_dataset) + elif isinstance(data, dt_DataTable): + self.__init_from_np2d(data.to_numpy(), params_str, ref_dataset) + else: + try: + csr = scipy.sparse.csr_matrix(data) + self.__init_from_csr(csr, params_str, ref_dataset) + except BaseException as err: + raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}') from err + if label is not None: + self.set_label(label) + if self.get_label() is None: + raise ValueError("Label should not be None") + if weight is not None: + self.set_weight(weight) + if group is not None: + self.set_group(group) + if position is not None: + self.set_position(position) + if isinstance(predictor, _InnerPredictor): + if self._predictor is None and init_score is not None: + _log_warning("The init_score will be overridden by the prediction of init_model.") + self._set_init_score_by_predictor( + predictor=predictor, + data=data, + used_indices=None + ) + elif init_score is not None: + self.set_init_score(init_score) + elif predictor is not None: + raise TypeError(f'Wrong 
predictor type {type(predictor).__name__}')
+        # set feature names
+        return self.set_feature_name(feature_name)
+
+    @staticmethod
+    def _yield_row_from_seqlist(seqs: List[Sequence], indices: Iterable[int]):
+        offset = 0
+        seq_id = 0
+        seq = seqs[seq_id]
+        for row_id in indices:
+            assert row_id >= offset, "sample indices are expected to be monotonic"
+            while row_id >= offset + len(seq):
+                offset += len(seq)
+                seq_id += 1
+                seq = seqs[seq_id]
+            id_in_seq = row_id - offset
+            row = seq[id_in_seq]
+            yield row if row.flags['OWNDATA'] else row.copy()
+
+    def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarray], List[np.ndarray]]:
+        """Sample data from seqs.
+
+        Mimics behavior in c_api.cpp:LGBM_DatasetCreateFromMats()
+
+        Returns
+        -------
+        sampled_rows, sampled_row_indices
+        """
+        indices = self._create_sample_indices(total_nrow)
+
+        # Select sampled rows, transpose to column order.
+        sampled = np.array(list(self._yield_row_from_seqlist(seqs, indices)))
+        sampled = sampled.T
+
+        filtered = []
+        filtered_idx = []
+        sampled_row_range = np.arange(len(indices), dtype=np.int32)
+        for col in sampled:
+            col_predicate = (np.abs(col) > ZERO_THRESHOLD) | np.isnan(col)
+            filtered_col = col[col_predicate]
+            filtered_row_idx = sampled_row_range[col_predicate]
+
+            filtered.append(filtered_col)
+            filtered_idx.append(filtered_row_idx)
+
+        return filtered, filtered_idx
+
+    def __init_from_seqs(
+        self,
+        seqs: List[Sequence],
+        ref_dataset: Optional[_DatasetHandle]
+    ) -> "Dataset":
+        """
+        Initialize data from a list of Sequence objects.
+
+        Sequence: Generic Data Access Object
+        Supports random access and access by batch if properly defined by the user
+
+        Data scheme uniformity is trusted, not checked
+        """
+        total_nrow = sum(len(seq) for seq in seqs)
+
+        # create validation dataset from ref_dataset
+        if ref_dataset is not None:
+            self._init_from_ref_dataset(total_nrow, ref_dataset)
+        else:
+            param_str = _param_dict_to_str(self.get_params())
+            sample_cnt = _get_sample_count(total_nrow, param_str)
+
+            sample_data, col_indices = self.__sample(seqs, total_nrow)
+            self._init_from_sample(sample_data, col_indices, sample_cnt, total_nrow)
+
+        for seq in seqs:
+            nrow = len(seq)
+            batch_size = getattr(seq, 'batch_size', None) or Sequence.batch_size
+            for start in range(0, nrow, batch_size):
+                end = min(start + batch_size, nrow)
+                self._push_rows(seq[start:end])
+        return self
+
+    def __init_from_np2d(
+        self,
+        mat: np.ndarray,
+        params_str: str,
+        ref_dataset: Optional[_DatasetHandle]
+    ) -> "Dataset":
+        """Initialize data from a 2-D numpy matrix."""
+        if len(mat.shape) != 2:
+            raise ValueError('Input numpy.ndarray must be 2 dimensional')
+
+        self._handle = ctypes.c_void_p()
+        if mat.dtype == np.float32 or mat.dtype == np.float64:
+            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
+        else:  # change non-float data to float data, need to copy
+            data = np.array(mat.reshape(mat.size), dtype=np.float32)
+
+        ptr_data, type_ptr_data, _ = _c_float_array(data)
+        _safe_call(_LIB.LGBM_DatasetCreateFromMat(
+            ptr_data,
+            ctypes.c_int(type_ptr_data),
+            ctypes.c_int32(mat.shape[0]),
+            ctypes.c_int32(mat.shape[1]),
+            ctypes.c_int(_C_API_IS_ROW_MAJOR),
+            _c_str(params_str),
+            ref_dataset,
+            ctypes.byref(self._handle)))
+        return self
+
+    def __init_from_list_np2d(
+        self,
+        mats: List[np.ndarray],
+        params_str: str,
+        ref_dataset: Optional[_DatasetHandle]
+    ) -> "Dataset":
+        """Initialize data from a list of 2-D numpy matrices."""
+        ncol = mats[0].shape[1]
+        nrow = np.empty((len(mats),),
np.int32) + ptr_data: _ctypes_float_array + if mats[0].dtype == np.float64: + ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))() + else: + ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))() + + holders = [] + type_ptr_data = -1 + + for i, mat in enumerate(mats): + if len(mat.shape) != 2: + raise ValueError('Input numpy.ndarray must be 2 dimensional') + + if mat.shape[1] != ncol: + raise ValueError('Input arrays must have same number of columns') + + nrow[i] = mat.shape[0] + + if mat.dtype == np.float32 or mat.dtype == np.float64: + mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False) + else: # change non-float data to float data, need to copy + mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32) + + chunk_ptr_data, chunk_type_ptr_data, holder = _c_float_array(mats[i]) + if type_ptr_data != -1 and chunk_type_ptr_data != type_ptr_data: + raise ValueError('Input chunks must have same type') + ptr_data[i] = chunk_ptr_data + type_ptr_data = chunk_type_ptr_data + holders.append(holder) + + self._handle = ctypes.c_void_p() + _safe_call(_LIB.LGBM_DatasetCreateFromMats( + ctypes.c_int32(len(mats)), + ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))), + ctypes.c_int(type_ptr_data), + nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ctypes.c_int32(ncol), + ctypes.c_int(_C_API_IS_ROW_MAJOR), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle))) + return self + + def __init_from_csr( + self, + csr: scipy.sparse.csr_matrix, + params_str: str, + ref_dataset: Optional[_DatasetHandle] + ) -> "Dataset": + """Initialize data from a CSR matrix.""" + if len(csr.indices) != len(csr.data): + raise ValueError(f'Length mismatch: {len(csr.indices)} vs {len(csr.data)}') + self._handle = ctypes.c_void_p() + + ptr_indptr, type_ptr_indptr, __ = _c_int_array(csr.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csr.data) + + assert csr.shape[1] <= _MAX_INT32 + csr_indices = csr.indices.astype(np.int32, copy=False) + + _safe_call(_LIB.LGBM_DatasetCreateFromCSR( + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csr.indptr)), + ctypes.c_int64(len(csr.data)), + ctypes.c_int64(csr.shape[1]), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle))) + return self + + def __init_from_csc( + self, + csc: scipy.sparse.csc_matrix, + params_str: str, + ref_dataset: Optional[_DatasetHandle] + ) -> "Dataset": + """Initialize data from a CSC matrix.""" + if len(csc.indices) != len(csc.data): + raise ValueError(f'Length mismatch: {len(csc.indices)} vs {len(csc.data)}') + self._handle = ctypes.c_void_p() + + ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr) + ptr_data, type_ptr_data, _ = _c_float_array(csc.data) + + assert csc.shape[0] <= _MAX_INT32 + csc_indices = csc.indices.astype(np.int32, copy=False) + + _safe_call(_LIB.LGBM_DatasetCreateFromCSC( + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csc.indptr)), + ctypes.c_int64(len(csc.data)), + ctypes.c_int64(csc.shape[0]), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle))) + return self + + def __init_from_pyarrow_table( + self, + table: pa_Table, + params_str: str, + ref_dataset: Optional[_DatasetHandle] + ) -> "Dataset": + """Initialize data from a PyArrow table.""" + if not PYARROW_INSTALLED: + raise 
LightGBMError("Cannot init dataframe from Arrow without `pyarrow` installed.") + + # Check that the input is valid: we only handle numbers (for now) + if not all(arrow_is_integer(t) or arrow_is_floating(t) for t in table.schema.types): + raise ValueError("Arrow table may only have integer or floating point datatypes") + + # Export Arrow table to C + c_array = _export_arrow_to_c(table) + self._handle = ctypes.c_void_p() + _safe_call(_LIB.LGBM_DatasetCreateFromArrow( + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle))) + return self + + @staticmethod + def _compare_params_for_warning( + params: Dict[str, Any], + other_params: Dict[str, Any], + ignore_keys: Set[str] + ) -> bool: + """Compare two dictionaries with params ignoring some keys. + + It is only for the warning purpose. + + Parameters + ---------- + params : dict + One dictionary with parameters to compare. + other_params : dict + Another dictionary with parameters to compare. + ignore_keys : set + Keys that should be ignored during comparing two dictionaries. + + Returns + ------- + compare_result : bool + Returns whether two dictionaries with params are equal. + """ + for k in other_params: + if k not in ignore_keys: + if k not in params or params[k] != other_params[k]: + return False + for k in params: + if k not in ignore_keys: + if k not in other_params or params[k] != other_params[k]: + return False + return True + + def construct(self) -> "Dataset": + """Lazy init. + + Returns + ------- + self : Dataset + Constructed Dataset object. + """ + if self._handle is None: + if self.reference is not None: + reference_params = self.reference.get_params() + params = self.get_params() + if params != reference_params: + if not self._compare_params_for_warning( + params=params, + other_params=reference_params, + ignore_keys=_ConfigAliases.get("categorical_feature") + ): + _log_warning('Overriding the parameters from Reference Dataset.') + self._update_params(reference_params) + if self.used_indices is None: + # create valid + self._lazy_init(data=self.data, label=self.label, reference=self.reference, + weight=self.weight, group=self.group, position=self.position, + init_score=self.init_score, predictor=self._predictor, + feature_name=self.feature_name, categorical_feature='auto', params=self.params) + else: + # construct subset + used_indices = _list_to_1d_numpy(self.used_indices, dtype=np.int32, name='used_indices') + assert used_indices.flags.c_contiguous + if self.reference.group is not None: + group_info = np.array(self.reference.group).astype(np.int32, copy=False) + _, self.group = np.unique(np.repeat(range(len(group_info)), repeats=group_info)[self.used_indices], + return_counts=True) + self._handle = ctypes.c_void_p() + params_str = _param_dict_to_str(self.params) + _safe_call(_LIB.LGBM_DatasetGetSubset( + self.reference.construct()._handle, + used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ctypes.c_int32(used_indices.shape[0]), + _c_str(params_str), + ctypes.byref(self._handle))) + if not self.free_raw_data: + self.get_data() + if self.group is not None: + self.set_group(self.group) + if self.position is not None: + self.set_position(self.position) + if self.get_label() is None: + raise ValueError("Label should not be None.") + if isinstance(self._predictor, _InnerPredictor) and self._predictor is not self.reference._predictor: + self.get_data() + self._set_init_score_by_predictor( + 
predictor=self._predictor,
+                            data=self.data,
+                            used_indices=used_indices
+                        )
+            else:
+                # create train
+                self._lazy_init(data=self.data, label=self.label, reference=None,
+                                weight=self.weight, group=self.group,
+                                init_score=self.init_score, predictor=self._predictor,
+                                feature_name=self.feature_name, categorical_feature=self.categorical_feature,
+                                params=self.params, position=self.position)
+            if self.free_raw_data:
+                self.data = None
+            self.feature_name = self.get_feature_name()
+        return self
+
+    def create_valid(
+        self,
+        data: _LGBM_TrainDataType,
+        label: Optional[_LGBM_LabelType] = None,
+        weight: Optional[_LGBM_WeightType] = None,
+        group: Optional[_LGBM_GroupType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
+        params: Optional[Dict[str, Any]] = None,
+        position: Optional[_LGBM_PositionType] = None
+    ) -> "Dataset":
+        """Create validation data aligned with the current Dataset.
+
+        Parameters
+        ----------
+        data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence or list of numpy array
+            Data source of Dataset.
+            If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
+        label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None)
+            Label of the data.
+        weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None)
+            Weight for each instance. Weights should be non-negative.
+        group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None)
+            Group/query data.
+            Only used in the learning-to-rank task.
+            sum(group) = n_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None)
+            Init score for Dataset.
+        params : dict or None, optional (default=None)
+            Other parameters for validation Dataset.
+        position : numpy 1-D array, pandas Series or None, optional (default=None)
+            Position of items used in unbiased learning-to-rank task.
+
+        Returns
+        -------
+        valid : Dataset
+            Validation Dataset with reference to self.
+        """
+        ret = Dataset(data, label=label, reference=self,
+                      weight=weight, group=group, position=position, init_score=init_score,
+                      params=params, free_raw_data=self.free_raw_data)
+        ret._predictor = self._predictor
+        ret.pandas_categorical = self.pandas_categorical
+        return ret
+
+    def subset(
+        self,
+        used_indices: List[int],
+        params: Optional[Dict[str, Any]] = None
+    ) -> "Dataset":
+        """Get subset of current Dataset.
+
+        Parameters
+        ----------
+        used_indices : list of int
+            Indices used to create the subset.
+        params : dict or None, optional (default=None)
+            These parameters will be passed to Dataset constructor.
+
+        Returns
+        -------
+        subset : Dataset
+            Subset of the current Dataset.
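+
+        A short sketch (hypothetical data): with ``full = Dataset(X, label=y)``,
+        ``fold = full.subset([0, 2, 5])`` only records the indices; the actual
+        slicing happens lazily in ``construct()`` via ``LGBM_DatasetGetSubset``.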
+ """ + if params is None: + params = self.params + ret = Dataset(None, reference=self, feature_name=self.feature_name, + categorical_feature=self.categorical_feature, params=params, + free_raw_data=self.free_raw_data) + ret._predictor = self._predictor + ret.pandas_categorical = self.pandas_categorical + ret.used_indices = sorted(used_indices) + return ret + + def save_binary(self, filename: Union[str, Path]) -> "Dataset": + """Save Dataset to a binary file. + + .. note:: + + Please note that `init_score` is not saved in binary file. + If you need it, please set it again after loading Dataset. + + Parameters + ---------- + filename : str or pathlib.Path + Name of the output file. + + Returns + ------- + self : Dataset + Returns self. + """ + _safe_call(_LIB.LGBM_DatasetSaveBinary( + self.construct()._handle, + _c_str(str(filename)))) + return self + + def _update_params(self, params: Optional[Dict[str, Any]]) -> "Dataset": + if not params: + return self + params = deepcopy(params) + + def update(): + if not self.params: + self.params = params + else: + self._params_back_up = deepcopy(self.params) + self.params.update(params) + + if self._handle is None: + update() + elif params is not None: + ret = _LIB.LGBM_DatasetUpdateParamChecking( + _c_str(_param_dict_to_str(self.params)), + _c_str(_param_dict_to_str(params))) + if ret != 0: + # could be updated if data is not freed + if self.data is not None: + update() + self._free_handle() + else: + raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8')) + return self + + def _reverse_update_params(self) -> "Dataset": + if self._handle is None: + self.params = deepcopy(self._params_back_up) + self._params_back_up = None + return self + + def set_field( + self, + field_name: str, + data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Table, pa_Array, pa_ChunkedArray]] + ) -> "Dataset": + """Set property into the Dataset. + + Parameters + ---------- + field_name : str + The field name of the information. + data : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray or None + The data to be set. + + Returns + ------- + self : Dataset + Dataset with set property. + """ + if self._handle is None: + raise Exception(f"Cannot set {field_name} before construct dataset") + if data is None: + # set to None + _safe_call(_LIB.LGBM_DatasetSetField( + self._handle, + _c_str(field_name), + None, + ctypes.c_int(0), + ctypes.c_int(_FIELD_TYPE_MAPPER[field_name]))) + return self + + # If the data is a arrow data, we can just pass it to C + if _is_pyarrow_array(data) or _is_pyarrow_table(data): + # If a table is being passed, we concatenate the columns. This is only valid for + # 'init_score'. 
+ if _is_pyarrow_table(data): + if field_name != "init_score": + raise ValueError(f"pyarrow tables are not supported for field '{field_name}'") + data = pa_chunked_array([ + chunk for array in data.columns for chunk in array.chunks # type: ignore + ]) + + c_array = _export_arrow_to_c(data) + _safe_call(_LIB.LGBM_DatasetSetFieldFromArrow( + self._handle, + _c_str(field_name), + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + )) + self.version += 1 + return self + + dtype: "np.typing.DTypeLike" + if field_name == 'init_score': + dtype = np.float64 + if _is_1d_collection(data): + data = _list_to_1d_numpy(data, dtype=dtype, name=field_name) + elif _is_2d_collection(data): + data = _data_to_2d_numpy(data, dtype=dtype, name=field_name) + data = data.ravel(order='F') + else: + raise TypeError( + 'init_score must be list, numpy 1-D array or pandas Series.\n' + 'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.' + ) + else: + dtype = np.int32 if (field_name == 'group' or field_name == 'position') else np.float32 + data = _list_to_1d_numpy(data, dtype=dtype, name=field_name) + + ptr_data: Union[_ctypes_float_ptr, _ctypes_int_ptr] + if data.dtype == np.float32 or data.dtype == np.float64: + ptr_data, type_data, _ = _c_float_array(data) + elif data.dtype == np.int32: + ptr_data, type_data, _ = _c_int_array(data) + else: + raise TypeError(f"Expected np.float32/64 or np.int32, met type({data.dtype})") + if type_data != _FIELD_TYPE_MAPPER[field_name]: + raise TypeError("Input type error for set_field") + _safe_call(_LIB.LGBM_DatasetSetField( + self._handle, + _c_str(field_name), + ptr_data, + ctypes.c_int(len(data)), + ctypes.c_int(type_data))) + self.version += 1 + return self + + def get_field(self, field_name: str) -> Optional[np.ndarray]: + """Get property from the Dataset. + + Can only be run on a constructed Dataset. + + Unlike ``get_group()``, ``get_init_score()``, ``get_label()``, ``get_position()``, and ``get_weight()``, + this method ignores any raw data passed into ``lgb.Dataset()`` on the Python side, and will only read + data from the constructed C++ ``Dataset`` object. + + Parameters + ---------- + field_name : str + The field name of the information. + + Returns + ------- + info : numpy array or None + A numpy array with information from the Dataset. 
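+
+        For example (illustrative only): after ``ds.construct()``,
+        ``ds.get_field('label')`` returns the float32 labels stored in the C++
+        Dataset, or ``None`` if the field is empty.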
+ """ + if self._handle is None: + raise Exception(f"Cannot get {field_name} before construct Dataset") + tmp_out_len = ctypes.c_int(0) + out_type = ctypes.c_int(0) + ret = ctypes.POINTER(ctypes.c_void_p)() + _safe_call(_LIB.LGBM_DatasetGetField( + self._handle, + _c_str(field_name), + ctypes.byref(tmp_out_len), + ctypes.byref(ret), + ctypes.byref(out_type))) + if out_type.value != _FIELD_TYPE_MAPPER[field_name]: + raise TypeError("Return type error for get_field") + if tmp_out_len.value == 0: + return None + if out_type.value == _C_API_DTYPE_INT32: + arr = _cint32_array_to_numpy( + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), + length=tmp_out_len.value + ) + elif out_type.value == _C_API_DTYPE_FLOAT32: + arr = _cfloat32_array_to_numpy( + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), + length=tmp_out_len.value + ) + elif out_type.value == _C_API_DTYPE_FLOAT64: + arr = _cfloat64_array_to_numpy( + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), + length=tmp_out_len.value + ) + else: + raise TypeError("Unknown type") + if field_name == 'init_score': + num_data = self.num_data() + num_classes = arr.size // num_data + if num_classes > 1: + arr = arr.reshape((num_data, num_classes), order='F') + return arr + + def set_categorical_feature( + self, + categorical_feature: _LGBM_CategoricalFeatureConfiguration + ) -> "Dataset": + """Set categorical features. + + Parameters + ---------- + categorical_feature : list of str or int, or 'auto' + Names or indices of categorical features. + + Returns + ------- + self : Dataset + Dataset with set categorical features. + """ + if self.categorical_feature == categorical_feature: + return self + if self.data is not None: + if self.categorical_feature is None: + self.categorical_feature = categorical_feature + return self._free_handle() + elif categorical_feature == 'auto': + return self + else: + if self.categorical_feature != 'auto': + _log_warning('categorical_feature in Dataset is overridden.\n' + f'New categorical_feature is {list(categorical_feature)}') + self.categorical_feature = categorical_feature + return self._free_handle() + else: + raise LightGBMError("Cannot set categorical feature after freed raw data, " + "set free_raw_data=False when construct Dataset to avoid this.") + + def _set_predictor( + self, + predictor: Optional[_InnerPredictor] + ) -> "Dataset": + """Set predictor for continued training. + + It is not recommended for user to call this function. + Please use init_model argument in engine.train() or engine.cv() instead. 
+ """ + if predictor is None and self._predictor is None: + return self + elif isinstance(predictor, _InnerPredictor) and isinstance(self._predictor, _InnerPredictor): + if (predictor == self._predictor) and (predictor.current_iteration() == self._predictor.current_iteration()): + return self + if self._handle is None: + self._predictor = predictor + elif self.data is not None: + self._predictor = predictor + self._set_init_score_by_predictor( + predictor=self._predictor, + data=self.data, + used_indices=None + ) + elif self.used_indices is not None and self.reference is not None and self.reference.data is not None: + self._predictor = predictor + self._set_init_score_by_predictor( + predictor=self._predictor, + data=self.reference.data, + used_indices=self.used_indices + ) + else: + raise LightGBMError("Cannot set predictor after freed raw data, " + "set free_raw_data=False when construct Dataset to avoid this.") + return self + + def set_reference(self, reference: "Dataset") -> "Dataset": + """Set reference Dataset. + + Parameters + ---------- + reference : Dataset + Reference that is used as a template to construct the current Dataset. + + Returns + ------- + self : Dataset + Dataset with set reference. + """ + self.set_categorical_feature(reference.categorical_feature) \ + .set_feature_name(reference.feature_name) \ + ._set_predictor(reference._predictor) + # we're done if self and reference share a common upstream reference + if self.get_ref_chain().intersection(reference.get_ref_chain()): + return self + if self.data is not None: + self.reference = reference + return self._free_handle() + else: + raise LightGBMError("Cannot set reference after freed raw data, " + "set free_raw_data=False when construct Dataset to avoid this.") + + def set_feature_name(self, feature_name: _LGBM_FeatureNameConfiguration) -> "Dataset": + """Set feature name. + + Parameters + ---------- + feature_name : list of str + Feature names. + + Returns + ------- + self : Dataset + Dataset with set feature name. + """ + if feature_name != 'auto': + self.feature_name = feature_name + if self._handle is not None and feature_name is not None and feature_name != 'auto': + if len(feature_name) != self.num_feature(): + raise ValueError(f"Length of feature_name({len(feature_name)}) and num_feature({self.num_feature()}) don't match") + c_feature_name = [_c_str(name) for name in feature_name] + _safe_call(_LIB.LGBM_DatasetSetFeatureNames( + self._handle, + _c_array(ctypes.c_char_p, c_feature_name), + ctypes.c_int(len(feature_name)))) + return self + + def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": + """Set label of Dataset. + + Parameters + ---------- + label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None + The label information to be set into Dataset. + + Returns + ------- + self : Dataset + Dataset with set label. 
+ """ + self.label = label + if self._handle is not None: + if isinstance(label, pd_DataFrame): + if len(label.columns) > 1: + raise ValueError('DataFrame for label cannot have multiple columns') + label_array = np.ravel(_pandas_to_numpy(label, target_dtype=np.float32)) + elif _is_pyarrow_array(label): + label_array = label + else: + label_array = _list_to_1d_numpy(label, dtype=np.float32, name='label') + self.set_field('label', label_array) + self.label = self.get_field('label') # original values can be modified at cpp side + return self + + def set_weight( + self, + weight: Optional[_LGBM_WeightType] + ) -> "Dataset": + """Set weight of each instance. + + Parameters + ---------- + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None + Weight to be set for each data point. Weights should be non-negative. + + Returns + ------- + self : Dataset + Dataset with set weight. + """ + # Check if the weight contains values other than one + if weight is not None: + if _is_pyarrow_array(weight): + if pa_compute.all(pa_compute.equal(weight, 1)).as_py(): + weight = None + elif np.all(weight == 1): + weight = None + self.weight = weight + + # Set field + if self._handle is not None and weight is not None: + if not _is_pyarrow_array(weight): + weight = _list_to_1d_numpy(weight, dtype=np.float32, name='weight') + self.set_field('weight', weight) + self.weight = self.get_field('weight') # original values can be modified at cpp side + return self + + def set_init_score( + self, + init_score: Optional[_LGBM_InitScoreType] + ) -> "Dataset": + """Set init score of Booster to start from. + + Parameters + ---------- + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None + Init score for Booster. + + Returns + ------- + self : Dataset + Dataset with set init score. + """ + self.init_score = init_score + if self._handle is not None and init_score is not None: + self.set_field('init_score', init_score) + self.init_score = self.get_field('init_score') # original values can be modified at cpp side + return self + + def set_group( + self, + group: Optional[_LGBM_GroupType] + ) -> "Dataset": + """Set group size of Dataset (used for ranking). + + Parameters + ---------- + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + + Returns + ------- + self : Dataset + Dataset with set group. + """ + self.group = group + if self._handle is not None and group is not None: + if not _is_pyarrow_array(group): + group = _list_to_1d_numpy(group, dtype=np.int32, name='group') + self.set_field('group', group) + # original values can be modified at cpp side + constructed_group = self.get_field('group') + if constructed_group is not None: + self.group = np.diff(constructed_group) + return self + + def set_position( + self, + position: Optional[_LGBM_PositionType] + ) -> "Dataset": + """Set position of Dataset (used for ranking). 
+
+        Parameters
+        ----------
+        position : numpy 1-D array, pandas Series or None, optional (default=None)
+            Position of items used in unbiased learning-to-rank task.
+
+        Returns
+        -------
+        self : Dataset
+            Dataset with set position.
+        """
+        self.position = position
+        if self._handle is not None and position is not None:
+            position = _list_to_1d_numpy(position, dtype=np.int32, name='position')
+            self.set_field('position', position)
+        return self
+
+    def get_feature_name(self) -> List[str]:
+        """Get the names of columns (features) in the Dataset.
+
+        Returns
+        -------
+        feature_names : list of str
+            The names of columns (features) in the Dataset.
+        """
+        if self._handle is None:
+            raise LightGBMError("Cannot get feature_name before construct dataset")
+        num_feature = self.num_feature()
+        tmp_out_len = ctypes.c_int(0)
+        reserved_string_buffer_size = 255
+        required_string_buffer_size = ctypes.c_size_t(0)
+        string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)]
+        ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))  # type: ignore[misc]
+        _safe_call(_LIB.LGBM_DatasetGetFeatureNames(
+            self._handle,
+            ctypes.c_int(num_feature),
+            ctypes.byref(tmp_out_len),
+            ctypes.c_size_t(reserved_string_buffer_size),
+            ctypes.byref(required_string_buffer_size),
+            ptr_string_buffers))
+        if num_feature != tmp_out_len.value:
+            raise ValueError("Length of feature names doesn't match num_feature")
+        actual_string_buffer_size = required_string_buffer_size.value
+        # if buffer length is not long enough, reallocate buffers
+        if reserved_string_buffer_size < actual_string_buffer_size:
+            string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)]
+            ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))  # type: ignore[misc]
+            _safe_call(_LIB.LGBM_DatasetGetFeatureNames(
+                self._handle,
+                ctypes.c_int(num_feature),
+                ctypes.byref(tmp_out_len),
+                ctypes.c_size_t(actual_string_buffer_size),
+                ctypes.byref(required_string_buffer_size),
+                ptr_string_buffers))
+        return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)]
+
+    def get_label(self) -> Optional[_LGBM_LabelType]:
+        """Get the label of the Dataset.
+
+        Returns
+        -------
+        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None
+            The label information from the Dataset.
+            For a constructed ``Dataset``, this will only return a numpy array.
+        """
+        if self.label is None:
+            self.label = self.get_field('label')
+        return self.label
+
+    def get_weight(self) -> Optional[_LGBM_WeightType]:
+        """Get the weight of the Dataset.
+
+        Returns
+        -------
+        weight : list, numpy 1-D array, pandas Series or None
+            Weight for each data point from the Dataset. Weights should be non-negative.
+            For a constructed ``Dataset``, this will only return ``None`` or a numpy array.
+        """
+        if self.weight is None:
+            self.weight = self.get_field('weight')
+        return self.weight
+
+    def get_init_score(self) -> Optional[_LGBM_InitScoreType]:
+        """Get the initial score of the Dataset.
+
+        Returns
+        -------
+        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None
+            Init score of Booster.
+            For a constructed ``Dataset``, this will only return ``None`` or a numpy array.
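+
+        For a multi-class task the constructed array has shape
+        ``(num_data, num_classes)``; ``get_field`` handles the column-major
+        regrouping internally.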
+ """ + if self.init_score is None: + self.init_score = self.get_field('init_score') + return self.init_score + + def get_data(self) -> Optional[_LGBM_TrainDataType]: + """Get the raw data of the Dataset. + + Returns + ------- + data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence or list of numpy array or None + Raw data used in the Dataset construction. + """ + if self._handle is None: + raise Exception("Cannot get data before construct Dataset") + if self._need_slice and self.used_indices is not None and self.reference is not None: + self.data = self.reference.data + if self.data is not None: + if isinstance(self.data, np.ndarray) or isinstance(self.data, scipy.sparse.spmatrix): + self.data = self.data[self.used_indices, :] + elif isinstance(self.data, pd_DataFrame): + self.data = self.data.iloc[self.used_indices].copy() + elif isinstance(self.data, dt_DataTable): + self.data = self.data[self.used_indices, :] + elif isinstance(self.data, Sequence): + self.data = self.data[self.used_indices] + elif _is_list_of_sequences(self.data) and len(self.data) > 0: + self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices))) + else: + _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n" + "Returning original raw data") + self._need_slice = False + if self.data is None: + raise LightGBMError("Cannot call `get_data` after freed raw data, " + "set free_raw_data=False when construct Dataset to avoid this.") + return self.data + + def get_group(self) -> Optional[_LGBM_GroupType]: + """Get the group of the Dataset. + + Returns + ------- + group : list, numpy 1-D array, pandas Series or None + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + For a constructed ``Dataset``, this will only return ``None`` or a numpy array. + """ + if self.group is None: + self.group = self.get_field('group') + if self.group is not None: + # group data from LightGBM is boundaries data, need to convert to group size + self.group = np.diff(self.group) + return self.group + + def get_position(self) -> Optional[_LGBM_PositionType]: + """Get the position of the Dataset. + + Returns + ------- + position : numpy 1-D array, pandas Series or None + Position of items used in unbiased learning-to-rank task. + For a constructed ``Dataset``, this will only return ``None`` or a numpy array. + """ + if self.position is None: + self.position = self.get_field('position') + return self.position + + def num_data(self) -> int: + """Get the number of rows in the Dataset. + + Returns + ------- + number_of_rows : int + The number of rows in the Dataset. + """ + if self._handle is not None: + ret = ctypes.c_int(0) + _safe_call(_LIB.LGBM_DatasetGetNumData(self._handle, + ctypes.byref(ret))) + return ret.value + else: + raise LightGBMError("Cannot get num_data before construct dataset") + + def num_feature(self) -> int: + """Get the number of columns (features) in the Dataset. + + Returns + ------- + number_of_columns : int + The number of columns (features) in the Dataset. 
+ """ + if self._handle is not None: + ret = ctypes.c_int(0) + _safe_call(_LIB.LGBM_DatasetGetNumFeature(self._handle, + ctypes.byref(ret))) + return ret.value + else: + raise LightGBMError("Cannot get num_feature before construct dataset") + + def feature_num_bin(self, feature: Union[int, str]) -> int: + """Get the number of bins for a feature. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + feature : int or str + Index or name of the feature. + + Returns + ------- + number_of_bins : int + The number of constructed bins for the feature in the Dataset. + """ + if self._handle is not None: + if isinstance(feature, str): + feature_index = self.feature_name.index(feature) + else: + feature_index = feature + ret = ctypes.c_int(0) + _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self._handle, + ctypes.c_int(feature_index), + ctypes.byref(ret))) + return ret.value + else: + raise LightGBMError("Cannot get feature_num_bin before construct dataset") + + def get_ref_chain(self, ref_limit: int = 100) -> Set["Dataset"]: + """Get a chain of Dataset objects. + + Starts with r, then goes to r.reference (if exists), + then to r.reference.reference, etc. + until we hit ``ref_limit`` or a reference loop. + + Parameters + ---------- + ref_limit : int, optional (default=100) + The limit number of references. + + Returns + ------- + ref_chain : set of Dataset + Chain of references of the Datasets. + """ + head = self + ref_chain: Set[Dataset] = set() + while len(ref_chain) < ref_limit: + if isinstance(head, Dataset): + ref_chain.add(head) + if (head.reference is not None) and (head.reference not in ref_chain): + head = head.reference + else: + break + else: + break + return ref_chain + + def add_features_from(self, other: "Dataset") -> "Dataset": + """Add features from other Dataset to the current Dataset. + + Both Datasets must be constructed before calling this method. + + Parameters + ---------- + other : Dataset + The Dataset to take features from. + + Returns + ------- + self : Dataset + Dataset with the new features added. 
+ """ + if self._handle is None or other._handle is None: + raise ValueError('Both source and target Datasets must be constructed before adding features') + _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self._handle, other._handle)) + was_none = self.data is None + old_self_data_type = type(self.data).__name__ + if other.data is None: + self.data = None + elif self.data is not None: + if isinstance(self.data, np.ndarray): + if isinstance(other.data, np.ndarray): + self.data = np.hstack((self.data, other.data)) + elif isinstance(other.data, scipy.sparse.spmatrix): + self.data = np.hstack((self.data, other.data.toarray())) + elif isinstance(other.data, pd_DataFrame): + self.data = np.hstack((self.data, other.data.values)) + elif isinstance(other.data, dt_DataTable): + self.data = np.hstack((self.data, other.data.to_numpy())) + else: + self.data = None + elif isinstance(self.data, scipy.sparse.spmatrix): + sparse_format = self.data.getformat() + if isinstance(other.data, np.ndarray) or isinstance(other.data, scipy.sparse.spmatrix): + self.data = scipy.sparse.hstack((self.data, other.data), format=sparse_format) + elif isinstance(other.data, pd_DataFrame): + self.data = scipy.sparse.hstack((self.data, other.data.values), format=sparse_format) + elif isinstance(other.data, dt_DataTable): + self.data = scipy.sparse.hstack((self.data, other.data.to_numpy()), format=sparse_format) + else: + self.data = None + elif isinstance(self.data, pd_DataFrame): + if not PANDAS_INSTALLED: + raise LightGBMError("Cannot add features to DataFrame type of raw data " + "without pandas installed. " + "Install pandas and restart your session.") + if isinstance(other.data, np.ndarray): + self.data = concat((self.data, pd_DataFrame(other.data)), + axis=1, ignore_index=True) + elif isinstance(other.data, scipy.sparse.spmatrix): + self.data = concat((self.data, pd_DataFrame(other.data.toarray())), + axis=1, ignore_index=True) + elif isinstance(other.data, pd_DataFrame): + self.data = concat((self.data, other.data), + axis=1, ignore_index=True) + elif isinstance(other.data, dt_DataTable): + self.data = concat((self.data, pd_DataFrame(other.data.to_numpy())), + axis=1, ignore_index=True) + else: + self.data = None + elif isinstance(self.data, dt_DataTable): + if isinstance(other.data, np.ndarray): + self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data))) + elif isinstance(other.data, scipy.sparse.spmatrix): + self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.toarray()))) + elif isinstance(other.data, pd_DataFrame): + self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.values))) + elif isinstance(other.data, dt_DataTable): + self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.to_numpy()))) + else: + self.data = None + else: + self.data = None + if self.data is None: + err_msg = (f"Cannot add features from {type(other.data).__name__} type of raw data to " + f"{old_self_data_type} type of raw data.\n") + err_msg += ("Set free_raw_data=False when construct Dataset to avoid this" + if was_none else "Freeing raw data") + _log_warning(err_msg) + self.feature_name = self.get_feature_name() + _log_warning("Reseting categorical features.\n" + "You can set new categorical features via ``set_categorical_feature`` method") + self.categorical_feature = "auto" + self.pandas_categorical = None + return self + + def _dump_text(self, filename: Union[str, Path]) -> "Dataset": + """Save Dataset to a text file. 
+ + This format cannot be loaded back in by LightGBM, but is useful for debugging purposes. + + Parameters + ---------- + filename : str or pathlib.Path + Name of the output file. + + Returns + ------- + self : Dataset + Returns self. + """ + _safe_call(_LIB.LGBM_DatasetDumpText( + self.construct()._handle, + _c_str(str(filename)))) + return self + + +_LGBM_CustomObjectiveFunction = Callable[ + [np.ndarray, Dataset], + Tuple[np.ndarray, np.ndarray] +] +_LGBM_CustomEvalFunction = Union[ + Callable[ + [np.ndarray, Dataset], + _LGBM_EvalFunctionResultType + ], + Callable[ + [np.ndarray, Dataset], + List[_LGBM_EvalFunctionResultType] + ] +] + + +class Booster: + """Booster in LightGBM.""" + + def __init__( + self, + params: Optional[Dict[str, Any]] = None, + train_set: Optional[Dataset] = None, + model_file: Optional[Union[str, Path]] = None, + model_str: Optional[str] = None + ): + """Initialize the Booster. + + Parameters + ---------- + params : dict or None, optional (default=None) + Parameters for Booster. + train_set : Dataset or None, optional (default=None) + Training dataset. + model_file : str, pathlib.Path or None, optional (default=None) + Path to the model file. + model_str : str or None, optional (default=None) + Model will be loaded from this string. + """ + self._handle = ctypes.c_void_p() + self._network = False + self.__need_reload_eval_info = True + self._train_data_name = "training" + self.__set_objective_to_none = False + self.best_iteration = -1 + self.best_score: _LGBM_BoosterBestScoreType = {} + params = {} if params is None else deepcopy(params) + if train_set is not None: + # Training task + if not isinstance(train_set, Dataset): + raise TypeError(f'Training data should be Dataset instance, met {type(train_set).__name__}') + params = _choose_param_value( + main_param_name="machines", + params=params, + default_value=None + ) + # if "machines" is given, assume user wants to do distributed learning, and set up network + if params["machines"] is None: + params.pop("machines", None) + else: + machines = params["machines"] + if isinstance(machines, str): + num_machines_from_machine_list = len(machines.split(',')) + elif isinstance(machines, (list, set)): + num_machines_from_machine_list = len(machines) + machines = ','.join(machines) + else: + raise ValueError("Invalid machines in params.") + + params = _choose_param_value( + main_param_name="num_machines", + params=params, + default_value=num_machines_from_machine_list + ) + params = _choose_param_value( + main_param_name="local_listen_port", + params=params, + default_value=12400 + ) + self.set_network( + machines=machines, + local_listen_port=params["local_listen_port"], + listen_time_out=params.get("time_out", 120), + num_machines=params["num_machines"] + ) + # construct booster object + train_set.construct() + # copy the parameters from train_set + params.update(train_set.get_params()) + params_str = _param_dict_to_str(params) + _safe_call(_LIB.LGBM_BoosterCreate( + train_set._handle, + _c_str(params_str), + ctypes.byref(self._handle))) + # save reference to data + self.train_set = train_set + self.valid_sets: List[Dataset] = [] + self.name_valid_sets: List[str] = [] + self.__num_dataset = 1 + self.__init_predictor = train_set._predictor + if self.__init_predictor is not None: + _safe_call(_LIB.LGBM_BoosterMerge( + self._handle, + self.__init_predictor._handle)) + out_num_class = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterGetNumClasses( + self._handle, + ctypes.byref(out_num_class))) + self.__num_class = 
out_num_class.value + # buffer for inner predict + self.__inner_predict_buffer: List[Optional[np.ndarray]] = [None] + self.__is_predicted_cur_iter = [False] + self.__get_eval_info() + self.pandas_categorical = train_set.pandas_categorical + self.train_set_version = train_set.version + elif model_file is not None: + # Prediction task + out_num_iterations = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterCreateFromModelfile( + _c_str(str(model_file)), + ctypes.byref(out_num_iterations), + ctypes.byref(self._handle))) + out_num_class = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterGetNumClasses( + self._handle, + ctypes.byref(out_num_class))) + self.__num_class = out_num_class.value + self.pandas_categorical = _load_pandas_categorical(file_name=model_file) + if params: + _log_warning('Ignoring params argument, using parameters from model file.') + params = self._get_loaded_param() + elif model_str is not None: + self.model_from_string(model_str) + else: + raise TypeError('Need at least one training dataset or model file or model string ' + 'to create Booster instance') + self.params = params + + def __del__(self) -> None: + try: + if self._network: + self.free_network() + except AttributeError: + pass + try: + if self._handle is not None: + _safe_call(_LIB.LGBM_BoosterFree(self._handle)) + except AttributeError: + pass + + def __copy__(self) -> "Booster": + return self.__deepcopy__(None) + + def __deepcopy__(self, _) -> "Booster": + model_str = self.model_to_string(num_iteration=-1) + return Booster(model_str=model_str) + + def __getstate__(self) -> Dict[str, Any]: + this = self.__dict__.copy() + handle = this['_handle'] + this.pop('train_set', None) + this.pop('valid_sets', None) + if handle is not None: + this["_handle"] = self.model_to_string(num_iteration=-1) + return this + + def __setstate__(self, state: Dict[str, Any]) -> None: + model_str = state.get('_handle', state.get('handle', None)) + if model_str is not None: + handle = ctypes.c_void_p() + out_num_iterations = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterLoadModelFromString( + _c_str(model_str), + ctypes.byref(out_num_iterations), + ctypes.byref(handle))) + state['_handle'] = handle + self.__dict__.update(state) + + def _get_loaded_param(self) -> Dict[str, Any]: + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call(_LIB.LGBM_BoosterGetLoadedParam( + self._handle, + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call(_LIB.LGBM_BoosterGetLoadedParam( + self._handle, + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + return json.loads(string_buffer.value.decode('utf-8')) + + def free_dataset(self) -> "Booster": + """Free Booster's Datasets. + + Returns + ------- + self : Booster + Booster without Datasets. 
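+
+        Note (illustrative): this only drops the Python-side references to the
+        Datasets, e.g. to shrink a trained Booster before pickling; the fitted
+        model held inside the Booster is unaffected.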
+ """ + self.__dict__.pop('train_set', None) + self.__dict__.pop('valid_sets', None) + self.__num_dataset = 0 + return self + + def _free_buffer(self) -> "Booster": + self.__inner_predict_buffer = [] + self.__is_predicted_cur_iter = [] + return self + + def set_network( + self, + machines: Union[List[str], Set[str], str], + local_listen_port: int = 12400, + listen_time_out: int = 120, + num_machines: int = 1 + ) -> "Booster": + """Set the network configuration. + + Parameters + ---------- + machines : list, set or str + Names of machines. + local_listen_port : int, optional (default=12400) + TCP listen port for local machines. + listen_time_out : int, optional (default=120) + Socket time-out in minutes. + num_machines : int, optional (default=1) + The number of machines for distributed learning application. + + Returns + ------- + self : Booster + Booster with set network. + """ + if isinstance(machines, (list, set)): + machines = ','.join(machines) + _safe_call(_LIB.LGBM_NetworkInit(_c_str(machines), + ctypes.c_int(local_listen_port), + ctypes.c_int(listen_time_out), + ctypes.c_int(num_machines))) + self._network = True + return self + + def free_network(self) -> "Booster": + """Free Booster's network. + + Returns + ------- + self : Booster + Booster with freed network. + """ + _safe_call(_LIB.LGBM_NetworkFree()) + self._network = False + return self + + def trees_to_dataframe(self) -> pd_DataFrame: + """Parse the fitted model and return in an easy-to-read pandas DataFrame. + + The returned DataFrame has the following columns. + + - ``tree_index`` : int64, which tree a node belongs to. 0-based, so a value of ``6``, for example, means "this node is in the 7th tree". + - ``node_depth`` : int64, how far a node is from the root of the tree. The root node has a value of ``1``, its direct children are ``2``, etc. + - ``node_index`` : str, unique identifier for a node. + - ``left_child`` : str, ``node_index`` of the child node to the left of a split. ``None`` for leaf nodes. + - ``right_child`` : str, ``node_index`` of the child node to the right of a split. ``None`` for leaf nodes. + - ``parent_index`` : str, ``node_index`` of this node's parent. ``None`` for the root node. + - ``split_feature`` : str, name of the feature used for splitting. ``None`` for leaf nodes. + - ``split_gain`` : float64, gain from adding this split to the tree. ``NaN`` for leaf nodes. + - ``threshold`` : float64, value of the feature used to decide which side of the split a record will go down. ``NaN`` for leaf nodes. + - ``decision_type`` : str, logical operator describing how to compare a value to ``threshold``. + For example, ``split_feature = "Column_10", threshold = 15, decision_type = "<="`` means that + records where ``Column_10 <= 15`` follow the left side of the split, otherwise follows the right side of the split. ``None`` for leaf nodes. + - ``missing_direction`` : str, split direction that missing values should go to. ``None`` for leaf nodes. + - ``missing_type`` : str, describes what types of values are treated as missing. + - ``value`` : float64, predicted value for this leaf node, multiplied by the learning rate. + - ``weight`` : float64 or int64, sum of Hessian (second-order derivative of objective), summed over observations that fall in this node. + - ``count`` : int64, number of records in the training data that fall into this node. + + Returns + ------- + result : pandas DataFrame + Returns a pandas DataFrame of the parsed model. 
+ """ + if not PANDAS_INSTALLED: + raise LightGBMError('This method cannot be run without pandas installed. ' + 'You must install pandas and restart your session to use this method.') + + if self.num_trees() == 0: + raise LightGBMError('There are no trees in this Booster and thus nothing to parse') + + def _is_split_node(tree: Dict[str, Any]) -> bool: + return 'split_index' in tree.keys() + + def create_node_record( + tree: Dict[str, Any], + node_depth: int = 1, + tree_index: Optional[int] = None, + feature_names: Optional[List[str]] = None, + parent_node: Optional[str] = None + ) -> Dict[str, Any]: + + def _get_node_index( + tree: Dict[str, Any], + tree_index: Optional[int] + ) -> str: + tree_num = f'{tree_index}-' if tree_index is not None else '' + is_split = _is_split_node(tree) + node_type = 'S' if is_split else 'L' + # if a single node tree it won't have `leaf_index` so return 0 + node_num = tree.get('split_index' if is_split else 'leaf_index', 0) + return f"{tree_num}{node_type}{node_num}" + + def _get_split_feature( + tree: Dict[str, Any], + feature_names: Optional[List[str]] + ) -> Optional[str]: + if _is_split_node(tree): + if feature_names is not None: + feature_name = feature_names[tree['split_feature']] + else: + feature_name = tree['split_feature'] + else: + feature_name = None + return feature_name + + def _is_single_node_tree(tree: Dict[str, Any]) -> bool: + return set(tree.keys()) == {'leaf_value'} + + # Create the node record, and populate universal data members + node: Dict[str, Union[int, str, None]] = OrderedDict() + node['tree_index'] = tree_index + node['node_depth'] = node_depth + node['node_index'] = _get_node_index(tree, tree_index) + node['left_child'] = None + node['right_child'] = None + node['parent_index'] = parent_node + node['split_feature'] = _get_split_feature(tree, feature_names) + node['split_gain'] = None + node['threshold'] = None + node['decision_type'] = None + node['missing_direction'] = None + node['missing_type'] = None + node['value'] = None + node['weight'] = None + node['count'] = None + + # Update values to reflect node type (leaf or split) + if _is_split_node(tree): + node['left_child'] = _get_node_index(tree['left_child'], tree_index) + node['right_child'] = _get_node_index(tree['right_child'], tree_index) + node['split_gain'] = tree['split_gain'] + node['threshold'] = tree['threshold'] + node['decision_type'] = tree['decision_type'] + node['missing_direction'] = 'left' if tree['default_left'] else 'right' + node['missing_type'] = tree['missing_type'] + node['value'] = tree['internal_value'] + node['weight'] = tree['internal_weight'] + node['count'] = tree['internal_count'] + else: + node['value'] = tree['leaf_value'] + if not _is_single_node_tree(tree): + node['weight'] = tree['leaf_weight'] + node['count'] = tree['leaf_count'] + + return node + + def tree_dict_to_node_list( + tree: Dict[str, Any], + node_depth: int = 1, + tree_index: Optional[int] = None, + feature_names: Optional[List[str]] = None, + parent_node: Optional[str] = None + ) -> List[Dict[str, Any]]: + + node = create_node_record(tree=tree, + node_depth=node_depth, + tree_index=tree_index, + feature_names=feature_names, + parent_node=parent_node) + + res = [node] + + if _is_split_node(tree): + # traverse the next level of the tree + children = ['left_child', 'right_child'] + for child in children: + subtree_list = tree_dict_to_node_list( + tree=tree[child], + node_depth=node_depth + 1, + tree_index=tree_index, + feature_names=feature_names, + parent_node=node['node_index'] + 
) + # In tree format, "subtree_list" is a list of node records (dicts), + # and we add node to the list. + res.extend(subtree_list) + return res + + model_dict = self.dump_model() + feature_names = model_dict['feature_names'] + model_list = [] + for tree in model_dict['tree_info']: + model_list.extend(tree_dict_to_node_list(tree=tree['tree_structure'], + tree_index=tree['tree_index'], + feature_names=feature_names)) + + return pd_DataFrame(model_list, columns=model_list[0].keys()) + + def set_train_data_name(self, name: str) -> "Booster": + """Set the name to the training Dataset. + + Parameters + ---------- + name : str + Name for the training Dataset. + + Returns + ------- + self : Booster + Booster with set training Dataset name. + """ + self._train_data_name = name + return self + + def add_valid(self, data: Dataset, name: str) -> "Booster": + """Add validation data. + + Parameters + ---------- + data : Dataset + Validation data. + name : str + Name of validation data. + + Returns + ------- + self : Booster + Booster with set validation data. + """ + if not isinstance(data, Dataset): + raise TypeError(f'Validation data should be Dataset instance, met {type(data).__name__}') + if data._predictor is not self.__init_predictor: + raise LightGBMError("Add validation data failed, " + "you should use same predictor for these data") + _safe_call(_LIB.LGBM_BoosterAddValidData( + self._handle, + data.construct()._handle)) + self.valid_sets.append(data) + self.name_valid_sets.append(name) + self.__num_dataset += 1 + self.__inner_predict_buffer.append(None) + self.__is_predicted_cur_iter.append(False) + return self + + def reset_parameter(self, params: Dict[str, Any]) -> "Booster": + """Reset parameters of Booster. + + Parameters + ---------- + params : dict + New parameters for Booster. + + Returns + ------- + self : Booster + Booster with new parameters. + """ + params_str = _param_dict_to_str(params) + if params_str: + _safe_call(_LIB.LGBM_BoosterResetParameter( + self._handle, + _c_str(params_str))) + self.params.update(params) + return self + + def update( + self, + train_set: Optional[Dataset] = None, + fobj: Optional[_LGBM_CustomObjectiveFunction] = None + ) -> bool: + """Update Booster for one iteration. + + Parameters + ---------- + train_set : Dataset or None, optional (default=None) + Training data. + If None, last training data is used. + fobj : callable or None, optional (default=None) + Customized objective function. + Should accept two parameters: preds, train_data, + and return (grad, hess). + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. + Predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task. + train_data : Dataset + The training dataset. + grad : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the first order derivative (gradient) of the loss + with respect to the elements of preds for each sample point. + hess : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the second order derivative (Hessian) of the loss + with respect to the elements of preds for each sample point. + + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. + + Returns + ------- + is_finished : bool + Whether the update was successfully finished. 
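+
+ Examples
+ --------
+ A minimal sketch of one manual boosting round with a custom squared-error
+ objective; ``bst`` is an illustrative, already-constructed Booster:
+
+ >>> import numpy as np
+ >>> def l2_obj(preds, train_data):
+ ... grad = preds - train_data.get_label()
+ ... hess = np.ones_like(preds)
+ ... return grad, hess
+ >>> finished = bst.update(fobj=l2_obj) # doctest: +SKIP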
+ """ + # need reset training data + if train_set is None and self.train_set_version != self.train_set.version: + train_set = self.train_set + is_the_same_train_set = False + else: + is_the_same_train_set = train_set is self.train_set and self.train_set_version == train_set.version + if train_set is not None and not is_the_same_train_set: + if not isinstance(train_set, Dataset): + raise TypeError(f'Training data should be Dataset instance, met {type(train_set).__name__}') + if train_set._predictor is not self.__init_predictor: + raise LightGBMError("Replace training data failed, " + "you should use same predictor for these data") + self.train_set = train_set + _safe_call(_LIB.LGBM_BoosterResetTrainingData( + self._handle, + self.train_set.construct()._handle)) + self.__inner_predict_buffer[0] = None + self.train_set_version = self.train_set.version + is_finished = ctypes.c_int(0) + if fobj is None: + if self.__set_objective_to_none: + raise LightGBMError('Cannot update due to null objective function.') + _safe_call(_LIB.LGBM_BoosterUpdateOneIter( + self._handle, + ctypes.byref(is_finished))) + self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] + return is_finished.value == 1 + else: + if not self.__set_objective_to_none: + self.reset_parameter({"objective": "none"}).__set_objective_to_none = True + grad, hess = fobj(self.__inner_predict(0), self.train_set) + return self.__boost(grad, hess) + + def __boost( + self, + grad: np.ndarray, + hess: np.ndarray + ) -> bool: + """Boost Booster for one iteration with customized gradient statistics. + + .. note:: + + Score is returned before any transformation, + e.g. it is raw margin instead of probability of positive class for binary task. + For multi-class task, score are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. + + Parameters + ---------- + grad : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the first order derivative (gradient) of the loss + with respect to the elements of score for each sample point. + hess : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the second order derivative (Hessian) of the loss + with respect to the elements of score for each sample point. + + Returns + ------- + is_finished : bool + Whether the boost was successfully finished. + """ + if self.__num_class > 1: + grad = grad.ravel(order='F') + hess = hess.ravel(order='F') + grad = _list_to_1d_numpy(grad, dtype=np.float32, name='gradient') + hess = _list_to_1d_numpy(hess, dtype=np.float32, name='hessian') + assert grad.flags.c_contiguous + assert hess.flags.c_contiguous + if len(grad) != len(hess): + raise ValueError(f"Lengths of gradient ({len(grad)}) and Hessian ({len(hess)}) don't match") + num_train_data = self.train_set.num_data() + if len(grad) != num_train_data * self.__num_class: + raise ValueError( + f"Lengths of gradient ({len(grad)}) and Hessian ({len(hess)}) " + f"don't match training data length ({num_train_data}) * " + f"number of models per one iteration ({self.__num_class})" + ) + is_finished = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom( + self._handle, + grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + ctypes.byref(is_finished))) + self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] + return is_finished.value == 1 + + def rollback_one_iter(self) -> "Booster": + """Rollback one iteration. 
+ + Returns + ------- + self : Booster + Booster with rolled back one iteration. + """ + _safe_call(_LIB.LGBM_BoosterRollbackOneIter( + self._handle)) + self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] + return self + + def current_iteration(self) -> int: + """Get the index of the current iteration. + + Returns + ------- + cur_iter : int + The index of the current iteration. + """ + out_cur_iter = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterGetCurrentIteration( + self._handle, + ctypes.byref(out_cur_iter))) + return out_cur_iter.value + + def num_model_per_iteration(self) -> int: + """Get number of models per iteration. + + Returns + ------- + model_per_iter : int + The number of models per iteration. + """ + model_per_iter = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterNumModelPerIteration( + self._handle, + ctypes.byref(model_per_iter))) + return model_per_iter.value + + def num_trees(self) -> int: + """Get number of weak sub-models. + + Returns + ------- + num_trees : int + The number of weak sub-models. + """ + num_trees = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterNumberOfTotalModel( + self._handle, + ctypes.byref(num_trees))) + return num_trees.value + + def upper_bound(self) -> float: + """Get upper bound value of a model. + + Returns + ------- + upper_bound : float + Upper bound value of the model. + """ + ret = ctypes.c_double(0) + _safe_call(_LIB.LGBM_BoosterGetUpperBoundValue( + self._handle, + ctypes.byref(ret))) + return ret.value + + def lower_bound(self) -> float: + """Get lower bound value of a model. + + Returns + ------- + lower_bound : float + Lower bound value of the model. + """ + ret = ctypes.c_double(0) + _safe_call(_LIB.LGBM_BoosterGetLowerBoundValue( + self._handle, + ctypes.byref(ret))) + return ret.value + + def eval( + self, + data: Dataset, + name: str, + feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None + ) -> List[_LGBM_BoosterEvalMethodResultType]: + """Evaluate for data. + + Parameters + ---------- + data : Dataset + Data for the evaluating. + name : str + Name of the data. + feval : callable, list of callable, or None, optional (default=None) + Customized evaluation function. + Each evaluation function should accept two parameters: preds, eval_data, + and return (eval_name, eval_result, is_higher_better) or list of such tuples. + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. + If custom objective function is used, predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task in this case. + eval_data : Dataset + A ``Dataset`` to evaluate. + eval_name : str + The name of evaluation function (without whitespace). + eval_result : float + The eval result. + is_higher_better : bool + Is eval result higher better, e.g. AUC is ``is_higher_better``. + + Returns + ------- + result : list + List with (dataset_name, eval_name, eval_result, is_higher_better) tuples. 
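+
+ Examples
+ --------
+ A minimal sketch of a custom metric; ``bst`` and ``dvalid`` are
+ illustrative placeholders:
+
+ >>> import numpy as np
+ >>> def mae(preds, eval_data):
+ ... label = eval_data.get_label()
+ ... return 'mae', float(np.mean(np.abs(preds - label))), False
+ >>> bst.eval(dvalid, 'valid', feval=mae) # doctest: +SKIP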
+ """ + if not isinstance(data, Dataset): + raise TypeError("Can only eval for Dataset instance") + data_idx = -1 + if data is self.train_set: + data_idx = 0 + else: + for i in range(len(self.valid_sets)): + if data is self.valid_sets[i]: + data_idx = i + 1 + break + # need to push new valid data + if data_idx == -1: + self.add_valid(data, name) + data_idx = self.__num_dataset - 1 + + return self.__inner_eval(name, data_idx, feval) + + def eval_train( + self, + feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None + ) -> List[_LGBM_BoosterEvalMethodResultType]: + """Evaluate for training data. + + Parameters + ---------- + feval : callable, list of callable, or None, optional (default=None) + Customized evaluation function. + Each evaluation function should accept two parameters: preds, eval_data, + and return (eval_name, eval_result, is_higher_better) or list of such tuples. + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. + If custom objective function is used, predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task in this case. + eval_data : Dataset + The training dataset. + eval_name : str + The name of evaluation function (without whitespace). + eval_result : float + The eval result. + is_higher_better : bool + Is eval result higher better, e.g. AUC is ``is_higher_better``. + + Returns + ------- + result : list + List with (train_dataset_name, eval_name, eval_result, is_higher_better) tuples. + """ + return self.__inner_eval(self._train_data_name, 0, feval) + + def eval_valid( + self, + feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None + ) -> List[_LGBM_BoosterEvalMethodResultType]: + """Evaluate for validation data. + + Parameters + ---------- + feval : callable, list of callable, or None, optional (default=None) + Customized evaluation function. + Each evaluation function should accept two parameters: preds, eval_data, + and return (eval_name, eval_result, is_higher_better) or list of such tuples. + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. + If custom objective function is used, predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task in this case. + eval_data : Dataset + The validation dataset. + eval_name : str + The name of evaluation function (without whitespace). + eval_result : float + The eval result. + is_higher_better : bool + Is eval result higher better, e.g. AUC is ``is_higher_better``. + + Returns + ------- + result : list + List with (validation_dataset_name, eval_name, eval_result, is_higher_better) tuples. + """ + return [item for i in range(1, self.__num_dataset) + for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)] + + def save_model( + self, + filename: Union[str, Path], + num_iteration: Optional[int] = None, + start_iteration: int = 0, + importance_type: str = 'split' + ) -> "Booster": + """Save Booster to file. + + Parameters + ---------- + filename : str or pathlib.Path + Filename to save Booster. + num_iteration : int or None, optional (default=None) + Index of the iteration that should be saved. 
+ If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
+ If <= 0, all iterations are saved.
+ start_iteration : int, optional (default=0)
+ Start index of the iteration that should be saved.
+ importance_type : str, optional (default="split")
+ What type of feature importance should be saved.
+ If "split", result contains numbers of times the feature is used in a model.
+ If "gain", result contains total gains of splits which use the feature.
+
+ Returns
+ -------
+ self : Booster
+ Returns self.
+ """
+ if num_iteration is None:
+ num_iteration = self.best_iteration
+ importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
+ _safe_call(_LIB.LGBM_BoosterSaveModel(
+ self._handle,
+ ctypes.c_int(start_iteration),
+ ctypes.c_int(num_iteration),
+ ctypes.c_int(importance_type_int),
+ _c_str(str(filename))))
+ _dump_pandas_categorical(self.pandas_categorical, filename)
+ return self
+
+ def shuffle_models(
+ self,
+ start_iteration: int = 0,
+ end_iteration: int = -1
+ ) -> "Booster":
+ """Shuffle models.
+
+ Parameters
+ ----------
+ start_iteration : int, optional (default=0)
+ The first iteration that will be shuffled.
+ end_iteration : int, optional (default=-1)
+ The last iteration that will be shuffled.
+ If <= 0, means the last available iteration.
+
+ Returns
+ -------
+ self : Booster
+ Booster with shuffled models.
+ """
+ _safe_call(_LIB.LGBM_BoosterShuffleModels(
+ self._handle,
+ ctypes.c_int(start_iteration),
+ ctypes.c_int(end_iteration)))
+ return self
+
+ def model_from_string(self, model_str: str) -> "Booster":
+ """Load Booster from a string.
+
+ Parameters
+ ----------
+ model_str : str
+ Model will be loaded from this string.
+
+ Returns
+ -------
+ self : Booster
+ Loaded Booster object.
+ """
+ # ensure that existing Booster is freed before replacing it
+ # with a new one created from file
+ _safe_call(_LIB.LGBM_BoosterFree(self._handle))
+ self._free_buffer()
+ self._handle = ctypes.c_void_p()
+ out_num_iterations = ctypes.c_int(0)
+ _safe_call(_LIB.LGBM_BoosterLoadModelFromString(
+ _c_str(model_str),
+ ctypes.byref(out_num_iterations),
+ ctypes.byref(self._handle)))
+ out_num_class = ctypes.c_int(0)
+ _safe_call(_LIB.LGBM_BoosterGetNumClasses(
+ self._handle,
+ ctypes.byref(out_num_class)))
+ self.__num_class = out_num_class.value
+ self.pandas_categorical = _load_pandas_categorical(model_str=model_str)
+ return self
+
+ def model_to_string(
+ self,
+ num_iteration: Optional[int] = None,
+ start_iteration: int = 0,
+ importance_type: str = 'split'
+ ) -> str:
+ """Save Booster to string.
+
+ Parameters
+ ----------
+ num_iteration : int or None, optional (default=None)
+ Index of the iteration that should be saved.
+ If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
+ If <= 0, all iterations are saved.
+ start_iteration : int, optional (default=0)
+ Start index of the iteration that should be saved.
+ importance_type : str, optional (default="split")
+ What type of feature importance should be saved.
+ If "split", result contains numbers of times the feature is used in a model.
+ If "gain", result contains total gains of splits which use the feature.
+
+ Returns
+ -------
+ str_repr : str
+ String representation of Booster.
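+
+ Examples
+ --------
+ A round-trip sketch; ``bst`` is an illustrative, already-trained Booster:
+
+ >>> model_str = bst.model_to_string() # doctest: +SKIP
+ >>> bst_copy = lgb.Booster(model_str=model_str) # doctest: +SKIP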
+ """ + if num_iteration is None: + num_iteration = self.best_iteration + importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call(_LIB.LGBM_BoosterSaveModelToString( + self._handle, + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + ctypes.c_int(importance_type_int), + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call(_LIB.LGBM_BoosterSaveModelToString( + self._handle, + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + ctypes.c_int(importance_type_int), + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + ret = string_buffer.value.decode('utf-8') + ret += _dump_pandas_categorical(self.pandas_categorical) + return ret + + def dump_model( + self, + num_iteration: Optional[int] = None, + start_iteration: int = 0, + importance_type: str = 'split', + object_hook: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None + ) -> Dict[str, Any]: + """Dump Booster to JSON format. + + Parameters + ---------- + num_iteration : int or None, optional (default=None) + Index of the iteration that should be dumped. + If None, if the best iteration exists, it is dumped; otherwise, all iterations are dumped. + If <= 0, all iterations are dumped. + start_iteration : int, optional (default=0) + Start index of the iteration that should be dumped. + importance_type : str, optional (default="split") + What type of feature importance should be dumped. + If "split", result contains numbers of times the feature is used in a model. + If "gain", result contains total gains of splits which use the feature. + object_hook : callable or None, optional (default=None) + If not None, ``object_hook`` is a function called while parsing the json + string returned by the C API. It may be used to alter the json, to store + specific values while building the json structure. It avoids + walking through the structure again. It saves a significant amount + of time if the number of trees is huge. + Signature is ``def object_hook(node: dict) -> dict``. + None is equivalent to ``lambda node: node``. + See documentation of ``json.loads()`` for further details. + + Returns + ------- + json_repr : dict + JSON format of Booster. 
+ """ + if num_iteration is None: + num_iteration = self.best_iteration + importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call(_LIB.LGBM_BoosterDumpModel( + self._handle, + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + ctypes.c_int(importance_type_int), + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, reallocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call(_LIB.LGBM_BoosterDumpModel( + self._handle, + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + ctypes.c_int(importance_type_int), + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + ret = json.loads(string_buffer.value.decode('utf-8'), object_hook=object_hook) + ret['pandas_categorical'] = json.loads(json.dumps(self.pandas_categorical, + default=_json_default_with_numpy)) + return ret + + def predict( + self, + data: _LGBM_PredictDataType, + start_iteration: int = 0, + num_iteration: Optional[int] = None, + raw_score: bool = False, + pred_leaf: bool = False, + pred_contrib: bool = False, + data_has_header: bool = False, + validate_features: bool = False, + **kwargs: Any + ) -> Union[np.ndarray, scipy.sparse.spmatrix, List[scipy.sparse.spmatrix]]: + """Make a prediction. + + Parameters + ---------- + data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse + Data source for prediction. + If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). + start_iteration : int, optional (default=0) + Start index of the iteration to predict. + If <= 0, starts from the first iteration. + num_iteration : int or None, optional (default=None) + Total number of iterations used in the prediction. + If None, if the best iteration exists and start_iteration <= 0, the best iteration is used; + otherwise, all iterations from ``start_iteration`` are used (no limits). + If <= 0, all iterations from ``start_iteration`` are used (no limits). + raw_score : bool, optional (default=False) + Whether to predict raw scores. + pred_leaf : bool, optional (default=False) + Whether to predict leaf index. + pred_contrib : bool, optional (default=False) + Whether to predict feature contributions. + + .. note:: + + If you want to get more explanations for your model's predictions using SHAP values, + like SHAP interaction values, + you can install the shap package (https://github.com/slundberg/shap). + Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra + column, where the last column is the expected value. + + data_has_header : bool, optional (default=False) + Whether the data has header. + Used only if data is str. + validate_features : bool, optional (default=False) + If True, ensure that the features used to predict match the ones used to train. + Used only if data is pandas DataFrame. + **kwargs + Other parameters for the prediction. + + Returns + ------- + result : numpy array, scipy.sparse or list of scipy.sparse + Prediction result. 
+ Can be sparse or a list of sparse objects (each element represents predictions for one class) for feature contributions (when ``pred_contrib=True``). + """ + predictor = _InnerPredictor.from_booster( + booster=self, + pred_parameter=deepcopy(kwargs), + ) + if num_iteration is None: + if start_iteration <= 0: + num_iteration = self.best_iteration + else: + num_iteration = -1 + return predictor.predict( + data=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + raw_score=raw_score, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + data_has_header=data_has_header, + validate_features=validate_features + ) + + def refit( + self, + data: _LGBM_TrainDataType, + label: _LGBM_LabelType, + decay_rate: float = 0.9, + reference: Optional[Dataset] = None, + weight: Optional[_LGBM_WeightType] = None, + group: Optional[_LGBM_GroupType] = None, + init_score: Optional[_LGBM_InitScoreType] = None, + feature_name: _LGBM_FeatureNameConfiguration = 'auto', + categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + dataset_params: Optional[Dict[str, Any]] = None, + free_raw_data: bool = True, + validate_features: bool = False, + **kwargs + ) -> "Booster": + """Refit the existing Booster by new data. + + Parameters + ---------- + data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence or list of numpy array + Data source for refit. + If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). + label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array or pyarrow ChunkedArray + Label for refit. + decay_rate : float, optional (default=0.9) + Decay rate of refit, + will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees. + reference : Dataset or None, optional (default=None) + Reference for ``data``. + + .. versionadded:: 4.0.0 + + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) + Weight for each ``data`` instance. Weights should be non-negative. + + .. versionadded:: 4.0.0 + + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) + Group/query size for ``data``. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + + .. versionadded:: 4.0.0 + + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) + Init score for ``data``. + + .. versionadded:: 4.0.0 + + feature_name : list of str, or 'auto', optional (default="auto") + Feature names for ``data``. + If 'auto' and data is pandas DataFrame, data columns names are used. + + .. versionadded:: 4.0.0 + + categorical_feature : list of str or int, or 'auto', optional (default="auto") + Categorical features for ``data``. + If list of int, interpreted as indices. + If list of str, interpreted as feature names (need to specify ``feature_name`` as well). + If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. 
+ All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). + Large values could be memory consuming. Consider using consecutive integers starting from zero. + All negative values in categorical features will be treated as missing values. + The output cannot be monotonically constrained with respect to a categorical feature. + Floating point numbers in categorical features will be rounded towards 0. + + .. versionadded:: 4.0.0 + + dataset_params : dict or None, optional (default=None) + Other parameters for Dataset ``data``. + + .. versionadded:: 4.0.0 + + free_raw_data : bool, optional (default=True) + If True, raw data is freed after constructing inner Dataset for ``data``. + + .. versionadded:: 4.0.0 + + validate_features : bool, optional (default=False) + If True, ensure that the features used to refit the model match the original ones. + Used only if data is pandas DataFrame. + + .. versionadded:: 4.0.0 + + **kwargs + Other parameters for refit. + These parameters will be passed to ``predict`` method. + + Returns + ------- + result : Booster + Refitted Booster. + """ + if self.__set_objective_to_none: + raise LightGBMError('Cannot refit due to null objective function.') + if dataset_params is None: + dataset_params = {} + predictor = _InnerPredictor.from_booster( + booster=self, + pred_parameter=deepcopy(kwargs) + ) + leaf_preds: np.ndarray = predictor.predict( # type: ignore[assignment] + data=data, + start_iteration=-1, + pred_leaf=True, + validate_features=validate_features + ) + nrow, ncol = leaf_preds.shape + out_is_linear = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterGetLinear( + self._handle, + ctypes.byref(out_is_linear))) + new_params = _choose_param_value( + main_param_name="linear_tree", + params=self.params, + default_value=None + ) + new_params["linear_tree"] = bool(out_is_linear.value) + new_params.update(dataset_params) + train_set = Dataset( + data=data, + label=label, + reference=reference, + weight=weight, + group=group, + init_score=init_score, + feature_name=feature_name, + categorical_feature=categorical_feature, + params=new_params, + free_raw_data=free_raw_data, + ) + new_params['refit_decay_rate'] = decay_rate + new_booster = Booster(new_params, train_set) + # Copy models + _safe_call(_LIB.LGBM_BoosterMerge( + new_booster._handle, + predictor._handle)) + leaf_preds = leaf_preds.reshape(-1) + ptr_data, _, _ = _c_int_array(leaf_preds) + _safe_call(_LIB.LGBM_BoosterRefit( + new_booster._handle, + ptr_data, + ctypes.c_int32(nrow), + ctypes.c_int32(ncol))) + new_booster._network = self._network + return new_booster + + def get_leaf_output(self, tree_id: int, leaf_id: int) -> float: + """Get the output of a leaf. + + Parameters + ---------- + tree_id : int + The index of the tree. + leaf_id : int + The index of the leaf in the tree. + + Returns + ------- + result : float + The output of the leaf. + """ + ret = ctypes.c_double(0) + _safe_call(_LIB.LGBM_BoosterGetLeafValue( + self._handle, + ctypes.c_int(tree_id), + ctypes.c_int(leaf_id), + ctypes.byref(ret))) + return ret.value + + def set_leaf_output( + self, + tree_id: int, + leaf_id: int, + value: float, + ) -> 'Booster': + """Set the output of a leaf. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + tree_id : int + The index of the tree. + leaf_id : int + The index of the leaf in the tree. + value : float + Value to set as the output of the leaf. + + Returns + ------- + self : Booster + Booster with the leaf output set. 
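+
+ Examples
+ --------
+ A sketch pairing this with ``get_leaf_output``; ``bst`` is an
+ illustrative, already-trained Booster:
+
+ >>> old_value = bst.get_leaf_output(tree_id=0, leaf_id=0) # doctest: +SKIP
+ >>> bst = bst.set_leaf_output(tree_id=0, leaf_id=0, value=old_value * 0.5) # doctest: +SKIP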
+ """ + _safe_call( + _LIB.LGBM_BoosterSetLeafValue( + self._handle, + ctypes.c_int(tree_id), + ctypes.c_int(leaf_id), + ctypes.c_double(value) + ) + ) + return self + + def num_feature(self) -> int: + """Get number of features. + + Returns + ------- + num_feature : int + The number of features. + """ + out_num_feature = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterGetNumFeature( + self._handle, + ctypes.byref(out_num_feature))) + return out_num_feature.value + + def feature_name(self) -> List[str]: + """Get names of features. + + Returns + ------- + result : list of str + List with names of features. + """ + num_feature = self.num_feature() + # Get name of features + tmp_out_len = ctypes.c_int(0) + reserved_string_buffer_size = 255 + required_string_buffer_size = ctypes.c_size_t(0) + string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)] + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] + _safe_call(_LIB.LGBM_BoosterGetFeatureNames( + self._handle, + ctypes.c_int(num_feature), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(reserved_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers)) + if num_feature != tmp_out_len.value: + raise ValueError("Length of feature names doesn't equal with num_feature") + actual_string_buffer_size = required_string_buffer_size.value + # if buffer length is not long enough, reallocate buffers + if reserved_string_buffer_size < actual_string_buffer_size: + string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)] + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] + _safe_call(_LIB.LGBM_BoosterGetFeatureNames( + self._handle, + ctypes.c_int(num_feature), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(actual_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers)) + return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)] + + def feature_importance( + self, + importance_type: str = 'split', + iteration: Optional[int] = None + ) -> np.ndarray: + """Get feature importances. + + Parameters + ---------- + importance_type : str, optional (default="split") + How the importance is calculated. + If "split", result contains numbers of times the feature is used in a model. + If "gain", result contains total gains of splits which use the feature. + iteration : int or None, optional (default=None) + Limit number of iterations in the feature importance calculation. + If None, if the best iteration exists, it is used; otherwise, all trees are used. + If <= 0, all trees are used (no limits). + + Returns + ------- + result : numpy array + Array with feature importances. 
+ """ + if iteration is None: + iteration = self.best_iteration + importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] + result = np.empty(self.num_feature(), dtype=np.float64) + _safe_call(_LIB.LGBM_BoosterFeatureImportance( + self._handle, + ctypes.c_int(iteration), + ctypes.c_int(importance_type_int), + result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if importance_type_int == _C_API_FEATURE_IMPORTANCE_SPLIT: + return result.astype(np.int32) + else: + return result + + def get_split_value_histogram( + self, + feature: Union[int, str], + bins: Optional[Union[int, str]] = None, + xgboost_style: bool = False + ) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray, pd_DataFrame]: + """Get split value histogram for the specified feature. + + Parameters + ---------- + feature : int or str + The feature name or index the histogram is calculated for. + If int, interpreted as index. + If str, interpreted as name. + + .. warning:: + + Categorical features are not supported. + + bins : int, str or None, optional (default=None) + The maximum number of bins. + If None, or int and > number of unique split values and ``xgboost_style=True``, + the number of bins equals number of unique split values. + If str, it should be one from the list of the supported values by ``numpy.histogram()`` function. + xgboost_style : bool, optional (default=False) + Whether the returned result should be in the same form as it is in XGBoost. + If False, the returned value is tuple of 2 numpy arrays as it is in ``numpy.histogram()`` function. + If True, the returned value is matrix, in which the first column is the right edges of non-empty bins + and the second one is the histogram values. + + Returns + ------- + result_tuple : tuple of 2 numpy arrays + If ``xgboost_style=False``, the values of the histogram of used splitting values for the specified feature + and the bin edges. + result_array_like : numpy array or pandas DataFrame (if pandas is installed) + If ``xgboost_style=True``, the histogram of used splitting values for the specified feature. 
+ """ + def add(root: Dict[str, Any]) -> None: + """Recursively add thresholds.""" + if 'split_index' in root: # non-leaf + if feature_names is not None and isinstance(feature, str): + split_feature = feature_names[root['split_feature']] + else: + split_feature = root['split_feature'] + if split_feature == feature: + if isinstance(root['threshold'], str): + raise LightGBMError('Cannot compute split value histogram for the categorical feature') + else: + values.append(root['threshold']) + add(root['left_child']) + add(root['right_child']) + + model = self.dump_model() + feature_names = model.get('feature_names') + tree_infos = model['tree_info'] + values: List[float] = [] + for tree_info in tree_infos: + add(tree_info['tree_structure']) + + if bins is None or isinstance(bins, int) and xgboost_style: + n_unique = len(np.unique(values)) + bins = max(min(n_unique, bins) if bins is not None else n_unique, 1) + hist, bin_edges = np.histogram(values, bins=bins) + if xgboost_style: + ret = np.column_stack((bin_edges[1:], hist)) + ret = ret[ret[:, 1] > 0] + if PANDAS_INSTALLED: + return pd_DataFrame(ret, columns=['SplitValue', 'Count']) + else: + return ret + else: + return hist, bin_edges + + def __inner_eval( + self, + data_name: str, + data_idx: int, + feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] + ) -> List[_LGBM_BoosterEvalMethodResultType]: + """Evaluate training or validation data.""" + if data_idx >= self.__num_dataset: + raise ValueError("Data_idx should be smaller than number of dataset") + self.__get_eval_info() + ret = [] + if self.__num_inner_eval > 0: + result = np.empty(self.__num_inner_eval, dtype=np.float64) + tmp_out_len = ctypes.c_int(0) + _safe_call(_LIB.LGBM_BoosterGetEval( + self._handle, + ctypes.c_int(data_idx), + ctypes.byref(tmp_out_len), + result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if tmp_out_len.value != self.__num_inner_eval: + raise ValueError("Wrong length of eval results") + for i in range(self.__num_inner_eval): + ret.append((data_name, self.__name_inner_eval[i], + result[i], self.__higher_better_inner_eval[i])) + if callable(feval): + feval = [feval] + if feval is not None: + if data_idx == 0: + cur_data = self.train_set + else: + cur_data = self.valid_sets[data_idx - 1] + for eval_function in feval: + if eval_function is None: + continue + feval_ret = eval_function(self.__inner_predict(data_idx), cur_data) + if isinstance(feval_ret, list): + for eval_name, val, is_higher_better in feval_ret: + ret.append((data_name, eval_name, val, is_higher_better)) + else: + eval_name, val, is_higher_better = feval_ret + ret.append((data_name, eval_name, val, is_higher_better)) + return ret + + def __inner_predict(self, data_idx: int) -> np.ndarray: + """Predict for training and validation dataset.""" + if data_idx >= self.__num_dataset: + raise ValueError("Data_idx should be smaller than number of dataset") + if self.__inner_predict_buffer[data_idx] is None: + if data_idx == 0: + n_preds = self.train_set.num_data() * self.__num_class + else: + n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class + self.__inner_predict_buffer[data_idx] = np.empty(n_preds, dtype=np.float64) + # avoid to predict many time in one iteration + if not self.__is_predicted_cur_iter[data_idx]: + tmp_out_len = ctypes.c_int64(0) + data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double)) # type: ignore[union-attr] + _safe_call(_LIB.LGBM_BoosterGetPredict( + self._handle, + ctypes.c_int(data_idx), + 
ctypes.byref(tmp_out_len), + data_ptr)) + if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]): # type: ignore[arg-type] + raise ValueError(f"Wrong length of predict results for data {data_idx}") + self.__is_predicted_cur_iter[data_idx] = True + result: np.ndarray = self.__inner_predict_buffer[data_idx] # type: ignore[assignment] + if self.__num_class > 1: + num_data = result.size // self.__num_class + result = result.reshape(num_data, self.__num_class, order='F') + return result + + def __get_eval_info(self) -> None: + """Get inner evaluation count and names.""" + if self.__need_reload_eval_info: + self.__need_reload_eval_info = False + out_num_eval = ctypes.c_int(0) + # Get num of inner evals + _safe_call(_LIB.LGBM_BoosterGetEvalCounts( + self._handle, + ctypes.byref(out_num_eval))) + self.__num_inner_eval = out_num_eval.value + if self.__num_inner_eval > 0: + # Get name of eval metrics + tmp_out_len = ctypes.c_int(0) + reserved_string_buffer_size = 255 + required_string_buffer_size = ctypes.c_size_t(0) + string_buffers = [ + ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(self.__num_inner_eval) + ] + ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] + _safe_call(_LIB.LGBM_BoosterGetEvalNames( + self._handle, + ctypes.c_int(self.__num_inner_eval), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(reserved_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers)) + if self.__num_inner_eval != tmp_out_len.value: + raise ValueError("Length of eval names doesn't equal with num_evals") + actual_string_buffer_size = required_string_buffer_size.value + # if buffer length is not long enough, reallocate buffers + if reserved_string_buffer_size < actual_string_buffer_size: + string_buffers = [ + ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(self.__num_inner_eval) + ] + ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] + _safe_call(_LIB.LGBM_BoosterGetEvalNames( + self._handle, + ctypes.c_int(self.__num_inner_eval), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(actual_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers)) + self.__name_inner_eval = [ + string_buffers[i].value.decode('utf-8') for i in range(self.__num_inner_eval) + ] + self.__higher_better_inner_eval = [ + name.startswith(('auc', 'ndcg@', 'map@', 'average_precision')) for name in self.__name_inner_eval + ] diff --git a/ext/lightgbm/callback.py b/ext/lightgbm/callback.py new file mode 100644 index 0000000..b68bb63 --- /dev/null +++ b/ext/lightgbm/callback.py @@ -0,0 +1,470 @@ +# coding: utf-8 +"""Callbacks library.""" +from collections import OrderedDict +from dataclasses import dataclass +from functools import partial +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union + +from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType, + _LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning) + +if TYPE_CHECKING: + from .engine import CVBooster + +__all__ = [ + 'EarlyStopException', + 'early_stopping', + 'log_evaluation', + 'record_evaluation', + 'reset_parameter', +] + +_EvalResultDict = Dict[str, Dict[str, List[Any]]] +_EvalResultTuple = Union[ + _LGBM_BoosterEvalMethodResultType, + _LGBM_BoosterEvalMethodResultWithStandardDeviationType +] +_ListOfEvalResultTuples = Union[ + 
List[_LGBM_BoosterEvalMethodResultType], + List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType] +] + + +class EarlyStopException(Exception): + """Exception of early stopping. + + Raise this from a callback passed in via keyword argument ``callbacks`` + in ``cv()`` or ``train()`` to trigger early stopping. + """ + + def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> None: + """Create early stopping exception. + + Parameters + ---------- + best_iteration : int + The best iteration stopped. + 0-based... pass ``best_iteration=2`` to indicate that the third iteration was the best one. + best_score : list of (eval_name, metric_name, eval_result, is_higher_better) tuple or (eval_name, metric_name, eval_result, is_higher_better, stdv) tuple + Scores for each metric, on each validation set, as of the best iteration. + """ + super().__init__() + self.best_iteration = best_iteration + self.best_score = best_score + + +# Callback environment used by callbacks +@dataclass +class CallbackEnv: + model: Union[Booster, "CVBooster"] + params: Dict[str, Any] + iteration: int + begin_iteration: int + end_iteration: int + evaluation_result_list: Optional[_ListOfEvalResultTuples] + + +def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str: + """Format metric string.""" + if len(value) == 4: + return f"{value[0]}'s {value[1]}: {value[2]:g}" + elif len(value) == 5: + if show_stdv: + return f"{value[0]}'s {value[1]}: {value[2]:g} + {value[4]:g}" # type: ignore[misc] + else: + return f"{value[0]}'s {value[1]}: {value[2]:g}" + else: + raise ValueError("Wrong metric value") + + +class _LogEvaluationCallback: + """Internal log evaluation callable class.""" + + def __init__(self, period: int = 1, show_stdv: bool = True) -> None: + self.order = 10 + self.before_iteration = False + + self.period = period + self.show_stdv = show_stdv + + def __call__(self, env: CallbackEnv) -> None: + if self.period > 0 and env.evaluation_result_list and (env.iteration + 1) % self.period == 0: + result = '\t'.join([_format_eval_result(x, self.show_stdv) for x in env.evaluation_result_list]) + _log_info(f'[{env.iteration + 1}]\t{result}') + + +def log_evaluation(period: int = 1, show_stdv: bool = True) -> _LogEvaluationCallback: + """Create a callback that logs the evaluation results. + + By default, standard output resource is used. + Use ``register_logger()`` function to register a custom logger. + + Note + ---- + Requires at least one validation data. + + Parameters + ---------- + period : int, optional (default=1) + The period to log the evaluation results. + The last boosting stage or the boosting stage found by using ``early_stopping`` callback is also logged. + show_stdv : bool, optional (default=True) + Whether to log stdv (if provided). + + Returns + ------- + callback : _LogEvaluationCallback + The callback that logs the evaluation results every ``period`` boosting iteration(s). + """ + return _LogEvaluationCallback(period=period, show_stdv=show_stdv) + + +class _RecordEvaluationCallback: + """Internal record evaluation callable class.""" + + def __init__(self, eval_result: _EvalResultDict) -> None: + self.order = 20 + self.before_iteration = False + + if not isinstance(eval_result, dict): + raise TypeError('eval_result should be a dictionary') + self.eval_result = eval_result + + def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. 
This is probably a bug in LightGBM. "
+ "Please report it at https://github.com/microsoft/LightGBM/issues"
+ )
+ self.eval_result.clear()
+ for item in env.evaluation_result_list:
+ if len(item) == 4: # regular train
+ data_name, eval_name = item[:2]
+ else: # cv
+ data_name, eval_name = item[1].split()
+ self.eval_result.setdefault(data_name, OrderedDict())
+ if len(item) == 4:
+ self.eval_result[data_name].setdefault(eval_name, [])
+ else:
+ self.eval_result[data_name].setdefault(f'{eval_name}-mean', [])
+ self.eval_result[data_name].setdefault(f'{eval_name}-stdv', [])
+
+ def __call__(self, env: CallbackEnv) -> None:
+ if env.iteration == env.begin_iteration:
+ self._init(env)
+ if env.evaluation_result_list is None:
+ raise RuntimeError(
+ "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. "
+ "Please report it at https://github.com/microsoft/LightGBM/issues"
+ )
+ for item in env.evaluation_result_list:
+ if len(item) == 4:
+ data_name, eval_name, result = item[:3]
+ self.eval_result[data_name][eval_name].append(result)
+ else:
+ data_name, eval_name = item[1].split()
+ res_mean = item[2]
+ res_stdv = item[4] # type: ignore[misc]
+ self.eval_result[data_name][f'{eval_name}-mean'].append(res_mean)
+ self.eval_result[data_name][f'{eval_name}-stdv'].append(res_stdv)
+
+
+def record_evaluation(eval_result: Dict[str, Dict[str, List[Any]]]) -> Callable:
+ """Create a callback that records the evaluation history into ``eval_result``.
+
+ Parameters
+ ----------
+ eval_result : dict
+ Dictionary used to store all evaluation results of all validation sets.
+ This should be initialized outside of your call to ``record_evaluation()`` and should be empty.
+ Any initial contents of the dictionary will be deleted.
+
+ .. rubric:: Example
+
+ With two validation sets named 'eval' and 'train', and one evaluation metric named 'logloss'
+ this dictionary after finishing a model training process will have the following structure:
+
+ .. code-block::
+
+ {
+ 'train':
+ {
+ 'logloss': [0.48253, 0.35953, ...]
+ },
+ 'eval':
+ {
+ 'logloss': [0.480385, 0.357756, ...]
+ }
+ }
+
+ Returns
+ -------
+ callback : _RecordEvaluationCallback
+ The callback that records the evaluation history into the passed dictionary.
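+
+ A usage sketch; ``params``, ``dtrain`` and ``dvalid`` are illustrative
+ placeholders:
+
+ >>> eval_result = {}
+ >>> bst = lgb.train(params, dtrain, valid_sets=[dvalid], valid_names=['eval'],
+ ... callbacks=[lgb.record_evaluation(eval_result)]) # doctest: +SKIP
+ >>> list(eval_result.keys()) # doctest: +SKIP
+ ['eval']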
+ """ + return _RecordEvaluationCallback(eval_result=eval_result) + + +class _ResetParameterCallback: + """Internal reset parameter callable class.""" + + def __init__(self, **kwargs: Union[list, Callable]) -> None: + self.order = 10 + self.before_iteration = True + + self.kwargs = kwargs + + def __call__(self, env: CallbackEnv) -> None: + new_parameters = {} + for key, value in self.kwargs.items(): + if isinstance(value, list): + if len(value) != env.end_iteration - env.begin_iteration: + raise ValueError(f"Length of list {key!r} has to be equal to 'num_boost_round'.") + new_param = value[env.iteration - env.begin_iteration] + elif callable(value): + new_param = value(env.iteration - env.begin_iteration) + else: + raise ValueError("Only list and callable values are supported " + "as a mapping from boosting round index to new parameter value.") + if new_param != env.params.get(key, None): + new_parameters[key] = new_param + if new_parameters: + if isinstance(env.model, Booster): + env.model.reset_parameter(new_parameters) + else: + # CVBooster holds a list of Booster objects, each needs to be updated + for booster in env.model.boosters: + booster.reset_parameter(new_parameters) + env.params.update(new_parameters) + + +def reset_parameter(**kwargs: Union[list, Callable]) -> Callable: + """Create a callback that resets the parameter after the first iteration. + + .. note:: + + The initial parameter will still take in-effect on first iteration. + + Parameters + ---------- + **kwargs : value should be list or callable + List of parameters for each boosting round + or a callable that calculates the parameter in terms of + current number of round (e.g. yields learning rate decay). + If list lst, parameter = lst[current_round]. + If callable func, parameter = func(current_round). + + Returns + ------- + callback : _ResetParameterCallback + The callback that resets the parameter after the first iteration. + """ + return _ResetParameterCallback(**kwargs) + + +class _EarlyStoppingCallback: + """Internal early stopping callable class.""" + + def __init__( + self, + stopping_rounds: int, + first_metric_only: bool = False, + verbose: bool = True, + min_delta: Union[float, List[float]] = 0.0 + ) -> None: + + if not isinstance(stopping_rounds, int) or stopping_rounds <= 0: + raise ValueError(f"stopping_rounds should be an integer and greater than 0. 
got: {stopping_rounds}") + + self.order = 30 + self.before_iteration = False + + self.stopping_rounds = stopping_rounds + self.first_metric_only = first_metric_only + self.verbose = verbose + self.min_delta = min_delta + + self.enabled = True + self._reset_storages() + + def _reset_storages(self) -> None: + self.best_score: List[float] = [] + self.best_iter: List[int] = [] + self.best_score_list: List[_ListOfEvalResultTuples] = [] + self.cmp_op: List[Callable[[float, float], bool]] = [] + self.first_metric = '' + + def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: + return curr_score > best_score + delta + + def _lt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: + return curr_score < best_score - delta + + def _is_train_set(self, ds_name: str, eval_name: str, env: CallbackEnv) -> bool: + """Check, by name, if a given Dataset is the training data.""" + # for lgb.cv() with eval_train_metric=True, evaluation is also done on the training set + # and those metrics are considered for early stopping + if ds_name == "cv_agg" and eval_name == "train": + return True + + # for lgb.train(), it's possible to pass the training data via valid_sets with any eval_name + if isinstance(env.model, Booster) and ds_name == env.model._train_data_name: + return True + + return False + + def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None or env.evaluation_result_list == []: + raise ValueError( + "For early stopping, at least one dataset and eval metric is required for evaluation" + ) + + is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting")) + if is_dart: + self.enabled = False + _log_warning('Early stopping is not available in dart mode') + return + + # validation sets are guaranteed to not be identical to the training data in cv() + if isinstance(env.model, Booster): + only_train_set = ( + len(env.evaluation_result_list) == 1 + and self._is_train_set( + ds_name=env.evaluation_result_list[0][0], + eval_name=env.evaluation_result_list[0][1].split(" ")[0], + env=env + ) + ) + if only_train_set: + self.enabled = False + _log_warning('Only training set found, disabling early stopping.') + return + + if self.verbose: + _log_info(f"Training until validation scores don't improve for {self.stopping_rounds} rounds") + + self._reset_storages() + + n_metrics = len({m[1] for m in env.evaluation_result_list}) + n_datasets = len(env.evaluation_result_list) // n_metrics + if isinstance(self.min_delta, list): + if not all(t >= 0 for t in self.min_delta): + raise ValueError('Values for early stopping min_delta must be non-negative.') + if len(self.min_delta) == 0: + if self.verbose: + _log_info('Disabling min_delta for early stopping.') + deltas = [0.0] * n_datasets * n_metrics + elif len(self.min_delta) == 1: + if self.verbose: + _log_info(f'Using {self.min_delta[0]} as min_delta for all metrics.') + deltas = self.min_delta * n_datasets * n_metrics + else: + if len(self.min_delta) != n_metrics: + raise ValueError('Must provide a single value for min_delta or as many as metrics.') + if self.first_metric_only and self.verbose: + _log_info(f'Using only {self.min_delta[0]} as early stopping min_delta.') + deltas = self.min_delta * n_datasets + else: + if self.min_delta < 0: + raise ValueError('Early stopping min_delta must be non-negative.') + if self.min_delta > 0 and n_metrics > 1 and not self.first_metric_only and self.verbose: + _log_info(f'Using {self.min_delta} as min_delta for all metrics.') + 
deltas = [self.min_delta] * n_datasets * n_metrics
+
+        # split is needed for "<dataset type> <metric>" case (e.g. "train l1")
+        self.first_metric = env.evaluation_result_list[0][1].split(" ")[-1]
+        for eval_ret, delta in zip(env.evaluation_result_list, deltas):
+            self.best_iter.append(0)
+            if eval_ret[3]:  # greater is better
+                self.best_score.append(float('-inf'))
+                self.cmp_op.append(partial(self._gt_delta, delta=delta))
+            else:
+                self.best_score.append(float('inf'))
+                self.cmp_op.append(partial(self._lt_delta, delta=delta))
+
+    def _final_iteration_check(self, env: CallbackEnv, eval_name_splitted: List[str], i: int) -> None:
+        if env.iteration == env.end_iteration - 1:
+            if self.verbose:
+                best_score_str = '\t'.join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]])
+                _log_info('Did not meet early stopping. '
+                          f'Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}')
+                if self.first_metric_only:
+                    _log_info(f"Evaluated only: {eval_name_splitted[-1]}")
+            raise EarlyStopException(self.best_iter[i], self.best_score_list[i])
+
+    def __call__(self, env: CallbackEnv) -> None:
+        if env.iteration == env.begin_iteration:
+            self._init(env)
+        if not self.enabled:
+            return
+        if env.evaluation_result_list is None:
+            raise RuntimeError(
+                "early_stopping() callback enabled but no evaluation results found. This is probably a bug in LightGBM. "
+                "Please report it at https://github.com/microsoft/LightGBM/issues"
+            )
+        # self.best_score_list is initialized to an empty list
+        first_time_updating_best_score_list = (self.best_score_list == [])
+        for i in range(len(env.evaluation_result_list)):
+            score = env.evaluation_result_list[i][2]
+            if first_time_updating_best_score_list or self.cmp_op[i](score, self.best_score[i]):
+                self.best_score[i] = score
+                self.best_iter[i] = env.iteration
+                if first_time_updating_best_score_list:
+                    self.best_score_list.append(env.evaluation_result_list)
+                else:
+                    self.best_score_list[i] = env.evaluation_result_list
+            # split is needed for "<dataset type> <metric>" case (e.g. "train l1")
+            eval_name_splitted = env.evaluation_result_list[i][1].split(" ")
+            if self.first_metric_only and self.first_metric != eval_name_splitted[-1]:
+                continue  # use only the first metric for early stopping
+            if self._is_train_set(
+                ds_name=env.evaluation_result_list[i][0],
+                eval_name=eval_name_splitted[0],
+                env=env
+            ):
+                continue  # train data for lgb.cv or sklearn wrapper (underlying lgb.train)
+            elif env.iteration - self.best_iter[i] >= self.stopping_rounds:
+                if self.verbose:
+                    eval_result_str = '\t'.join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]])
+                    _log_info(f"Early stopping, best iteration is:\n[{self.best_iter[i] + 1}]\t{eval_result_str}")
+                    if self.first_metric_only:
+                        _log_info(f"Evaluated only: {eval_name_splitted[-1]}")
+                raise EarlyStopException(self.best_iter[i], self.best_score_list[i])
+            self._final_iteration_check(env, eval_name_splitted, i)
+
+
+def early_stopping(stopping_rounds: int, first_metric_only: bool = False, verbose: bool = True, min_delta: Union[float, List[float]] = 0.0) -> _EarlyStoppingCallback:
+    """Create a callback that activates early stopping.
+
+    The model will train until the validation score stops improving by at least ``min_delta``.
+    Validation score needs to improve at least every ``stopping_rounds`` round(s)
+    to continue training.
+    Requires at least one validation dataset and one metric.
+    If there's more than one, all of them will be checked.
But the training data is ignored anyway. + To check only the first metric set ``first_metric_only`` to True. + The index of iteration that has the best performance will be saved in the ``best_iteration`` attribute of a model. + + Parameters + ---------- + stopping_rounds : int + The possible number of rounds without the trend occurrence. + first_metric_only : bool, optional (default=False) + Whether to use only the first metric for early stopping. + verbose : bool, optional (default=True) + Whether to log message with early stopping information. + By default, standard output resource is used. + Use ``register_logger()`` function to register a custom logger. + min_delta : float or list of float, optional (default=0.0) + Minimum improvement in score to keep training. + If float, this single value is used for all metrics. + If list, its length should match the total number of metrics. + + .. versionadded:: 4.0.0 + + Returns + ------- + callback : _EarlyStoppingCallback + The callback that activates early stopping. + """ + return _EarlyStoppingCallback(stopping_rounds=stopping_rounds, first_metric_only=first_metric_only, verbose=verbose, min_delta=min_delta) diff --git a/ext/lightgbm/compat.py b/ext/lightgbm/compat.py new file mode 100644 index 0000000..bd1b29a --- /dev/null +++ b/ext/lightgbm/compat.py @@ -0,0 +1,269 @@ +# coding: utf-8 +"""Compatibility library.""" + +from typing import List + +"""pandas""" +try: + from pandas import DataFrame as pd_DataFrame + from pandas import Series as pd_Series + from pandas import concat + try: + from pandas import CategoricalDtype as pd_CategoricalDtype + except ImportError: + from pandas.api.types import CategoricalDtype as pd_CategoricalDtype + PANDAS_INSTALLED = True +except ImportError: + PANDAS_INSTALLED = False + + class pd_Series: # type: ignore + """Dummy class for pandas.Series.""" + + def __init__(self, *args, **kwargs): + pass + + class pd_DataFrame: # type: ignore + """Dummy class for pandas.DataFrame.""" + + def __init__(self, *args, **kwargs): + pass + + class pd_CategoricalDtype: # type: ignore + """Dummy class for pandas.CategoricalDtype.""" + + def __init__(self, *args, **kwargs): + pass + + concat = None + +"""numpy""" +try: + from numpy.random import Generator as np_random_Generator +except ImportError: + class np_random_Generator: # type: ignore + """Dummy class for np.random.Generator.""" + + def __init__(self, *args, **kwargs): + pass + +"""matplotlib""" +try: + import matplotlib # noqa: F401 + MATPLOTLIB_INSTALLED = True +except ImportError: + MATPLOTLIB_INSTALLED = False + +"""graphviz""" +try: + import graphviz # noqa: F401 + GRAPHVIZ_INSTALLED = True +except ImportError: + GRAPHVIZ_INSTALLED = False + +"""datatable""" +try: + import datatable + if hasattr(datatable, "Frame"): + dt_DataTable = datatable.Frame + else: + dt_DataTable = datatable.DataTable + DATATABLE_INSTALLED = True +except ImportError: + DATATABLE_INSTALLED = False + + class dt_DataTable: # type: ignore + """Dummy class for datatable.DataTable.""" + + def __init__(self, *args, **kwargs): + pass + + +"""sklearn""" +try: + from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin + from sklearn.preprocessing import LabelEncoder + from sklearn.utils.class_weight import compute_sample_weight + from sklearn.utils.multiclass import check_classification_targets + from sklearn.utils.validation import assert_all_finite, check_array, check_X_y + try: + from sklearn.exceptions import NotFittedError + from sklearn.model_selection import BaseCrossValidator, 
GroupKFold, StratifiedKFold + except ImportError: + from sklearn.cross_validation import BaseCrossValidator, GroupKFold, StratifiedKFold + from sklearn.utils.validation import NotFittedError + try: + from sklearn.utils.validation import _check_sample_weight + except ImportError: + from sklearn.utils.validation import check_consistent_length + + # dummy function to support older version of scikit-learn + def _check_sample_weight(sample_weight, X, dtype=None): + check_consistent_length(sample_weight, X) + return sample_weight + + SKLEARN_INSTALLED = True + _LGBMBaseCrossValidator = BaseCrossValidator + _LGBMModelBase = BaseEstimator + _LGBMRegressorBase = RegressorMixin + _LGBMClassifierBase = ClassifierMixin + _LGBMLabelEncoder = LabelEncoder + LGBMNotFittedError = NotFittedError + _LGBMStratifiedKFold = StratifiedKFold + _LGBMGroupKFold = GroupKFold + _LGBMCheckXY = check_X_y + _LGBMCheckArray = check_array + _LGBMCheckSampleWeight = _check_sample_weight + _LGBMAssertAllFinite = assert_all_finite + _LGBMCheckClassificationTargets = check_classification_targets + _LGBMComputeSampleWeight = compute_sample_weight +except ImportError: + SKLEARN_INSTALLED = False + + class _LGBMModelBase: # type: ignore + """Dummy class for sklearn.base.BaseEstimator.""" + + pass + + class _LGBMClassifierBase: # type: ignore + """Dummy class for sklearn.base.ClassifierMixin.""" + + pass + + class _LGBMRegressorBase: # type: ignore + """Dummy class for sklearn.base.RegressorMixin.""" + + pass + + _LGBMBaseCrossValidator = None + _LGBMLabelEncoder = None + LGBMNotFittedError = ValueError + _LGBMStratifiedKFold = None + _LGBMGroupKFold = None + _LGBMCheckXY = None + _LGBMCheckArray = None + _LGBMCheckSampleWeight = None + _LGBMAssertAllFinite = None + _LGBMCheckClassificationTargets = None + _LGBMComputeSampleWeight = None + +"""dask""" +try: + from dask import delayed + from dask.array import Array as dask_Array + from dask.array import from_delayed as dask_array_from_delayed + from dask.bag import from_delayed as dask_bag_from_delayed + from dask.dataframe import DataFrame as dask_DataFrame + from dask.dataframe import Series as dask_Series + from dask.distributed import Client, Future, default_client, wait + DASK_INSTALLED = True +except ImportError: + DASK_INSTALLED = False + + dask_array_from_delayed = None # type: ignore[assignment] + dask_bag_from_delayed = None # type: ignore[assignment] + delayed = None + default_client = None # type: ignore[assignment] + wait = None # type: ignore[assignment] + + class Client: # type: ignore + """Dummy class for dask.distributed.Client.""" + + def __init__(self, *args, **kwargs): + pass + + class Future: # type: ignore + """Dummy class for dask.distributed.Future.""" + + def __init__(self, *args, **kwargs): + pass + + class dask_Array: # type: ignore + """Dummy class for dask.array.Array.""" + + def __init__(self, *args, **kwargs): + pass + + class dask_DataFrame: # type: ignore + """Dummy class for dask.dataframe.DataFrame.""" + + def __init__(self, *args, **kwargs): + pass + + class dask_Series: # type: ignore + """Dummy class for dask.dataframe.Series.""" + + def __init__(self, *args, **kwargs): + pass + +"""pyarrow""" +try: + import pyarrow.compute as pa_compute + from pyarrow import Array as pa_Array + from pyarrow import ChunkedArray as pa_ChunkedArray + from pyarrow import Table as pa_Table + from pyarrow import chunked_array as pa_chunked_array + from pyarrow.cffi import ffi as arrow_cffi + from pyarrow.types import is_floating as arrow_is_floating + from 
pyarrow.types import is_integer as arrow_is_integer + PYARROW_INSTALLED = True +except ImportError: + PYARROW_INSTALLED = False + + class pa_Array: # type: ignore + """Dummy class for pa.Array.""" + + def __init__(self, *args, **kwargs): + pass + + class pa_ChunkedArray: # type: ignore + """Dummy class for pa.ChunkedArray.""" + + def __init__(self, *args, **kwargs): + pass + + class pa_Table: # type: ignore + """Dummy class for pa.Table.""" + + def __init__(self, *args, **kwargs): + pass + + class arrow_cffi: # type: ignore + """Dummy class for pyarrow.cffi.ffi.""" + + CData = None + addressof = None + cast = None + new = None + + def __init__(self, *args, **kwargs): + pass + + class pa_compute: # type: ignore + """Dummy class for pyarrow.compute.""" + + all = None + equal = None + + pa_chunked_array = None + arrow_is_integer = None + arrow_is_floating = None + +"""cpu_count()""" +try: + from joblib import cpu_count + + def _LGBMCpuCount(only_physical_cores: bool = True) -> int: + return cpu_count(only_physical_cores=only_physical_cores) +except ImportError: + try: + from psutil import cpu_count + + def _LGBMCpuCount(only_physical_cores: bool = True) -> int: + return cpu_count(logical=not only_physical_cores) or 1 + except ImportError: + from multiprocessing import cpu_count + + def _LGBMCpuCount(only_physical_cores: bool = True) -> int: + return cpu_count() + +__all__: List[str] = [] diff --git a/ext/lightgbm/dask.py b/ext/lightgbm/dask.py new file mode 100644 index 0000000..88e4779 --- /dev/null +++ b/ext/lightgbm/dask.py @@ -0,0 +1,1671 @@ +# coding: utf-8 +"""Distributed training with LightGBM and dask.distributed. + +This module enables you to perform distributed training with LightGBM on +dask.Array and dask.DataFrame collections. + +It is based on dask-lightgbm, which was based on dask-xgboost. 
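+
+A minimal usage sketch (assumes a running ``dask.distributed`` cluster; the
+sizes and chunking below are arbitrary placeholders)::
+
+    from distributed import Client
+    import dask.array as da
+    import lightgbm as lgb
+
+    client = Client()  # connect to (or start) a cluster
+    dX = da.random.random((1000, 10), chunks=(100, 10))
+    dy = da.random.random((1000,), chunks=(100,))
+    model = lgb.DaskLGBMRegressor(client=client, n_estimators=10)
+    model.fit(dX, dy)
+    preds = model.predict(dX)  # a Dask Array, computed lazily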
+""" +import operator +import socket +from collections import defaultdict +from copy import deepcopy +from enum import Enum, auto +from functools import partial +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from urllib.parse import urlparse + +import numpy as np +import scipy.sparse as ss + +from .basic import LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning +from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, Future, LGBMNotFittedError, concat, + dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series, + default_client, delayed, pd_DataFrame, pd_Series, wait) +from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomObjectiveFunction, + _LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit, + _lgbmmodel_doc_predict) + +__all__ = [ + 'DaskLGBMClassifier', + 'DaskLGBMRanker', + 'DaskLGBMRegressor', +] + +_DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] +_DaskMatrixLike = Union[dask_Array, dask_DataFrame] +_DaskVectorLike = Union[dask_Array, dask_Series] +_DaskPart = Union[np.ndarray, pd_DataFrame, pd_Series, ss.spmatrix] +_PredictionDtype = Union[Type[np.float32], Type[np.float64], Type[np.int32], Type[np.int64]] + + +class _RemoteSocket: + def acquire(self) -> int: + self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.socket.bind(('', 0)) + return self.socket.getsockname()[1] + + def release(self) -> None: + self.socket.close() + + +def _acquire_port() -> Tuple[_RemoteSocket, int]: + s = _RemoteSocket() + port = s.acquire() + return s, port + + +class _DatasetNames(Enum): + """Placeholder names used by lightgbm.dask internals to say 'also evaluate the training data'. + + Avoid duplicating the training data when the validation set refers to elements of training data. + """ + + TRAINSET = auto() + SAMPLE_WEIGHT = auto() + INIT_SCORE = auto() + GROUP = auto() + + +def _get_dask_client(client: Optional[Client]) -> Client: + """Choose a Dask client to use. + + Parameters + ---------- + client : dask.distributed.Client or None + Dask client. + + Returns + ------- + client : dask.distributed.Client + A Dask client. + """ + if client is None: + return default_client() + else: + return client + + +def _assign_open_ports_to_workers( + client: Client, + workers: List[str], +) -> Tuple[Dict[str, Future], Dict[str, int]]: + """Assign an open port to each worker. + + Returns + ------- + worker_to_socket_future: dict + mapping from worker address to a future pointing to the remote socket. + worker_to_port: dict + mapping from worker address to an open port in the worker's host. 
+ """ + # Acquire port in worker + worker_to_future = {} + for worker in workers: + worker_to_future[worker] = client.submit( + _acquire_port, + workers=[worker], + allow_other_workers=False, + pure=False, + ) + + # schedule futures to retrieve each element of the tuple + worker_to_socket_future = {} + worker_to_port_future = {} + for worker, socket_future in worker_to_future.items(): + worker_to_socket_future[worker] = client.submit(operator.itemgetter(0), socket_future) + worker_to_port_future[worker] = client.submit(operator.itemgetter(1), socket_future) + + # retrieve ports + worker_to_port = client.gather(worker_to_port_future) + + return worker_to_socket_future, worker_to_port + + +def _concat(seq: List[_DaskPart]) -> _DaskPart: + if isinstance(seq[0], np.ndarray): + return np.concatenate(seq, axis=0) + elif isinstance(seq[0], (pd_DataFrame, pd_Series)): + return concat(seq, axis=0) + elif isinstance(seq[0], ss.spmatrix): + return ss.vstack(seq, format='csr') + else: + raise TypeError(f'Data must be one of: numpy arrays, pandas dataframes, sparse matrices (from scipy). Got {type(seq[0]).__name__}.') + + +def _remove_list_padding(*args: Any) -> List[List[Any]]: + return [[z for z in arg if z is not None] for arg in args] + + +def _pad_eval_names(lgbm_model: LGBMModel, required_names: List[str]) -> LGBMModel: + """Append missing (key, value) pairs to a LightGBM model's evals_result_ and best_score_ OrderedDict attrs based on a set of required eval_set names. + + Allows users to rely on expected eval_set names being present when fitting DaskLGBM estimators with ``eval_set``. + """ + for eval_name in required_names: + if eval_name not in lgbm_model.evals_result_: + lgbm_model.evals_result_[eval_name] = {} + if eval_name not in lgbm_model.best_score_: + lgbm_model.best_score_[eval_name] = {} + + return lgbm_model + + +def _train_part( + params: Dict[str, Any], + model_factory: Type[LGBMModel], + list_of_parts: List[Dict[str, _DaskPart]], + machines: str, + local_listen_port: int, + num_machines: int, + return_model: bool, + time_out: int, + remote_socket: _RemoteSocket, + **kwargs: Any +) -> Optional[LGBMModel]: + network_params = { + 'machines': machines, + 'local_listen_port': local_listen_port, + 'time_out': time_out, + 'num_machines': num_machines + } + params.update(network_params) + + is_ranker = issubclass(model_factory, LGBMRanker) + + # Concatenate many parts into one + data = _concat([x['data'] for x in list_of_parts]) + label = _concat([x['label'] for x in list_of_parts]) + + if 'weight' in list_of_parts[0]: + weight = _concat([x['weight'] for x in list_of_parts]) + else: + weight = None + + if 'group' in list_of_parts[0]: + group = _concat([x['group'] for x in list_of_parts]) + else: + group = None + + if 'init_score' in list_of_parts[0]: + init_score = _concat([x['init_score'] for x in list_of_parts]) + else: + init_score = None + + # construct local eval_set data. 
+ n_evals = max(len(x.get('eval_set', [])) for x in list_of_parts) + eval_names = kwargs.pop('eval_names', None) + eval_class_weight = kwargs.get('eval_class_weight') + local_eval_set = None + local_eval_names = None + local_eval_sample_weight = None + local_eval_init_score = None + local_eval_group = None + + if n_evals: + has_eval_sample_weight = any(x.get('eval_sample_weight') is not None for x in list_of_parts) + has_eval_init_score = any(x.get('eval_init_score') is not None for x in list_of_parts) + + local_eval_set = [] + evals_result_names = [] + if has_eval_sample_weight: + local_eval_sample_weight = [] + if has_eval_init_score: + local_eval_init_score = [] + if is_ranker: + local_eval_group = [] + + # store indices of eval_set components that were not contained within local parts. + missing_eval_component_idx = [] + + # consolidate parts of each individual eval component. + for i in range(n_evals): + x_e = [] + y_e = [] + w_e = [] + init_score_e = [] + g_e = [] + for part in list_of_parts: + if not part.get('eval_set'): + continue + + # require that eval_name exists in evaluated result data in case dropped due to padding. + # in distributed training the 'training' eval_set is not detected, will have name 'valid_<index>'. + if eval_names: + evals_result_name = eval_names[i] + else: + evals_result_name = f'valid_{i}' + + eval_set = part['eval_set'][i] + if eval_set is _DatasetNames.TRAINSET: + x_e.append(part['data']) + y_e.append(part['label']) + else: + x_e.extend(eval_set[0]) + y_e.extend(eval_set[1]) + + if evals_result_name not in evals_result_names: + evals_result_names.append(evals_result_name) + + eval_weight = part.get('eval_sample_weight') + if eval_weight: + if eval_weight[i] is _DatasetNames.SAMPLE_WEIGHT: + w_e.append(part['weight']) + else: + w_e.extend(eval_weight[i]) + + eval_init_score = part.get('eval_init_score') + if eval_init_score: + if eval_init_score[i] is _DatasetNames.INIT_SCORE: + init_score_e.append(part['init_score']) + else: + init_score_e.extend(eval_init_score[i]) + + eval_group = part.get('eval_group') + if eval_group: + if eval_group[i] is _DatasetNames.GROUP: + g_e.append(part['group']) + else: + g_e.extend(eval_group[i]) + + # filter padding from eval parts then _concat each eval_set component. + x_e, y_e, w_e, init_score_e, g_e = _remove_list_padding(x_e, y_e, w_e, init_score_e, g_e) + if x_e: + local_eval_set.append((_concat(x_e), _concat(y_e))) + else: + missing_eval_component_idx.append(i) + continue + + if w_e: + local_eval_sample_weight.append(_concat(w_e)) + if init_score_e: + local_eval_init_score.append(_concat(init_score_e)) + if g_e: + local_eval_group.append(_concat(g_e)) + + # reconstruct eval_set fit args/kwargs depending on which components of eval_set are on worker. 
+    eval_component_idx = [i for i in range(n_evals) if i not in missing_eval_component_idx]
+    if eval_names:
+        local_eval_names = [eval_names[i] for i in eval_component_idx]
+    if eval_class_weight:
+        kwargs['eval_class_weight'] = [eval_class_weight[i] for i in eval_component_idx]
+
+    model = model_factory(**params)
+    if remote_socket is not None:
+        remote_socket.release()
+    try:
+        if is_ranker:
+            model.fit(
+                data,
+                label,
+                sample_weight=weight,
+                init_score=init_score,
+                group=group,
+                eval_set=local_eval_set,
+                eval_sample_weight=local_eval_sample_weight,
+                eval_init_score=local_eval_init_score,
+                eval_group=local_eval_group,
+                eval_names=local_eval_names,
+                **kwargs
+            )
+        else:
+            model.fit(
+                data,
+                label,
+                sample_weight=weight,
+                init_score=init_score,
+                eval_set=local_eval_set,
+                eval_sample_weight=local_eval_sample_weight,
+                eval_init_score=local_eval_init_score,
+                eval_names=local_eval_names,
+                **kwargs
+            )
+
+    finally:
+        if getattr(model, "fitted_", False):
+            model.booster_.free_network()
+
+    if n_evals:
+        # ensure that expected keys for evals_result_ and best_score_ exist regardless of padding.
+        model = _pad_eval_names(model, required_names=evals_result_names)
+
+    return model if return_model else None
+
+
+def _split_to_parts(data: _DaskCollection, is_matrix: bool) -> List[_DaskPart]:
+    parts = data.to_delayed()
+    if isinstance(parts, np.ndarray):
+        if is_matrix:
+            assert parts.shape[1] == 1
+        else:
+            assert parts.ndim == 1 or parts.shape[1] == 1
+        parts = parts.flatten().tolist()
+    return parts
+
+
+def _machines_to_worker_map(machines: str, worker_addresses: Iterable[str]) -> Dict[str, int]:
+    """Create a worker_map from machines list.
+
+    Given ``machines`` and a list of Dask worker addresses, return a mapping where the keys are
+    ``worker_addresses`` and the values are ports from ``machines``.
+
+    Parameters
+    ----------
+    machines : str
+        A comma-delimited list of workers, of the form ``ip1:port,ip2:port``.
+    worker_addresses : list of str
+        An iterable of Dask worker addresses, of the form ``{protocol}{hostname}:{port}``, where ``port`` is the port Dask's scheduler uses to talk to that worker.
+
+    Returns
+    -------
+    result : Dict[str, int]
+        Dictionary where keys are worker addresses in the form expected by Dask and values are a port for LightGBM to use.
+    """
+    machine_addresses = machines.split(",")
+
+    if len(set(machine_addresses)) != len(machine_addresses):
+        raise ValueError(f"Found duplicates in 'machines' ({machines}).
Each entry in 'machines' must be a unique IP-port combination.") + + machine_to_port = defaultdict(set) + for address in machine_addresses: + host, port = address.split(":") + machine_to_port[host].add(int(port)) + + out = {} + for address in worker_addresses: + worker_host = urlparse(address).hostname + if not worker_host: + raise ValueError(f"Could not parse host name from worker address '{address}'") + out[address] = machine_to_port[worker_host].pop() + + return out + + +def _train( + client: Client, + data: _DaskMatrixLike, + label: _DaskCollection, + params: Dict[str, Any], + model_factory: Type[LGBMModel], + sample_weight: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskCollection] = None, + group: Optional[_DaskVectorLike] = None, + eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, + eval_names: Optional[List[str]] = None, + eval_sample_weight: Optional[List[_DaskVectorLike]] = None, + eval_class_weight: Optional[List[Union[dict, str]]] = None, + eval_init_score: Optional[List[_DaskCollection]] = None, + eval_group: Optional[List[_DaskVectorLike]] = None, + eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, + eval_at: Optional[Union[List[int], Tuple[int, ...]]] = None, + **kwargs: Any +) -> LGBMModel: + """Inner train routine. + + Parameters + ---------- + client : dask.distributed.Client + Dask client. + data : Dask Array or Dask DataFrame of shape = [n_samples, n_features] + Input feature matrix. + label : Dask Array, Dask DataFrame or Dask Series of shape = [n_samples] + The target values (class labels in classification, real numbers in regression). + params : dict + Parameters passed to constructor of the local underlying model. + model_factory : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class + Class of the local underlying model. + sample_weight : Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None) + Weights of training data. Weights should be non-negative. + init_score : Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task), or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task), or None, optional (default=None) + Init score of training data. + group : Dask Array or Dask Series or None, optional (default=None) + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + eval_set : list of (X, y) tuples of Dask data collections, or None, optional (default=None) + List of (X, y) tuple pairs to use as validation sets. + Note, that not all workers may receive chunks of every eval set within ``eval_set``. When the returned + lightgbm estimator is not trained using any chunks of a particular eval set, its corresponding component + of ``evals_result_`` and ``best_score_`` will be empty dictionaries. + eval_names : list of str, or None, optional (default=None) + Names of eval_set. + eval_sample_weight : list of Dask Array or Dask Series, or None, optional (default=None) + Weights for each validation set in eval_set. Weights should be non-negative. + eval_class_weight : list of dict or str, or None, optional (default=None) + Class weights, one dict or str for each validation set in eval_set. 
+ eval_init_score : list of Dask Array, Dask Series or Dask DataFrame (for multi-class task), or None, optional (default=None) + Initial model score for each validation set in eval_set. + eval_group : list of Dask Array or Dask Series, or None, optional (default=None) + Group/query for each validation set in eval_set. + eval_metric : str, callable, list or None, optional (default=None) + If str, it should be a built-in evaluation metric to use. + If callable, it should be a custom evaluation metric, see note below for more details. + If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both. + In either case, the ``metric`` from the Dask model parameters (or inferred from the objective) will be evaluated and used as well. + Default: 'l2' for DaskLGBMRegressor, 'binary(multi)_logloss' for DaskLGBMClassifier, 'ndcg' for DaskLGBMRanker. + eval_at : list or tuple of int, optional (default=None) + The evaluation positions of the specified ranking metric. + **kwargs + Other parameters passed to ``fit`` method of the local underlying model. + + Returns + ------- + model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class + Returns fitted underlying model. + + Note + ---- + + This method handles setting up the following network parameters based on information + about the Dask cluster referenced by ``client``. + + * ``local_listen_port``: port that each LightGBM worker opens a listening socket on, + to accept connections from other workers. This can differ from LightGBM worker + to LightGBM worker, but does not have to. + * ``machines``: a comma-delimited list of all workers in the cluster, in the + form ``ip:port,ip:port``. If running multiple Dask workers on the same host, use different + ports for each worker. For example, for ``LocalCluster(n_workers=3)``, you might + pass ``"127.0.0.1:12400,127.0.0.1:12401,127.0.0.1:12402"``. + * ``num_machines``: number of LightGBM workers. + * ``timeout``: time in minutes to wait before closing unused sockets. + + The default behavior of this function is to generate ``machines`` from the list of + Dask workers which hold some piece of the training data, and to search for an open + port on each worker to be used as ``local_listen_port``. + + If ``machines`` is provided explicitly in ``params``, this function uses the hosts + and ports in that list directly, and does not do any searching. This means that if + any of the Dask workers are missing from the list or any of those ports are not free + when training starts, training will fail. + + If ``local_listen_port`` is provided in ``params`` and ``machines`` is not, this function + constructs ``machines`` from the list of Dask workers which hold some piece of the + training data, assuming that each one will use the same ``local_listen_port``. 
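+
+    For example (hypothetical addresses), either of the following pins the network
+    layout for a two-worker cluster::
+
+        params = {'machines': '10.0.0.1:12400,10.0.0.2:12400'}  # hosts and ports fixed
+        params = {'local_listen_port': 12400}  # hosts discovered, one shared port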
+ """ + params = deepcopy(params) + + # capture whether local_listen_port or its aliases were provided + listen_port_in_params = any( + alias in params for alias in _ConfigAliases.get("local_listen_port") + ) + + # capture whether machines or its aliases were provided + machines_in_params = any( + alias in params for alias in _ConfigAliases.get("machines") + ) + + params = _choose_param_value( + main_param_name="tree_learner", + params=params, + default_value="data" + ) + allowed_tree_learners = { + 'data', + 'data_parallel', + 'feature', + 'feature_parallel', + 'voting', + 'voting_parallel' + } + if params["tree_learner"] not in allowed_tree_learners: + _log_warning(f'Parameter tree_learner set to {params["tree_learner"]}, which is not allowed. Using "data" as default') + params['tree_learner'] = 'data' + + # Some passed-in parameters can be removed: + # * 'num_machines': set automatically from Dask worker list + # * 'num_threads': overridden to match nthreads on each Dask process + for param_alias in _ConfigAliases.get('num_machines', 'num_threads'): + if param_alias in params: + _log_warning(f"Parameter {param_alias} will be ignored.") + params.pop(param_alias) + + # Split arrays/dataframes into parts. Arrange parts into dicts to enforce co-locality + data_parts = _split_to_parts(data=data, is_matrix=True) + label_parts = _split_to_parts(data=label, is_matrix=False) + parts = [{'data': x, 'label': y} for (x, y) in zip(data_parts, label_parts)] + n_parts = len(parts) + + if sample_weight is not None: + weight_parts = _split_to_parts(data=sample_weight, is_matrix=False) + for i in range(n_parts): + parts[i]['weight'] = weight_parts[i] + + if group is not None: + group_parts = _split_to_parts(data=group, is_matrix=False) + for i in range(n_parts): + parts[i]['group'] = group_parts[i] + + if init_score is not None: + init_score_parts = _split_to_parts(data=init_score, is_matrix=False) + for i in range(n_parts): + parts[i]['init_score'] = init_score_parts[i] + + # evals_set will to be re-constructed into smaller lists of (X, y) tuples, where + # X and y are each delayed sub-lists of original eval dask Collections. + if eval_set: + # find maximum number of parts in an individual eval set so that we can + # pad eval sets when they come in different sizes. + n_largest_eval_parts = max(x[0].npartitions for x in eval_set) + + eval_sets: Dict[ + int, + List[ + Union[ + _DatasetNames, + Tuple[ + List[Optional[_DaskMatrixLike]], + List[Optional[_DaskVectorLike]] + ] + ] + ] + ] = defaultdict(list) + if eval_sample_weight: + eval_sample_weights: Dict[ + int, + List[ + Union[ + _DatasetNames, + List[Optional[_DaskVectorLike]] + ] + ] + ] = defaultdict(list) + if eval_group: + eval_groups: Dict[ + int, + List[ + Union[ + _DatasetNames, + List[Optional[_DaskVectorLike]] + ] + ] + ] = defaultdict(list) + if eval_init_score: + eval_init_scores: Dict[ + int, + List[ + Union[ + _DatasetNames, + List[Optional[_DaskMatrixLike]] + ] + ] + ] = defaultdict(list) + + for i, (X_eval, y_eval) in enumerate(eval_set): + n_this_eval_parts = X_eval.npartitions + + # when individual eval set is equivalent to training data, skip recomputing parts. 
+ if X_eval is data and y_eval is label: + for parts_idx in range(n_parts): + eval_sets[parts_idx].append(_DatasetNames.TRAINSET) + else: + eval_x_parts = _split_to_parts(data=X_eval, is_matrix=True) + eval_y_parts = _split_to_parts(data=y_eval, is_matrix=False) + for j in range(n_largest_eval_parts): + parts_idx = j % n_parts + + # add None-padding for individual eval_set member if it is smaller than the largest member. + if j < n_this_eval_parts: + x_e = eval_x_parts[j] + y_e = eval_y_parts[j] + else: + x_e = None + y_e = None + + if j < n_parts: + # first time a chunk of this eval set is added to this part. + eval_sets[parts_idx].append(([x_e], [y_e])) + else: + # append additional chunks of this eval set to this part. + eval_sets[parts_idx][-1][0].append(x_e) # type: ignore[index, union-attr] + eval_sets[parts_idx][-1][1].append(y_e) # type: ignore[index, union-attr] + + if eval_sample_weight: + if eval_sample_weight[i] is sample_weight: + for parts_idx in range(n_parts): + eval_sample_weights[parts_idx].append(_DatasetNames.SAMPLE_WEIGHT) + else: + eval_w_parts = _split_to_parts(data=eval_sample_weight[i], is_matrix=False) + + # ensure that all evaluation parts map uniquely to one part. + for j in range(n_largest_eval_parts): + if j < n_this_eval_parts: + w_e = eval_w_parts[j] + else: + w_e = None + + parts_idx = j % n_parts + if j < n_parts: + eval_sample_weights[parts_idx].append([w_e]) + else: + eval_sample_weights[parts_idx][-1].append(w_e) # type: ignore[union-attr] + + if eval_init_score: + if eval_init_score[i] is init_score: + for parts_idx in range(n_parts): + eval_init_scores[parts_idx].append(_DatasetNames.INIT_SCORE) + else: + eval_init_score_parts = _split_to_parts(data=eval_init_score[i], is_matrix=False) + for j in range(n_largest_eval_parts): + if j < n_this_eval_parts: + init_score_e = eval_init_score_parts[j] + else: + init_score_e = None + + parts_idx = j % n_parts + if j < n_parts: + eval_init_scores[parts_idx].append([init_score_e]) + else: + eval_init_scores[parts_idx][-1].append(init_score_e) # type: ignore[union-attr] + + if eval_group: + if eval_group[i] is group: + for parts_idx in range(n_parts): + eval_groups[parts_idx].append(_DatasetNames.GROUP) + else: + eval_g_parts = _split_to_parts(data=eval_group[i], is_matrix=False) + for j in range(n_largest_eval_parts): + if j < n_this_eval_parts: + g_e = eval_g_parts[j] + else: + g_e = None + + parts_idx = j % n_parts + if j < n_parts: + eval_groups[parts_idx].append([g_e]) + else: + eval_groups[parts_idx][-1].append(g_e) # type: ignore[union-attr] + + # assign sub-eval_set components to worker parts. 
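+        # each training part receives exactly the eval-set chunks routed to its
+        # index above, so chunks that must be concatenated together stay co-located.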
+ for parts_idx, e_set in eval_sets.items(): + parts[parts_idx]['eval_set'] = e_set + if eval_sample_weight: + parts[parts_idx]['eval_sample_weight'] = eval_sample_weights[parts_idx] + if eval_init_score: + parts[parts_idx]['eval_init_score'] = eval_init_scores[parts_idx] + if eval_group: + parts[parts_idx]['eval_group'] = eval_groups[parts_idx] + + # Start computation in the background + parts = list(map(delayed, parts)) + parts = client.compute(parts) + wait(parts) + + for part in parts: + if part.status == 'error': # type: ignore + # trigger error locally + return part # type: ignore[return-value] + + # Find locations of all parts and map them to particular Dask workers + key_to_part_dict = {part.key: part for part in parts} # type: ignore + who_has = client.who_has(parts) + worker_map = defaultdict(list) + for key, workers in who_has.items(): + worker_map[next(iter(workers))].append(key_to_part_dict[key]) + + # Check that all workers were provided some of eval_set. Otherwise warn user that validation + # data artifacts may not be populated depending on worker returning final estimator. + if eval_set: + for worker in worker_map: + has_eval_set = False + for part in worker_map[worker]: + if 'eval_set' in part.result(): # type: ignore[attr-defined] + has_eval_set = True + break + + if not has_eval_set: + _log_warning( + f"Worker {worker} was not allocated eval_set data. Therefore evals_result_ and best_score_ data may be unreliable. " + "Try rebalancing data across workers." + ) + + # assign general validation set settings to fit kwargs. + if eval_names: + kwargs['eval_names'] = eval_names + if eval_class_weight: + kwargs['eval_class_weight'] = eval_class_weight + if eval_metric: + kwargs['eval_metric'] = eval_metric + if eval_at: + kwargs['eval_at'] = eval_at + + master_worker = next(iter(worker_map)) + worker_ncores = client.ncores() + + # resolve aliases for network parameters and pop the result off params. + # these values are added back in calls to `_train_part()` + params = _choose_param_value( + main_param_name="local_listen_port", + params=params, + default_value=12400 + ) + local_listen_port = params.pop("local_listen_port") + + params = _choose_param_value( + main_param_name="machines", + params=params, + default_value=None + ) + machines = params.pop("machines") + + # figure out network params + worker_to_socket_future: Dict[str, Future] = {} + worker_addresses = worker_map.keys() + if machines is not None: + _log_info("Using passed-in 'machines' parameter") + worker_address_to_port = _machines_to_worker_map( + machines=machines, + worker_addresses=worker_addresses + ) + else: + if listen_port_in_params: + _log_info("Using passed-in 'local_listen_port' for all workers") + unique_hosts = {urlparse(a).hostname for a in worker_addresses} + if len(unique_hosts) < len(worker_addresses): + msg = ( + "'local_listen_port' was provided in Dask training parameters, but at least one " + "machine in the cluster has multiple Dask worker processes running on it. Please omit " + "'local_listen_port' or pass 'machines'." 
+ ) + raise LightGBMError(msg) + + worker_address_to_port = { + address: local_listen_port + for address in worker_addresses + } + else: + _log_info("Finding random open ports for workers") + worker_to_socket_future, worker_address_to_port = _assign_open_ports_to_workers(client, list(worker_map.keys())) + + machines = ','.join([ + f'{urlparse(worker_address).hostname}:{port}' + for worker_address, port + in worker_address_to_port.items() + ]) + + num_machines = len(worker_address_to_port) + + # Tell each worker to train on the parts that it has locally + # + # This code treats ``_train_part()`` calls as not "pure" because: + # 1. there is randomness in the training process unless parameters ``seed`` + # and ``deterministic`` are set + # 2. even with those parameters set, the output of one ``_train_part()`` call + # relies on global state (it and all the other LightGBM training processes + # coordinate with each other) + futures_classifiers = [ + client.submit( + _train_part, + model_factory=model_factory, + params={**params, 'num_threads': worker_ncores[worker]}, + list_of_parts=list_of_parts, + machines=machines, + local_listen_port=worker_address_to_port[worker], + num_machines=num_machines, + time_out=params.get('time_out', 120), + remote_socket=worker_to_socket_future.get(worker, None), + return_model=(worker == master_worker), + workers=[worker], + allow_other_workers=False, + pure=False, + **kwargs + ) + for worker, list_of_parts in worker_map.items() + ] + + results = client.gather(futures_classifiers) + results = [v for v in results if v] + model = results[0] + + # if network parameters were changed during training, remove them from the + # returned model so that they're generated dynamically on every run based + # on the Dask cluster you're connected to and which workers have pieces of + # the training data + if not listen_port_in_params: + for param in _ConfigAliases.get('local_listen_port'): + model._other_params.pop(param, None) + + if not machines_in_params: + for param in _ConfigAliases.get('machines'): + model._other_params.pop(param, None) + + for param in _ConfigAliases.get('num_machines', 'timeout'): + model._other_params.pop(param, None) + + return model + + +def _predict_part( + part: _DaskPart, + model: LGBMModel, + raw_score: bool, + pred_proba: bool, + pred_leaf: bool, + pred_contrib: bool, + **kwargs: Any +) -> _DaskPart: + + result: _DaskPart + if part.shape[0] == 0: + result = np.array([]) + elif pred_proba: + result = model.predict_proba( + part, + raw_score=raw_score, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + **kwargs + ) + else: + result = model.predict( + part, + raw_score=raw_score, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + **kwargs + ) + + # dask.DataFrame.map_partitions() expects each call to return a pandas DataFrame or Series + if isinstance(part, pd_DataFrame): + if len(result.shape) == 2: + result = pd_DataFrame(result, index=part.index) + else: + result = pd_Series(result, index=part.index, name='predictions') + + return result + + +def _predict( + model: LGBMModel, + data: _DaskMatrixLike, + client: Client, + raw_score: bool = False, + pred_proba: bool = False, + pred_leaf: bool = False, + pred_contrib: bool = False, + dtype: _PredictionDtype = np.float32, + **kwargs: Any +) -> Union[dask_Array, List[dask_Array]]: + """Inner predict routine. + + Parameters + ---------- + model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class + Fitted underlying model. 
+    data : Dask Array or Dask DataFrame of shape = [n_samples, n_features]
+        Input feature matrix.
+    raw_score : bool, optional (default=False)
+        Whether to predict raw scores.
+    pred_proba : bool, optional (default=False)
+        Should method return results of ``predict_proba`` (``pred_proba=True``) or ``predict`` (``pred_proba=False``).
+    pred_leaf : bool, optional (default=False)
+        Whether to predict leaf index.
+    pred_contrib : bool, optional (default=False)
+        Whether to predict feature contributions.
+    dtype : np.dtype, optional (default=np.float32)
+        Dtype of the output.
+    **kwargs
+        Other parameters passed to ``predict`` or ``predict_proba`` method.
+
+    Returns
+    -------
+    predicted_result : Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]
+        The predicted values.
+    X_leaves : Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
+        If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
+    X_SHAP_values : Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]
+        If ``pred_contrib=True``, the feature contributions for each sample.
+    """
+    if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
+        raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask')
+    if isinstance(data, dask_DataFrame):
+        return data.map_partitions(
+            _predict_part,
+            model=model,
+            raw_score=raw_score,
+            pred_proba=pred_proba,
+            pred_leaf=pred_leaf,
+            pred_contrib=pred_contrib,
+            **kwargs
+        ).values
+    elif isinstance(data, dask_Array):
+        # for multi-class classification with sparse matrices, pred_contrib predictions
+        # are returned as a list of sparse matrices (one per class)
+        num_classes = model._n_classes
+
+        if (
+            num_classes > 2
+            and pred_contrib
+            and isinstance(data._meta, ss.spmatrix)
+        ):
+
+            predict_function = partial(
+                _predict_part,
+                model=model,
+                raw_score=False,
+                pred_proba=pred_proba,
+                pred_leaf=False,
+                pred_contrib=True,
+                **kwargs
+            )
+
+            delayed_chunks = data.to_delayed()
+            bag = dask_bag_from_delayed(delayed_chunks[:, 0])
+
+            @delayed
+            def _extract(items: List[Any], i: int) -> Any:
+                return items[i]
+
+            preds = bag.map_partitions(predict_function)
+
+            # pred_contrib output will have one column per feature,
+            # plus one more for the base value
+            num_cols = model.n_features_ + 1
+
+            nrows_per_chunk = data.chunks[0]
+            out: List[List[dask_Array]] = [[] for _ in range(num_classes)]
+
+            # need to tell Dask the expected type and shape of individual preds
+            pred_meta = data._meta
+
+            for j, partition in enumerate(preds.to_delayed()):
+                for i in range(num_classes):
+                    part = dask_array_from_delayed(
+                        value=_extract(partition, i),
+                        shape=(nrows_per_chunk[j], num_cols),
+                        meta=pred_meta
+                    )
+                    out[i].append(part)
+
+            # by default, dask.array.concatenate() concatenates sparse arrays into a COO matrix
+            # the code below is used instead to ensure that the sparse type is preserved during concatenation
+            if isinstance(pred_meta, ss.csr_matrix):
+                concat_fn = partial(ss.vstack, format='csr')
+            elif isinstance(pred_meta, ss.csc_matrix):
+                concat_fn = partial(ss.vstack, format='csc')
+            else:
+                concat_fn = ss.vstack
+
+            # At this point, `out` is a list of lists of delayeds (each of which points to a matrix).
+            # Concatenate them to return a list of Dask Arrays.
+ out_arrays: List[dask_Array] = [] + for i in range(num_classes): + out_arrays.append( + dask_array_from_delayed( + value=delayed(concat_fn)(out[i]), + shape=(data.shape[0], num_cols), + meta=pred_meta + ) + ) + + return out_arrays + + data_row = client.compute(data[[0]]).result() + predict_fn = partial( + _predict_part, + model=model, + raw_score=raw_score, + pred_proba=pred_proba, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + **kwargs, + ) + pred_row = predict_fn(data_row) + chunks: Tuple[int, ...] = (data.chunks[0],) + map_blocks_kwargs = {} + if len(pred_row.shape) > 1: + chunks += (pred_row.shape[1],) + else: + map_blocks_kwargs['drop_axis'] = 1 + return data.map_blocks( + predict_fn, + chunks=chunks, + meta=pred_row, + dtype=dtype, + **map_blocks_kwargs, + ) + else: + raise TypeError(f'Data must be either Dask Array or Dask DataFrame. Got {type(data).__name__}.') + + +class _DaskLGBMModel: + + @property + def client_(self) -> Client: + """:obj:`dask.distributed.Client`: Dask client. + + This property can be passed in the constructor or updated + with ``model.set_params(client=client)``. + """ + if not getattr(self, "fitted_", False): + raise LGBMNotFittedError('Cannot access property client_ before calling fit().') + + return _get_dask_client(client=self.client) + + def _lgb_dask_getstate(self) -> Dict[Any, Any]: + """Remove un-picklable attributes before serialization.""" + client = self.__dict__.pop("client", None) + self._other_params.pop("client", None) # type: ignore[attr-defined] + out = deepcopy(self.__dict__) + out.update({"client": None}) + self.client = client + return out + + def _lgb_dask_fit( + self, + model_factory: Type[LGBMModel], + X: _DaskMatrixLike, + y: _DaskCollection, + sample_weight: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskCollection] = None, + group: Optional[_DaskVectorLike] = None, + eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, + eval_names: Optional[List[str]] = None, + eval_sample_weight: Optional[List[_DaskVectorLike]] = None, + eval_class_weight: Optional[List[Union[dict, str]]] = None, + eval_init_score: Optional[List[_DaskCollection]] = None, + eval_group: Optional[List[_DaskVectorLike]] = None, + eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, + eval_at: Optional[Union[List[int], Tuple[int, ...]]] = None, + **kwargs: Any + ) -> "_DaskLGBMModel": + if not DASK_INSTALLED: + raise LightGBMError('dask is required for lightgbm.dask') + if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): + raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask') + + params = self.get_params(True) # type: ignore[attr-defined] + params.pop("client", None) + + model = _train( + client=_get_dask_client(self.client), + data=X, + label=y, + params=params, + model_factory=model_factory, + sample_weight=sample_weight, + init_score=init_score, + group=group, + eval_set=eval_set, + eval_names=eval_names, + eval_sample_weight=eval_sample_weight, + eval_class_weight=eval_class_weight, + eval_init_score=eval_init_score, + eval_group=eval_group, + eval_metric=eval_metric, + eval_at=eval_at, + **kwargs + ) + + self.set_params(**model.get_params()) # type: ignore[attr-defined] + self._lgb_dask_copy_extra_params(model, self) # type: ignore[attr-defined] + + return self + + def _lgb_dask_to_local(self, model_factory: Type[LGBMModel]) -> LGBMModel: + params = self.get_params() # type: ignore[attr-defined] + params.pop("client", None) + model = model_factory(**params) + 
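+        # copy over attributes that are not constructor params (e.g. the fitted
+        # booster) so the returned local model is usable for prediction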
self._lgb_dask_copy_extra_params(self, model) + model._other_params.pop("client", None) + return model + + @staticmethod + def _lgb_dask_copy_extra_params(source: Union["_DaskLGBMModel", LGBMModel], dest: Union["_DaskLGBMModel", LGBMModel]) -> None: + params = source.get_params() # type: ignore[union-attr] + attributes = source.__dict__ + extra_param_names = set(attributes.keys()).difference(params.keys()) + for name in extra_param_names: + setattr(dest, name, attributes[name]) + + +class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): + """Distributed version of lightgbm.LGBMClassifier.""" + + def __init__( + self, + boosting_type: str = 'gbdt', + num_leaves: int = 31, + max_depth: int = -1, + learning_rate: float = 0.1, + n_estimators: int = 100, + subsample_for_bin: int = 200000, + objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, + class_weight: Optional[Union[dict, str]] = None, + min_split_gain: float = 0., + min_child_weight: float = 1e-3, + min_child_samples: int = 20, + subsample: float = 1., + subsample_freq: int = 0, + colsample_bytree: float = 1., + reg_alpha: float = 0., + reg_lambda: float = 0., + random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, + n_jobs: Optional[int] = None, + importance_type: str = 'split', + client: Optional[Client] = None, + **kwargs: Any + ): + """Docstring is inherited from the lightgbm.LGBMClassifier.__init__.""" + self.client = client + super().__init__( + boosting_type=boosting_type, + num_leaves=num_leaves, + max_depth=max_depth, + learning_rate=learning_rate, + n_estimators=n_estimators, + subsample_for_bin=subsample_for_bin, + objective=objective, + class_weight=class_weight, + min_split_gain=min_split_gain, + min_child_weight=min_child_weight, + min_child_samples=min_child_samples, + subsample=subsample, + subsample_freq=subsample_freq, + colsample_bytree=colsample_bytree, + reg_alpha=reg_alpha, + reg_lambda=reg_lambda, + random_state=random_state, + n_jobs=n_jobs, + importance_type=importance_type, + **kwargs + ) + + _base_doc = LGBMClassifier.__init__.__doc__ + _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore + __init__.__doc__ = f""" + {_before_kwargs}client : dask.distributed.Client or None, optional (default=None) + {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled. 
+ {_kwargs}{_after_kwargs} + """ + + def __getstate__(self) -> Dict[Any, Any]: + return self._lgb_dask_getstate() + + def fit( # type: ignore[override] + self, + X: _DaskMatrixLike, + y: _DaskCollection, + sample_weight: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskCollection] = None, + eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, + eval_names: Optional[List[str]] = None, + eval_sample_weight: Optional[List[_DaskVectorLike]] = None, + eval_class_weight: Optional[List[Union[dict, str]]] = None, + eval_init_score: Optional[List[_DaskCollection]] = None, + eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, + **kwargs: Any + ) -> "DaskLGBMClassifier": + """Docstring is inherited from the lightgbm.LGBMClassifier.fit.""" + self._lgb_dask_fit( + model_factory=LGBMClassifier, + X=X, + y=y, + sample_weight=sample_weight, + init_score=init_score, + eval_set=eval_set, + eval_names=eval_names, + eval_sample_weight=eval_sample_weight, + eval_class_weight=eval_class_weight, + eval_init_score=eval_init_score, + eval_metric=eval_metric, + **kwargs + ) + return self + + _base_doc = _lgbmmodel_doc_fit.format( + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", + sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", + init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task), or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task), or None, optional (default=None)", + group_shape="Dask Array or Dask Series or None, optional (default=None)", + eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)", + eval_init_score_shape="list of Dask Array, Dask Series or Dask DataFrame (for multi-class task), or None, optional (default=None)", + eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)" + ) + + # DaskLGBMClassifier does not support group, eval_group. + _base_doc = (_base_doc[:_base_doc.find('group :')] + + _base_doc[_base_doc.find('eval_set :'):]) + + _base_doc = (_base_doc[:_base_doc.find('eval_group :')] + + _base_doc[_base_doc.find('eval_metric :'):]) + + # DaskLGBMClassifier support for callbacks and init_model is not tested + fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs + Other parameters passed through to ``LGBMClassifier.fit()``. + + Returns + ------- + self : lightgbm.DaskLGBMClassifier + Returns self. 
+ + {_lgbmmodel_doc_custom_eval_note} + """ + + def predict( + self, + X: _DaskMatrixLike, # type: ignore[override] + raw_score: bool = False, + start_iteration: int = 0, + num_iteration: Optional[int] = None, + pred_leaf: bool = False, + pred_contrib: bool = False, + validate_features: bool = False, + **kwargs: Any + ) -> dask_Array: + """Docstring is inherited from the lightgbm.LGBMClassifier.predict.""" + return _predict( + model=self.to_local(), + data=X, + dtype=self.classes_.dtype, + client=_get_dask_client(self.client), + raw_score=raw_score, + start_iteration=start_iteration, + num_iteration=num_iteration, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + validate_features=validate_features, + **kwargs + ) + + predict.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted value for each sample.", + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + output_name="predicted_result", + predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]" + ) + + def predict_proba( + self, + X: _DaskMatrixLike, # type: ignore[override] + raw_score: bool = False, + start_iteration: int = 0, + num_iteration: Optional[int] = None, + pred_leaf: bool = False, + pred_contrib: bool = False, + validate_features: bool = False, + **kwargs: Any + ) -> dask_Array: + """Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba.""" + return _predict( + model=self.to_local(), + data=X, + pred_proba=True, + client=_get_dask_client(self.client), + raw_score=raw_score, + start_iteration=start_iteration, + num_iteration=num_iteration, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + validate_features=validate_features, + **kwargs + ) + + predict_proba.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted probability for each class for each sample.", + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + output_name="predicted_probability", + predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", + X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]" + ) + + def to_local(self) -> LGBMClassifier: + """Create regular version of lightgbm.LGBMClassifier from the distributed version. + + Returns + ------- + model : lightgbm.LGBMClassifier + Local underlying model. 
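Continuing that sketch: ``to_local()`` strips the Dask machinery so the fitted model can be pickled or served on a single machine (illustrative; assumes ``clf`` is the fitted ``DaskLGBMClassifier`` from the sketch above):

    import pickle

    local_clf = clf.to_local()      # plain lightgbm.LGBMClassifier
    blob = pickle.dumps(local_clf)  # holds no Dask client, safe to ship
    restored = pickle.loads(blob)
    print(type(restored).__name__)  # LGBMClassifier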
+ """ + return self._lgb_dask_to_local(LGBMClassifier) + + +class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel): + """Distributed version of lightgbm.LGBMRegressor.""" + + def __init__( + self, + boosting_type: str = 'gbdt', + num_leaves: int = 31, + max_depth: int = -1, + learning_rate: float = 0.1, + n_estimators: int = 100, + subsample_for_bin: int = 200000, + objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, + class_weight: Optional[Union[dict, str]] = None, + min_split_gain: float = 0., + min_child_weight: float = 1e-3, + min_child_samples: int = 20, + subsample: float = 1., + subsample_freq: int = 0, + colsample_bytree: float = 1., + reg_alpha: float = 0., + reg_lambda: float = 0., + random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, + n_jobs: Optional[int] = None, + importance_type: str = 'split', + client: Optional[Client] = None, + **kwargs: Any + ): + """Docstring is inherited from the lightgbm.LGBMRegressor.__init__.""" + self.client = client + super().__init__( + boosting_type=boosting_type, + num_leaves=num_leaves, + max_depth=max_depth, + learning_rate=learning_rate, + n_estimators=n_estimators, + subsample_for_bin=subsample_for_bin, + objective=objective, + class_weight=class_weight, + min_split_gain=min_split_gain, + min_child_weight=min_child_weight, + min_child_samples=min_child_samples, + subsample=subsample, + subsample_freq=subsample_freq, + colsample_bytree=colsample_bytree, + reg_alpha=reg_alpha, + reg_lambda=reg_lambda, + random_state=random_state, + n_jobs=n_jobs, + importance_type=importance_type, + **kwargs + ) + + _base_doc = LGBMRegressor.__init__.__doc__ + _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore + __init__.__doc__ = f""" + {_before_kwargs}client : dask.distributed.Client or None, optional (default=None) + {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled. 
+ {_kwargs}{_after_kwargs} + """ + + def __getstate__(self) -> Dict[Any, Any]: + return self._lgb_dask_getstate() + + def fit( # type: ignore[override] + self, + X: _DaskMatrixLike, + y: _DaskCollection, + sample_weight: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskVectorLike] = None, + eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, + eval_names: Optional[List[str]] = None, + eval_sample_weight: Optional[List[_DaskVectorLike]] = None, + eval_init_score: Optional[List[_DaskVectorLike]] = None, + eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, + **kwargs: Any + ) -> "DaskLGBMRegressor": + """Docstring is inherited from the lightgbm.LGBMRegressor.fit.""" + self._lgb_dask_fit( + model_factory=LGBMRegressor, + X=X, + y=y, + sample_weight=sample_weight, + init_score=init_score, + eval_set=eval_set, + eval_names=eval_names, + eval_sample_weight=eval_sample_weight, + eval_init_score=eval_init_score, + eval_metric=eval_metric, + **kwargs + ) + return self + + _base_doc = _lgbmmodel_doc_fit.format( + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", + sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", + init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", + group_shape="Dask Array or Dask Series or None, optional (default=None)", + eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)", + eval_init_score_shape="list of Dask Array or Dask Series, or None, optional (default=None)", + eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)" + ) + + # DaskLGBMRegressor does not support group, eval_class_weight, eval_group. + _base_doc = (_base_doc[:_base_doc.find('group :')] + + _base_doc[_base_doc.find('eval_set :'):]) + + _base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')] + + _base_doc[_base_doc.find('eval_init_score :'):]) + + _base_doc = (_base_doc[:_base_doc.find('eval_group :')] + + _base_doc[_base_doc.find('eval_metric :'):]) + + # DaskLGBMRegressor support for callbacks and init_model is not tested + fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs + Other parameters passed through to ``LGBMRegressor.fit()``. + + Returns + ------- + self : lightgbm.DaskLGBMRegressor + Returns self. 
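A hedged sketch of the regressor's ``fit()`` with an evaluation set, as described above (synthetic data; assumes a Dask client is already running, as in the classifier sketch earlier):

    import dask.array as da
    from lightgbm import DaskLGBMRegressor

    X = da.random.random((1_000, 20), chunks=(100, 20))
    y = da.random.random((1_000,), chunks=(100,))
    X_val = da.random.random((200, 20), chunks=(100, 20))
    y_val = da.random.random((200,), chunks=(100,))

    reg = DaskLGBMRegressor(n_estimators=50)
    reg.fit(
        X, y,
        eval_set=[(X_val, y_val)],   # list of (X, y) Dask tuples, as documented
        eval_names=["holdout"],
        eval_metric="l1",
    )
    print(reg.evals_result_["holdout"]["l1"][-1])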
+ + {_lgbmmodel_doc_custom_eval_note} + """ + + def predict( + self, + X: _DaskMatrixLike, # type: ignore[override] + raw_score: bool = False, + start_iteration: int = 0, + num_iteration: Optional[int] = None, + pred_leaf: bool = False, + pred_contrib: bool = False, + validate_features: bool = False, + **kwargs: Any + ) -> dask_Array: + """Docstring is inherited from the lightgbm.LGBMRegressor.predict.""" + return _predict( + model=self.to_local(), + data=X, + client=_get_dask_client(self.client), + raw_score=raw_score, + start_iteration=start_iteration, + num_iteration=num_iteration, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + validate_features=validate_features, + **kwargs + ) + + predict.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted value for each sample.", + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + output_name="predicted_result", + predicted_result_shape="Dask Array of shape = [n_samples]", + X_leaves_shape="Dask Array of shape = [n_samples, n_trees]", + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]" + ) + + def to_local(self) -> LGBMRegressor: + """Create regular version of lightgbm.LGBMRegressor from the distributed version. + + Returns + ------- + model : lightgbm.LGBMRegressor + Local underlying model. + """ + return self._lgb_dask_to_local(LGBMRegressor) + + +class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel): + """Distributed version of lightgbm.LGBMRanker.""" + + def __init__( + self, + boosting_type: str = 'gbdt', + num_leaves: int = 31, + max_depth: int = -1, + learning_rate: float = 0.1, + n_estimators: int = 100, + subsample_for_bin: int = 200000, + objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, + class_weight: Optional[Union[dict, str]] = None, + min_split_gain: float = 0., + min_child_weight: float = 1e-3, + min_child_samples: int = 20, + subsample: float = 1., + subsample_freq: int = 0, + colsample_bytree: float = 1., + reg_alpha: float = 0., + reg_lambda: float = 0., + random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, + n_jobs: Optional[int] = None, + importance_type: str = 'split', + client: Optional[Client] = None, + **kwargs: Any + ): + """Docstring is inherited from the lightgbm.LGBMRanker.__init__.""" + self.client = client + super().__init__( + boosting_type=boosting_type, + num_leaves=num_leaves, + max_depth=max_depth, + learning_rate=learning_rate, + n_estimators=n_estimators, + subsample_for_bin=subsample_for_bin, + objective=objective, + class_weight=class_weight, + min_split_gain=min_split_gain, + min_child_weight=min_child_weight, + min_child_samples=min_child_samples, + subsample=subsample, + subsample_freq=subsample_freq, + colsample_bytree=colsample_bytree, + reg_alpha=reg_alpha, + reg_lambda=reg_lambda, + random_state=random_state, + n_jobs=n_jobs, + importance_type=importance_type, + **kwargs + ) + + _base_doc = LGBMRanker.__init__.__doc__ + _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore + __init__.__doc__ = f""" + {_before_kwargs}client : dask.distributed.Client or None, optional (default=None) + {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled. 
+ {_kwargs}{_after_kwargs} + """ + + def __getstate__(self) -> Dict[Any, Any]: + return self._lgb_dask_getstate() + + def fit( # type: ignore[override] + self, + X: _DaskMatrixLike, + y: _DaskCollection, + sample_weight: Optional[_DaskVectorLike] = None, + init_score: Optional[_DaskVectorLike] = None, + group: Optional[_DaskVectorLike] = None, + eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, + eval_names: Optional[List[str]] = None, + eval_sample_weight: Optional[List[_DaskVectorLike]] = None, + eval_init_score: Optional[List[_DaskVectorLike]] = None, + eval_group: Optional[List[_DaskVectorLike]] = None, + eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, + eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5), + **kwargs: Any + ) -> "DaskLGBMRanker": + """Docstring is inherited from the lightgbm.LGBMRanker.fit.""" + self._lgb_dask_fit( + model_factory=LGBMRanker, + X=X, + y=y, + sample_weight=sample_weight, + init_score=init_score, + group=group, + eval_set=eval_set, + eval_names=eval_names, + eval_sample_weight=eval_sample_weight, + eval_init_score=eval_init_score, + eval_group=eval_group, + eval_metric=eval_metric, + eval_at=eval_at, + **kwargs + ) + return self + + _base_doc = _lgbmmodel_doc_fit.format( + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", + sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", + init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", + group_shape="Dask Array or Dask Series or None, optional (default=None)", + eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)", + eval_init_score_shape="list of Dask Array or Dask Series, or None, optional (default=None)", + eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)" + ) + + # DaskLGBMRanker does not support eval_class_weight or early stopping + _base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')] + + _base_doc[_base_doc.find('eval_init_score :'):]) + + _base_doc = (_base_doc[:_base_doc.find('feature_name :')] + + "eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))\n" + + f"{' ':8}The evaluation positions of the specified metric.\n" + + f"{' ':4}{_base_doc[_base_doc.find('feature_name :'):]}") + + # DaskLGBMRanker support for callbacks and init_model is not tested + fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs + Other parameters passed through to ``LGBMRanker.fit()``. + + Returns + ------- + self : lightgbm.DaskLGBMRanker + Returns self. 
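A sketch of the ranker's ``group`` argument described above (toy data; assumes a running Dask client, and that query groups line up with the partition boundaries of ``X`` and ``y``, as the Dask ranker requires):

    import dask.array as da
    import numpy as np
    from lightgbm import DaskLGBMRanker

    # 4 partitions of 25 rows each; one query group of 25 documents per partition.
    X = da.random.random((100, 10), chunks=(25, 10))
    y = da.random.randint(0, 4, size=(100,), chunks=(25,))        # relevance labels
    group = da.from_array(np.array([25, 25, 25, 25]), chunks=1)   # per-group sizes

    rnk = DaskLGBMRanker(n_estimators=20)
    rnk.fit(X, y, group=group)
    scores = rnk.predict(X).compute()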
+ + {_lgbmmodel_doc_custom_eval_note} + """ + + def predict( + self, + X: _DaskMatrixLike, # type: ignore[override] + raw_score: bool = False, + start_iteration: int = 0, + num_iteration: Optional[int] = None, + pred_leaf: bool = False, + pred_contrib: bool = False, + validate_features: bool = False, + **kwargs: Any + ) -> dask_Array: + """Docstring is inherited from the lightgbm.LGBMRanker.predict.""" + return _predict( + model=self.to_local(), + data=X, + client=_get_dask_client(self.client), + raw_score=raw_score, + start_iteration=start_iteration, + num_iteration=num_iteration, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + validate_features=validate_features, + **kwargs + ) + + predict.__doc__ = _lgbmmodel_doc_predict.format( + description="Return the predicted value for each sample.", + X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", + output_name="predicted_result", + predicted_result_shape="Dask Array of shape = [n_samples]", + X_leaves_shape="Dask Array of shape = [n_samples, n_trees]", + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]" + ) + + def to_local(self) -> LGBMRanker: + """Create regular version of lightgbm.LGBMRanker from the distributed version. + + Returns + ------- + model : lightgbm.LGBMRanker + Local underlying model. + """ + return self._lgb_dask_to_local(LGBMRanker) diff --git a/ext/lightgbm/engine.py b/ext/lightgbm/engine.py new file mode 100644 index 0000000..822aa3b --- /dev/null +++ b/ext/lightgbm/engine.py @@ -0,0 +1,785 @@ +# coding: utf-8 +"""Library with training routines of LightGBM.""" +import copy +import json +from collections import OrderedDict, defaultdict +from operator import attrgetter +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from . import callback +from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, + _LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType, + _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, + _LGBM_FeatureNameConfiguration, _log_warning) +from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold + +__all__ = [ + 'cv', + 'CVBooster', + 'train', +] + + +_LGBM_CustomMetricFunction = Union[ + Callable[ + [np.ndarray, Dataset], + _LGBM_EvalFunctionResultType, + ], + Callable[ + [np.ndarray, Dataset], + List[_LGBM_EvalFunctionResultType] + ], +] + +_LGBM_PreprocFunction = Callable[ + [Dataset, Dataset, Dict[str, Any]], + Tuple[Dataset, Dataset, Dict[str, Any]] +] + + +def train( + params: Dict[str, Any], + train_set: Dataset, + num_boost_round: int = 100, + valid_sets: Optional[List[Dataset]] = None, + valid_names: Optional[List[str]] = None, + feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None, + init_model: Optional[Union[str, Path, Booster]] = None, + feature_name: _LGBM_FeatureNameConfiguration = 'auto', + categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + keep_training_booster: bool = False, + callbacks: Optional[List[Callable]] = None +) -> Booster: + """Perform the training with given parameters. + + Parameters + ---------- + params : dict + Parameters for training. Values passed through ``params`` take precedence over those + supplied via arguments. + train_set : Dataset + Data to be trained on. 
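As a quick orientation before the parameter list continues, a minimal sketch of building the ``train_set`` just described and calling ``train()`` (synthetic data, illustrative parameter choices):

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(0)
    X = rng.random((500, 10))
    y = rng.integers(0, 2, size=500)

    train_set = lgb.Dataset(X, label=y)
    booster = lgb.train(
        params={"objective": "binary", "verbosity": -1},  # params win over arguments
        train_set=train_set,
        num_boost_round=50,
    )
    print(booster.num_trees())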
+ num_boost_round : int, optional (default=100) + Number of boosting iterations. + valid_sets : list of Dataset, or None, optional (default=None) + List of data to be evaluated on during training. + valid_names : list of str, or None, optional (default=None) + Names of ``valid_sets``. + feval : callable, list of callable, or None, optional (default=None) + Customized evaluation function. + Each evaluation function should accept two parameters: preds, eval_data, + and return (eval_name, eval_result, is_higher_better) or list of such tuples. + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. + If custom objective function is used, predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task in this case. + eval_data : Dataset + A ``Dataset`` to evaluate. + eval_name : str + The name of evaluation function (without whitespaces). + eval_result : float + The eval result. + is_higher_better : bool + Is eval result higher better, e.g. AUC is ``is_higher_better``. + + To ignore the default metric corresponding to the used objective, + set the ``metric`` parameter to the string ``"None"`` in ``params``. + init_model : str, pathlib.Path, Booster or None, optional (default=None) + Filename of LightGBM model or Booster instance used for continue training. + feature_name : list of str, or 'auto', optional (default="auto") + Feature names. + If 'auto' and data is pandas DataFrame, data columns names are used. + categorical_feature : list of str or int, or 'auto', optional (default="auto") + Categorical features. + If list of int, interpreted as indices. + If list of str, interpreted as feature names (need to specify ``feature_name`` as well). + If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. + All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). + Large values could be memory consuming. Consider using consecutive integers starting from zero. + All negative values in categorical features will be treated as missing values. + The output cannot be monotonically constrained with respect to a categorical feature. + Floating point numbers in categorical features will be rounded towards 0. + keep_training_booster : bool, optional (default=False) + Whether the returned Booster will be used to keep training. + If False, the returned value will be converted into _InnerPredictor before returning. + This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster. + When your model is very large and cause the memory error, + you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``. + You can still use _InnerPredictor as ``init_model`` for future continue training. + callbacks : list of callable, or None, optional (default=None) + List of callback functions that are applied at each iteration. + See Callbacks in Python API for more information. + + Note + ---- + A custom objective function can be provided for the ``objective`` parameter. + It should accept two parameters: preds, train_data and return (grad, hess). + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. + Predicted values are returned before any transformation, + e.g. 
they are raw margin instead of probability of positive class for binary task. + train_data : Dataset + The training dataset. + grad : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the first order derivative (gradient) of the loss + with respect to the elements of preds for each sample point. + hess : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the second order derivative (Hessian) of the loss + with respect to the elements of preds for each sample point. + + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. + + Returns + ------- + booster : Booster + The trained Booster model. + """ + if not isinstance(train_set, Dataset): + raise TypeError(f"train() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.") + + if num_boost_round <= 0: + raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.") + + if isinstance(valid_sets, list): + for i, valid_item in enumerate(valid_sets): + if not isinstance(valid_item, Dataset): + raise TypeError( + "Every item in valid_sets must be a Dataset object. " + f"Item {i} has type '{type(valid_item).__name__}'." + ) + + # create predictor first + params = copy.deepcopy(params) + params = _choose_param_value( + main_param_name='objective', + params=params, + default_value=None + ) + fobj: Optional[_LGBM_CustomObjectiveFunction] = None + if callable(params["objective"]): + fobj = params["objective"] + params["objective"] = 'none' + for alias in _ConfigAliases.get("num_iterations"): + if alias in params: + num_boost_round = params.pop(alias) + _log_warning(f"Found `{alias}` in params. Will use it instead of argument") + params["num_iterations"] = num_boost_round + # setting early stopping via global params should be possible + params = _choose_param_value( + main_param_name="early_stopping_round", + params=params, + default_value=None + ) + if params["early_stopping_round"] is None: + params.pop("early_stopping_round") + first_metric_only = params.get('first_metric_only', False) + + predictor: Optional[_InnerPredictor] = None + if isinstance(init_model, (str, Path)): + predictor = _InnerPredictor.from_model_file( + model_file=init_model, + pred_parameter=params + ) + elif isinstance(init_model, Booster): + predictor = _InnerPredictor.from_booster( + booster=init_model, + pred_parameter=dict(init_model.params, **params) + ) + + if predictor is not None: + init_iteration = predictor.current_iteration() + else: + init_iteration = 0 + + train_set._update_params(params) \ + ._set_predictor(predictor) \ + .set_feature_name(feature_name) \ + .set_categorical_feature(categorical_feature) + + is_valid_contain_train = False + train_data_name = "training" + reduced_valid_sets = [] + name_valid_sets = [] + if valid_sets is not None: + if isinstance(valid_sets, Dataset): + valid_sets = [valid_sets] + if isinstance(valid_names, str): + valid_names = [valid_names] + for i, valid_data in enumerate(valid_sets): + # reduce cost for prediction training data + if valid_data is train_set: + is_valid_contain_train = True + if valid_names is not None: + train_data_name = valid_names[i] + continue + reduced_valid_sets.append(valid_data._update_params(params).set_reference(train_set)) + if valid_names is not None and len(valid_names) > i: + name_valid_sets.append(valid_names[i]) + else: + name_valid_sets.append(f'valid_{i}') + # process callbacks + if callbacks is None: + callbacks_set = set() 
+ else: + for i, cb in enumerate(callbacks): + cb.__dict__.setdefault('order', i - len(callbacks)) + callbacks_set = set(callbacks) + + if "early_stopping_round" in params: + callbacks_set.add( + callback.early_stopping( + stopping_rounds=params["early_stopping_round"], # type: ignore[arg-type] + first_metric_only=first_metric_only, + verbose=_choose_param_value( + main_param_name="verbosity", + params=params, + default_value=1 + ).pop("verbosity") > 0 + ) + ) + + callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, 'before_iteration', False)} + callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set + callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter('order')) + callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter('order')) + + # construct booster + try: + booster = Booster(params=params, train_set=train_set) + if is_valid_contain_train: + booster.set_train_data_name(train_data_name) + for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets): + booster.add_valid(valid_set, name_valid_set) + finally: + train_set._reverse_update_params() + for valid_set in reduced_valid_sets: + valid_set._reverse_update_params() + booster.best_iteration = 0 + + # start training + for i in range(init_iteration, init_iteration + num_boost_round): + for cb in callbacks_before_iter: + cb(callback.CallbackEnv(model=booster, + params=params, + iteration=i, + begin_iteration=init_iteration, + end_iteration=init_iteration + num_boost_round, + evaluation_result_list=None)) + + booster.update(fobj=fobj) + + evaluation_result_list: List[_LGBM_BoosterEvalMethodResultType] = [] + # check evaluation result. + if valid_sets is not None: + if is_valid_contain_train: + evaluation_result_list.extend(booster.eval_train(feval)) + evaluation_result_list.extend(booster.eval_valid(feval)) + try: + for cb in callbacks_after_iter: + cb(callback.CallbackEnv(model=booster, + params=params, + iteration=i, + begin_iteration=init_iteration, + end_iteration=init_iteration + num_boost_round, + evaluation_result_list=evaluation_result_list)) + except callback.EarlyStopException as earlyStopException: + booster.best_iteration = earlyStopException.best_iteration + 1 + evaluation_result_list = earlyStopException.best_score + break + booster.best_score = defaultdict(OrderedDict) + for dataset_name, eval_name, score, _ in evaluation_result_list: + booster.best_score[dataset_name][eval_name] = score + if not keep_training_booster: + booster.model_from_string(booster.model_to_string()).free_dataset() + return booster + + +class CVBooster: + """CVBooster in LightGBM. + + Auxiliary data structure to hold and redirect all boosters of ``cv()`` function. + This class has the same methods as Booster class. + All method calls, except for the following methods, are actually performed for underlying Boosters and + then all returned results are returned in a list. + + - ``model_from_string()`` + - ``model_to_string()`` + - ``save_model()`` + + Attributes + ---------- + boosters : list of Booster + The list of underlying fitted models. + best_iteration : int + The best iteration of fitted model. + """ + + def __init__( + self, + model_file: Optional[Union[str, Path]] = None + ): + """Initialize the CVBooster. + + Parameters + ---------- + model_file : str, pathlib.Path or None, optional (default=None) + Path to the CVBooster model file. 
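The custom objective and metric protocol spelled out in the ``train()`` docstring above, as a runnable sketch (a binary logistic objective and an error-rate metric; names are illustrative):

    import numpy as np
    import lightgbm as lgb

    def logistic_obj(preds, train_data):
        # With a custom objective, preds are raw scores (no sigmoid applied).
        labels = train_data.get_label()
        p = 1.0 / (1.0 + np.exp(-preds))
        return p - labels, p * (1.0 - p)  # gradient and Hessian of logloss

    def error_rate(preds, eval_data):
        labels = eval_data.get_label()
        return "error", float(np.mean((preds > 0.0) != labels)), False

    rng = np.random.default_rng(0)
    X, y = rng.random((500, 10)), rng.integers(0, 2, size=500)
    dtrain = lgb.Dataset(X, label=y)

    booster = lgb.train(
        params={"objective": logistic_obj, "metric": "None", "verbosity": -1},
        train_set=dtrain,
        num_boost_round=30,
        valid_sets=[dtrain],   # evaluate on the training data itself
        feval=error_rate,
    )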
+ """ + self.boosters: List[Booster] = [] + self.best_iteration = -1 + + if model_file is not None: + with open(model_file, "r") as file: + self._from_dict(json.load(file)) + + def _from_dict(self, models: Dict[str, Any]) -> None: + """Load CVBooster from dict.""" + self.best_iteration = models["best_iteration"] + self.boosters = [] + for model_str in models["boosters"]: + self.boosters.append(Booster(model_str=model_str)) + + def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]: + """Serialize CVBooster to dict.""" + models_str = [] + for booster in self.boosters: + models_str.append(booster.model_to_string(num_iteration=num_iteration, start_iteration=start_iteration, + importance_type=importance_type)) + return {"boosters": models_str, "best_iteration": self.best_iteration} + + def __getattr__(self, name: str) -> Callable[[Any, Any], List[Any]]: + """Redirect methods call of CVBooster.""" + def handler_function(*args: Any, **kwargs: Any) -> List[Any]: + """Call methods with each booster, and concatenate their results.""" + ret = [] + for booster in self.boosters: + ret.append(getattr(booster, name)(*args, **kwargs)) + return ret + return handler_function + + def __getstate__(self) -> Dict[str, Any]: + return vars(self) + + def __setstate__(self, state: Dict[str, Any]) -> None: + vars(self).update(state) + + def model_from_string(self, model_str: str) -> "CVBooster": + """Load CVBooster from a string. + + Parameters + ---------- + model_str : str + Model will be loaded from this string. + + Returns + ------- + self : CVBooster + Loaded CVBooster object. + """ + self._from_dict(json.loads(model_str)) + return self + + def model_to_string( + self, + num_iteration: Optional[int] = None, + start_iteration: int = 0, + importance_type: str = 'split' + ) -> str: + """Save CVBooster to JSON string. + + Parameters + ---------- + num_iteration : int or None, optional (default=None) + Index of the iteration that should be saved. + If None, if the best iteration exists, it is saved; otherwise, all iterations are saved. + If <= 0, all iterations are saved. + start_iteration : int, optional (default=0) + Start index of the iteration that should be saved. + importance_type : str, optional (default="split") + What type of feature importance should be saved. + If "split", result contains numbers of times the feature is used in a model. + If "gain", result contains total gains of splits which use the feature. + + Returns + ------- + str_repr : str + JSON string representation of CVBooster. + """ + return json.dumps(self._to_dict(num_iteration, start_iteration, importance_type)) + + def save_model( + self, + filename: Union[str, Path], + num_iteration: Optional[int] = None, + start_iteration: int = 0, + importance_type: str = 'split' + ) -> "CVBooster": + """Save CVBooster to a file as JSON text. + + Parameters + ---------- + filename : str or pathlib.Path + Filename to save CVBooster. + num_iteration : int or None, optional (default=None) + Index of the iteration that should be saved. + If None, if the best iteration exists, it is saved; otherwise, all iterations are saved. + If <= 0, all iterations are saved. + start_iteration : int, optional (default=0) + Start index of the iteration that should be saved. + importance_type : str, optional (default="split") + What type of feature importance should be saved. + If "split", result contains numbers of times the feature is used in a model. 
+ If "gain", result contains total gains of splits which use the feature. + + Returns + ------- + self : CVBooster + Returns self. + """ + with open(filename, "w") as file: + json.dump(self._to_dict(num_iteration, start_iteration, importance_type), file) + + return self + + +def _make_n_folds( + full_data: Dataset, + folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]], + nfold: int, + params: Dict[str, Any], + seed: int, + fpreproc: Optional[_LGBM_PreprocFunction], + stratified: bool, + shuffle: bool, + eval_train_metric: bool +) -> CVBooster: + """Make a n-fold list of Booster from random indices.""" + full_data = full_data.construct() + num_data = full_data.num_data() + if folds is not None: + if not hasattr(folds, '__iter__') and not hasattr(folds, 'split'): + raise AttributeError("folds should be a generator or iterator of (train_idx, test_idx) tuples " + "or scikit-learn splitter object with split method") + if hasattr(folds, 'split'): + group_info = full_data.get_group() + if group_info is not None: + group_info = np.array(group_info, dtype=np.int32, copy=False) + flatted_group = np.repeat(range(len(group_info)), repeats=group_info) + else: + flatted_group = np.zeros(num_data, dtype=np.int32) + folds = folds.split(X=np.empty(num_data), y=full_data.get_label(), groups=flatted_group) + else: + if any(params.get(obj_alias, "") in {"lambdarank", "rank_xendcg", "xendcg", + "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"} + for obj_alias in _ConfigAliases.get("objective")): + if not SKLEARN_INSTALLED: + raise LightGBMError('scikit-learn is required for ranking cv') + # ranking task, split according to groups + group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False) + flatted_group = np.repeat(range(len(group_info)), repeats=group_info) + group_kfold = _LGBMGroupKFold(n_splits=nfold) + folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group) + elif stratified: + if not SKLEARN_INSTALLED: + raise LightGBMError('scikit-learn is required for stratified cv') + skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed) + folds = skf.split(X=np.empty(num_data), y=full_data.get_label()) + else: + if shuffle: + randidx = np.random.RandomState(seed).permutation(num_data) + else: + randidx = np.arange(num_data) + kstep = int(num_data / nfold) + test_id = [randidx[i: i + kstep] for i in range(0, num_data, kstep)] + train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)] + folds = zip(train_id, test_id) + + ret = CVBooster() + for train_idx, test_idx in folds: + train_set = full_data.subset(sorted(train_idx)) + valid_set = full_data.subset(sorted(test_idx)) + # run preprocessing on the data set if needed + if fpreproc is not None: + train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy()) + else: + tparam = params + booster_for_fold = Booster(tparam, train_set) + if eval_train_metric: + booster_for_fold.add_valid(train_set, 'train') + booster_for_fold.add_valid(valid_set, 'valid') + ret.boosters.append(booster_for_fold) + return ret + + +def _agg_cv_result( + raw_results: List[List[_LGBM_BoosterEvalMethodResultType]] +) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]: + """Aggregate cross-validation results.""" + cvmap: Dict[str, List[float]] = OrderedDict() + metric_type: Dict[str, bool] = {} + for one_result in raw_results: + for one_line in one_result: + key = f"{one_line[0]} {one_line[1]}" + metric_type[key] = one_line[3] + cvmap.setdefault(key, 
[]) + cvmap[key].append(one_line[2]) + return [('cv_agg', k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()] + + +def cv( + params: Dict[str, Any], + train_set: Dataset, + num_boost_round: int = 100, + folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]] = None, + nfold: int = 5, + stratified: bool = True, + shuffle: bool = True, + metrics: Optional[Union[str, List[str]]] = None, + feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None, + init_model: Optional[Union[str, Path, Booster]] = None, + feature_name: _LGBM_FeatureNameConfiguration = 'auto', + categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + fpreproc: Optional[_LGBM_PreprocFunction] = None, + seed: int = 0, + callbacks: Optional[List[Callable]] = None, + eval_train_metric: bool = False, + return_cvbooster: bool = False +) -> Dict[str, Union[List[float], CVBooster]]: + """Perform the cross-validation with given parameters. + + Parameters + ---------- + params : dict + Parameters for training. Values passed through ``params`` take precedence over those + supplied via arguments. + train_set : Dataset + Data to be trained on. + num_boost_round : int, optional (default=100) + Number of boosting iterations. + folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None) + If generator or iterator, it should yield the train and test indices for each fold. + If object, it should be one of the scikit-learn splitter classes + (https://scikit-learn.org/stable/modules/classes.html#splitter-classes) + and have ``split`` method. + This argument has highest priority over other data split arguments. + nfold : int, optional (default=5) + Number of folds in CV. + stratified : bool, optional (default=True) + Whether to perform stratified sampling. + shuffle : bool, optional (default=True) + Whether to shuffle before splitting data. + metrics : str, list of str, or None, optional (default=None) + Evaluation metrics to be monitored while CV. + If not None, the metric in ``params`` will be overridden. + feval : callable, list of callable, or None, optional (default=None) + Customized evaluation function. + Each evaluation function should accept two parameters: preds, eval_data, + and return (eval_name, eval_result, is_higher_better) or list of such tuples. + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes]. + If custom objective function is used, predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task in this case. + eval_data : Dataset + A ``Dataset`` to evaluate. + eval_name : str + The name of evaluation function (without whitespace). + eval_result : float + The eval result. + is_higher_better : bool + Is eval result higher better, e.g. AUC is ``is_higher_better``. + + To ignore the default metric corresponding to the used objective, + set ``metrics`` to the string ``"None"``. + init_model : str, pathlib.Path, Booster or None, optional (default=None) + Filename of LightGBM model or Booster instance used for continue training. + feature_name : list of str, or 'auto', optional (default="auto") + Feature names. + If 'auto' and data is pandas DataFrame, data columns names are used. 
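A sketch of the ``cv()`` interface being documented here, including ``return_cvbooster`` and the ``CVBooster`` method redirection described above (synthetic data):

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(0)
    X, y = rng.random((500, 10)), rng.integers(0, 2, size=500)
    dtrain = lgb.Dataset(X, label=y)

    res = lgb.cv(
        params={"objective": "binary", "verbosity": -1},
        train_set=dtrain,
        num_boost_round=50,
        nfold=5,
        return_cvbooster=True,
    )
    print(res["valid binary_logloss-mean"][-1])  # aggregated metric history

    cvbooster = res["cvbooster"]
    # Undefined methods are redirected to every underlying Booster and the
    # per-fold results come back as a list: here, 5 prediction arrays.
    fold_preds = cvbooster.predict(X)
    avg_pred = np.mean(fold_preds, axis=0)
    cvbooster.save_model("cv_model.json")  # JSON text; reload via CVBooster(model_file=...)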
+ categorical_feature : list of str or int, or 'auto', optional (default="auto") + Categorical features. + If list of int, interpreted as indices. + If list of str, interpreted as feature names (need to specify ``feature_name`` as well). + If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. + All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). + Large values could be memory consuming. Consider using consecutive integers starting from zero. + All negative values in categorical features will be treated as missing values. + The output cannot be monotonically constrained with respect to a categorical feature. + Floating point numbers in categorical features will be rounded towards 0. + fpreproc : callable or None, optional (default=None) + Preprocessing function that takes (dtrain, dtest, params) + and returns transformed versions of those. + seed : int, optional (default=0) + Seed used to generate the folds (passed to numpy.random.seed). + callbacks : list of callable, or None, optional (default=None) + List of callback functions that are applied at each iteration. + See Callbacks in Python API for more information. + eval_train_metric : bool, optional (default=False) + Whether to display the train metric in progress. + The score of the metric is calculated again after each training step, so there is some impact on performance. + return_cvbooster : bool, optional (default=False) + Whether to return Booster models trained on each fold through ``CVBooster``. + + Note + ---- + A custom objective function can be provided for the ``objective`` parameter. + It should accept two parameters: preds, train_data and return (grad, hess). + + preds : numpy 1-D array or numpy 2-D array (for multi-class task) + The predicted values. + Predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task. + train_data : Dataset + The training dataset. + grad : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the first order derivative (gradient) of the loss + with respect to the elements of preds for each sample point. + hess : numpy 1-D array or numpy 2-D array (for multi-class task) + The value of the second order derivative (Hessian) of the loss + with respect to the elements of preds for each sample point. + + For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. + + Returns + ------- + eval_results : dict + History of evaluation results of each metric. + The dictionary has the following format: + {'valid metric1-mean': [values], 'valid metric1-stdv': [values], + 'valid metric2-mean': [values], 'valid metric2-stdv': [values], + ...}. + If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key. + If ``eval_train_metric=True``, also returns the train metric history. + In this case, the dictionary has the following format: + {'train metric1-mean': [values], 'valid metric1-mean': [values], + 'train metric2-mean': [values], 'valid metric2-mean': [values], + ...}. + """ + if not isinstance(train_set, Dataset): + raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.") + + if num_boost_round <= 0: + raise ValueError(f"num_boost_round must be greater than 0. 
Got {num_boost_round}.") + + params = copy.deepcopy(params) + params = _choose_param_value( + main_param_name='objective', + params=params, + default_value=None + ) + fobj: Optional[_LGBM_CustomObjectiveFunction] = None + if callable(params["objective"]): + fobj = params["objective"] + params["objective"] = 'none' + for alias in _ConfigAliases.get("num_iterations"): + if alias in params: + _log_warning(f"Found '{alias}' in params. Will use it instead of 'num_boost_round' argument") + num_boost_round = params.pop(alias) + params["num_iterations"] = num_boost_round + # setting early stopping via global params should be possible + params = _choose_param_value( + main_param_name="early_stopping_round", + params=params, + default_value=None + ) + if params["early_stopping_round"] is None: + params.pop("early_stopping_round") + first_metric_only = params.get('first_metric_only', False) + + if isinstance(init_model, (str, Path)): + predictor = _InnerPredictor.from_model_file( + model_file=init_model, + pred_parameter=params + ) + elif isinstance(init_model, Booster): + predictor = _InnerPredictor.from_booster( + booster=init_model, + pred_parameter=dict(init_model.params, **params) + ) + else: + predictor = None + + if metrics is not None: + for metric_alias in _ConfigAliases.get("metric"): + params.pop(metric_alias, None) + params['metric'] = metrics + + train_set._update_params(params) \ + ._set_predictor(predictor) \ + .set_feature_name(feature_name) \ + .set_categorical_feature(categorical_feature) + + results = defaultdict(list) + cvfolds = _make_n_folds(full_data=train_set, folds=folds, nfold=nfold, + params=params, seed=seed, fpreproc=fpreproc, + stratified=stratified, shuffle=shuffle, + eval_train_metric=eval_train_metric) + + # setup callbacks + if callbacks is None: + callbacks_set = set() + else: + for i, cb in enumerate(callbacks): + cb.__dict__.setdefault('order', i - len(callbacks)) + callbacks_set = set(callbacks) + + if "early_stopping_round" in params: + callbacks_set.add( + callback.early_stopping( + stopping_rounds=params["early_stopping_round"], # type: ignore[arg-type] + first_metric_only=first_metric_only, + verbose=_choose_param_value( + main_param_name="verbosity", + params=params, + default_value=1 + ).pop("verbosity") > 0 + ) + ) + + callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, 'before_iteration', False)} + callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set + callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter('order')) + callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter('order')) + + for i in range(num_boost_round): + for cb in callbacks_before_iter: + cb(callback.CallbackEnv(model=cvfolds, + params=params, + iteration=i, + begin_iteration=0, + end_iteration=num_boost_round, + evaluation_result_list=None)) + cvfolds.update(fobj=fobj) # type: ignore[call-arg] + res = _agg_cv_result(cvfolds.eval_valid(feval)) # type: ignore[call-arg] + for _, key, mean, _, std in res: + results[f'{key}-mean'].append(mean) + results[f'{key}-stdv'].append(std) + try: + for cb in callbacks_after_iter: + cb(callback.CallbackEnv(model=cvfolds, + params=params, + iteration=i, + begin_iteration=0, + end_iteration=num_boost_round, + evaluation_result_list=res)) + except callback.EarlyStopException as earlyStopException: + cvfolds.best_iteration = earlyStopException.best_iteration + 1 + for bst in cvfolds.boosters: + bst.best_iteration = cvfolds.best_iteration + for k in results: + results[k] = 
results[k][:cvfolds.best_iteration] + break + + if return_cvbooster: + results['cvbooster'] = cvfolds # type: ignore[assignment] + + return dict(results) diff --git a/ext/lightgbm/lib/lib_lightgbm.so b/ext/lightgbm/lib/lib_lightgbm.so Binary files differnew file mode 100755 index 0000000..aeac091 --- /dev/null +++ b/ext/lightgbm/lib/lib_lightgbm.so diff --git a/ext/lightgbm/libpath.py b/ext/lightgbm/libpath.py new file mode 100644 index 0000000..2122222 --- /dev/null +++ b/ext/lightgbm/libpath.py @@ -0,0 +1,32 @@ +# coding: utf-8 +"""Find the path to LightGBM dynamic library files.""" +from pathlib import Path +from platform import system +from typing import List + +__all__: List[str] = [] + + +def find_lib_path() -> List[str]: + """Find the path to LightGBM library files. + + Returns + ------- + lib_path: list of str + List of all found library paths to LightGBM. + """ + curr_path = Path(__file__).absolute() + dll_path = [curr_path.parents[1], + curr_path.parents[0] / 'bin', + curr_path.parents[0] / 'lib'] + if system() in ('Windows', 'Microsoft'): + dll_path.append(curr_path.parents[1] / 'Release') + dll_path.append(curr_path.parents[1] / 'windows' / 'x64' / 'DLL') + dll_path = [p / 'lib_lightgbm.dll' for p in dll_path] + else: + dll_path = [p / 'lib_lightgbm.so' for p in dll_path] + lib_path = [str(p) for p in dll_path if p.is_file()] + if not lib_path: + dll_path_joined = '\n'.join(map(str, dll_path)) + raise Exception(f'Cannot find lightgbm library file in following paths:\n{dll_path_joined}') + return lib_path diff --git a/ext/lightgbm/plotting.py b/ext/lightgbm/plotting.py new file mode 100644 index 0000000..85b245c --- /dev/null +++ b/ext/lightgbm/plotting.py @@ -0,0 +1,828 @@ +# coding: utf-8 +"""Plotting library.""" +import math +from copy import deepcopy +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np + +from .basic import Booster, _data_from_pandas, _is_zero, _log_warning, _MissingType +from .compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, pd_DataFrame +from .sklearn import LGBMModel + +__all__ = [ + 'create_tree_digraph', + 'plot_importance', + 'plot_metric', + 'plot_split_value_histogram', + 'plot_tree', +] + + +def _check_not_tuple_of_2_elements(obj: Any, obj_name: str) -> None: + """Check object is not tuple or does not have 2 elements.""" + if not isinstance(obj, tuple) or len(obj) != 2: + raise TypeError(f"{obj_name} must be a tuple of 2 elements.") + + +def _float2str(value: float, precision: Optional[int]) -> str: + return (f"{value:.{precision}f}" + if precision is not None and not isinstance(value, str) + else str(value)) + + +def plot_importance( + booster: Union[Booster, LGBMModel], + ax=None, + height: float = 0.2, + xlim: Optional[Tuple[float, float]] = None, + ylim: Optional[Tuple[float, float]] = None, + title: Optional[str] = 'Feature importance', + xlabel: Optional[str] = 'Feature importance', + ylabel: Optional[str] = 'Features', + importance_type: str = 'auto', + max_num_features: Optional[int] = None, + ignore_zero: bool = True, + figsize: Optional[Tuple[float, float]] = None, + dpi: Optional[int] = None, + grid: bool = True, + precision: Optional[int] = 3, + **kwargs: Any +) -> Any: + """Plot model's feature importances. + + Parameters + ---------- + booster : Booster or LGBMModel + Booster or LGBMModel instance which feature importance should be plotted. + ax : matplotlib.axes.Axes or None, optional (default=None) + Target axes instance. + If None, new figure and axes will be created. 
+ height : float, optional (default=0.2) + Bar height, passed to ``ax.barh()``. + xlim : tuple of 2 elements or None, optional (default=None) + Tuple passed to ``ax.xlim()``. + ylim : tuple of 2 elements or None, optional (default=None) + Tuple passed to ``ax.ylim()``. + title : str or None, optional (default="Feature importance") + Axes title. + If None, title is disabled. + xlabel : str or None, optional (default="Feature importance") + X-axis title label. + If None, title is disabled. + @importance_type@ placeholder can be used, and it will be replaced with the value of ``importance_type`` parameter. + ylabel : str or None, optional (default="Features") + Y-axis title label. + If None, title is disabled. + importance_type : str, optional (default="auto") + How the importance is calculated. + If "auto", if ``booster`` parameter is LGBMModel, ``booster.importance_type`` attribute is used; "split" otherwise. + If "split", result contains numbers of times the feature is used in a model. + If "gain", result contains total gains of splits which use the feature. + max_num_features : int or None, optional (default=None) + Max number of top features displayed on plot. + If None or <1, all features will be displayed. + ignore_zero : bool, optional (default=True) + Whether to ignore features with zero importance. + figsize : tuple of 2 elements or None, optional (default=None) + Figure size. + dpi : int or None, optional (default=None) + Resolution of the figure. + grid : bool, optional (default=True) + Whether to add a grid for axes. + precision : int or None, optional (default=3) + Used to restrict the display of floating point values to a certain precision. + **kwargs + Other parameters passed to ``ax.barh()``. + + Returns + ------- + ax : matplotlib.axes.Axes + The plot with model's feature importances. 
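A self-contained sketch of ``plot_importance()`` as documented above (synthetic model; assumes matplotlib is installed):

    import matplotlib.pyplot as plt
    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(0)
    X, y = rng.random((500, 10)), rng.integers(0, 2, size=500)
    booster = lgb.train({"objective": "binary", "verbosity": -1},
                        lgb.Dataset(X, label=y), num_boost_round=30)

    ax = lgb.plot_importance(
        booster,
        importance_type="gain",
        max_num_features=10,
        xlabel="Importance (@importance_type@)",  # placeholder is substituted
        precision=2,
    )
    plt.tight_layout()
    plt.show()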
+ """ + if MATPLOTLIB_INSTALLED: + import matplotlib.pyplot as plt + else: + raise ImportError('You must install matplotlib and restart your session to plot importance.') + + if isinstance(booster, LGBMModel): + if importance_type == "auto": + importance_type = booster.importance_type + booster = booster.booster_ + elif isinstance(booster, Booster): + if importance_type == "auto": + importance_type = "split" + else: + raise TypeError('booster must be Booster or LGBMModel.') + + importance = booster.feature_importance(importance_type=importance_type) + feature_name = booster.feature_name() + + if not len(importance): + raise ValueError("Booster's feature_importance is empty.") + + tuples = sorted(zip(feature_name, importance), key=lambda x: x[1]) + if ignore_zero: + tuples = [x for x in tuples if x[1] > 0] + if max_num_features is not None and max_num_features > 0: + tuples = tuples[-max_num_features:] + labels, values = zip(*tuples) + + if ax is None: + if figsize is not None: + _check_not_tuple_of_2_elements(figsize, 'figsize') + _, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi) + + ylocs = np.arange(len(values)) + ax.barh(ylocs, values, align='center', height=height, **kwargs) + + for x, y in zip(values, ylocs): + ax.text(x + 1, y, + _float2str(x, precision) if importance_type == 'gain' else x, + va='center') + + ax.set_yticks(ylocs) + ax.set_yticklabels(labels) + + if xlim is not None: + _check_not_tuple_of_2_elements(xlim, 'xlim') + else: + xlim = (0, max(values) * 1.1) + ax.set_xlim(xlim) + + if ylim is not None: + _check_not_tuple_of_2_elements(ylim, 'ylim') + else: + ylim = (-1, len(values)) + ax.set_ylim(ylim) + + if title is not None: + ax.set_title(title) + if xlabel is not None: + xlabel = xlabel.replace('@importance_type@', importance_type) + ax.set_xlabel(xlabel) + if ylabel is not None: + ax.set_ylabel(ylabel) + ax.grid(grid) + return ax + + +def plot_split_value_histogram( + booster: Union[Booster, LGBMModel], + feature: Union[int, str], + bins: Union[int, str, None] = None, + ax=None, + width_coef: float = 0.8, + xlim: Optional[Tuple[float, float]] = None, + ylim: Optional[Tuple[float, float]] = None, + title: Optional[str] = 'Split value histogram for feature with @index/name@ @feature@', + xlabel: Optional[str] = 'Feature split value', + ylabel: Optional[str] = 'Count', + figsize: Optional[Tuple[float, float]] = None, + dpi: Optional[int] = None, + grid: bool = True, + **kwargs: Any +) -> Any: + """Plot split value histogram for the specified feature of the model. + + Parameters + ---------- + booster : Booster or LGBMModel + Booster or LGBMModel instance of which feature split value histogram should be plotted. + feature : int or str + The feature name or index the histogram is plotted for. + If int, interpreted as index. + If str, interpreted as name. + bins : int, str or None, optional (default=None) + The maximum number of bins. + If None, the number of bins equals number of unique split values. + If str, it should be one from the list of the supported values by ``numpy.histogram()`` function. + ax : matplotlib.axes.Axes or None, optional (default=None) + Target axes instance. + If None, new figure and axes will be created. + width_coef : float, optional (default=0.8) + Coefficient for histogram bar width. + xlim : tuple of 2 elements or None, optional (default=None) + Tuple passed to ``ax.xlim()``. + ylim : tuple of 2 elements or None, optional (default=None) + Tuple passed to ``ax.ylim()``. 
+ title : str or None, optional (default="Split value histogram for feature with @index/name@ @feature@") + Axes title. + If None, title is disabled. + @feature@ placeholder can be used, and it will be replaced with the value of ``feature`` parameter. + @index/name@ placeholder can be used, + and it will be replaced with ``index`` word in case of ``int`` type ``feature`` parameter + or ``name`` word in case of ``str`` type ``feature`` parameter. + xlabel : str or None, optional (default="Feature split value") + X-axis title label. + If None, title is disabled. + ylabel : str or None, optional (default="Count") + Y-axis title label. + If None, title is disabled. + figsize : tuple of 2 elements or None, optional (default=None) + Figure size. + dpi : int or None, optional (default=None) + Resolution of the figure. + grid : bool, optional (default=True) + Whether to add a grid for axes. + **kwargs + Other parameters passed to ``ax.bar()``. + + Returns + ------- + ax : matplotlib.axes.Axes + The plot with specified model's feature split value histogram. + """ + if MATPLOTLIB_INSTALLED: + import matplotlib.pyplot as plt + from matplotlib.ticker import MaxNLocator + else: + raise ImportError('You must install matplotlib and restart your session to plot split value histogram.') + + if isinstance(booster, LGBMModel): + booster = booster.booster_ + elif not isinstance(booster, Booster): + raise TypeError('booster must be Booster or LGBMModel.') + + hist, split_bins = booster.get_split_value_histogram(feature=feature, bins=bins, xgboost_style=False) + if np.count_nonzero(hist) == 0: + raise ValueError('Cannot plot split value histogram, ' + f'because feature {feature} was not used in splitting') + width = width_coef * (split_bins[1] - split_bins[0]) + centred = (split_bins[:-1] + split_bins[1:]) / 2 + + if ax is None: + if figsize is not None: + _check_not_tuple_of_2_elements(figsize, 'figsize') + _, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi) + + ax.bar(centred, hist, align='center', width=width, **kwargs) + + if xlim is not None: + _check_not_tuple_of_2_elements(xlim, 'xlim') + else: + range_result = split_bins[-1] - split_bins[0] + xlim = (split_bins[0] - range_result * 0.2, split_bins[-1] + range_result * 0.2) + ax.set_xlim(xlim) + + ax.yaxis.set_major_locator(MaxNLocator(integer=True)) + if ylim is not None: + _check_not_tuple_of_2_elements(ylim, 'ylim') + else: + ylim = (0, max(hist) * 1.1) + ax.set_ylim(ylim) + + if title is not None: + title = title.replace('@feature@', str(feature)) + title = title.replace('@index/name@', ('name' if isinstance(feature, str) else 'index')) + ax.set_title(title) + if xlabel is not None: + ax.set_xlabel(xlabel) + if ylabel is not None: + ax.set_ylabel(ylabel) + ax.grid(grid) + return ax + + +def plot_metric( + booster: Union[Dict, LGBMModel], + metric: Optional[str] = None, + dataset_names: Optional[List[str]] = None, + ax=None, + xlim: Optional[Tuple[float, float]] = None, + ylim: Optional[Tuple[float, float]] = None, + title: Optional[str] = 'Metric during training', + xlabel: Optional[str] = 'Iterations', + ylabel: Optional[str] = '@metric@', + figsize: Optional[Tuple[float, float]] = None, + dpi: Optional[int] = None, + grid: bool = True +) -> Any: + """Plot one metric during training. + + Parameters + ---------- + booster : dict or LGBMModel + Dictionary returned from ``lightgbm.train()`` or LGBMModel instance. + metric : str or None, optional (default=None) + The metric name to plot. 
+        Only one metric supported because different metrics have various scales.
+        If None, one metric is picked from the recorded results
+        (the most recently recorded one, since ``dict.popitem()`` is used).
+    dataset_names : list of str, or None, optional (default=None)
+        List of the dataset names which are used to calculate metric to plot.
+        If None, all datasets are used.
+    ax : matplotlib.axes.Axes or None, optional (default=None)
+        Target axes instance.
+        If None, new figure and axes will be created.
+    xlim : tuple of 2 elements or None, optional (default=None)
+        Tuple passed to ``ax.xlim()``.
+    ylim : tuple of 2 elements or None, optional (default=None)
+        Tuple passed to ``ax.ylim()``.
+    title : str or None, optional (default="Metric during training")
+        Axes title.
+        If None, title is disabled.
+    xlabel : str or None, optional (default="Iterations")
+        X-axis title label.
+        If None, title is disabled.
+    ylabel : str or None, optional (default="@metric@")
+        Y-axis title label.
+        If None, title is disabled.
+        @metric@ placeholder can be used, and it will be replaced with metric name.
+    figsize : tuple of 2 elements or None, optional (default=None)
+        Figure size.
+    dpi : int or None, optional (default=None)
+        Resolution of the figure.
+    grid : bool, optional (default=True)
+        Whether to add a grid for axes.
+
+    Returns
+    -------
+    ax : matplotlib.axes.Axes
+        The plot with the metric's history over training.
+    """
+    if MATPLOTLIB_INSTALLED:
+        import matplotlib.pyplot as plt
+    else:
+        raise ImportError('You must install matplotlib and restart your session to plot metric.')
+
+    if isinstance(booster, LGBMModel):
+        eval_results = deepcopy(booster.evals_result_)
+    elif isinstance(booster, dict):
+        eval_results = deepcopy(booster)
+    elif isinstance(booster, Booster):
+        raise TypeError("booster must be dict or LGBMModel. To use plot_metric with Booster type, "
+                        "first record the metrics using record_evaluation callback then pass that "
+                        "to plot_metric as argument `booster`")
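+        # Editor's note (illustrative, not upstream code): the workaround named in the
+        # message above looks like this, assuming `import lightgbm as lgb` and that
+        # `params`, `train_set`, and `valid_set` already exist:
+        #     evals_result = {}
+        #     lgb.train(params, train_set, valid_sets=[valid_set],
+        #               callbacks=[lgb.record_evaluation(evals_result)])
+        #     lgb.plot_metric(evals_result)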
+    else:
+        raise TypeError('booster must be dict or LGBMModel.')
+
+    num_data = len(eval_results)
+
+    if not num_data:
+        raise ValueError('eval results cannot be empty.')
+
+    if ax is None:
+        if figsize is not None:
+            _check_not_tuple_of_2_elements(figsize, 'figsize')
+        _, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
+
+    if dataset_names is None:
+        dataset_names_iter = iter(eval_results.keys())
+    elif not isinstance(dataset_names, (list, tuple, set)) or not dataset_names:
+        raise ValueError('dataset_names should be iterable and cannot be empty')
+    else:
+        dataset_names_iter = iter(dataset_names)
+
+    name = next(dataset_names_iter)  # take one as sample
+    metrics_for_one = eval_results[name]
+    num_metric = len(metrics_for_one)
+    if metric is None:
+        if num_metric > 1:
+            _log_warning("More than one metric available, picking one to plot.")
+        metric, results = metrics_for_one.popitem()
+    else:
+        if metric not in metrics_for_one:
+            raise KeyError('No given metric in eval results.')
+        results = metrics_for_one[metric]
+    num_iteration = len(results)
+    max_result = max(results)
+    min_result = min(results)
+    x_ = range(num_iteration)
+    ax.plot(x_, results, label=name)
+
+    for name in dataset_names_iter:
+        metrics_for_one = eval_results[name]
+        results = metrics_for_one[metric]
+        max_result = max(max(results), max_result)
+        min_result = min(min(results), min_result)
+        ax.plot(x_, results, label=name)
+
+    ax.legend(loc='best')
+
+    if xlim is not None:
+        _check_not_tuple_of_2_elements(xlim, 'xlim')
+    else:
+        xlim = (0, num_iteration)
+    ax.set_xlim(xlim)
+
+    if ylim is not None:
+        _check_not_tuple_of_2_elements(ylim, 'ylim')
+    else:
+        range_result = max_result - min_result
+        ylim = (min_result - range_result * 0.2, max_result + range_result * 0.2)
+    ax.set_ylim(ylim)
+
+    if title is not None:
+        ax.set_title(title)
+    if xlabel is not None:
+        ax.set_xlabel(xlabel)
+    if ylabel is not None:
+        ylabel = ylabel.replace('@metric@', metric)
+        ax.set_ylabel(ylabel)
+    ax.grid(grid)
+    return ax
+
+
+def _determine_direction_for_numeric_split(
+    fval: float,
+    threshold: float,
+    missing_type_str: str,
+    default_left: bool,
+) -> str:
+    missing_type = _MissingType(missing_type_str)
+    if math.isnan(fval) and missing_type != _MissingType.NAN:
+        fval = 0.0
+    if ((missing_type == _MissingType.ZERO and _is_zero(fval))
+            or (missing_type == _MissingType.NAN and math.isnan(fval))):
+        direction = 'left' if default_left else 'right'
+    else:
+        direction = 'left' if fval <= threshold else 'right'
+    return direction
+
+
+def _determine_direction_for_categorical_split(fval: float, thresholds: str) -> str:
+    if math.isnan(fval) or int(fval) < 0:
+        return 'right'
+    int_thresholds = {int(t) for t in thresholds.split('||')}
+    return 'left' if int(fval) in int_thresholds else 'right'
+
+
+def _to_graphviz(
+    tree_info: Dict[str, Any],
+    show_info: List[str],
+    feature_names: Union[List[str], None],
+    precision: Optional[int],
+    orientation: str,
+    constraints: Optional[List[int]],
+    example_case: Optional[Union[np.ndarray, pd_DataFrame]],
+    max_category_values: int,
+    **kwargs: Any
+) -> Any:
+    """Convert specified tree to graphviz instance.
+ + See: + - https://graphviz.readthedocs.io/en/stable/api.html#digraph + """ + if GRAPHVIZ_INSTALLED: + from graphviz import Digraph + else: + raise ImportError('You must install graphviz and restart your session to plot tree.') + + def add( + root: Dict[str, Any], + total_count: int, + parent: Optional[str], + decision: Optional[str], + highlight: bool + ) -> None: + """Recursively add node or edge.""" + fillcolor = 'white' + style = '' + tooltip = None + if highlight: + color = 'blue' + penwidth = '3' + else: + color = 'black' + penwidth = '1' + if 'split_index' in root: # non-leaf + shape = "rectangle" + l_dec = 'yes' + r_dec = 'no' + threshold = root['threshold'] + if root['decision_type'] == '<=': + operator = "≤" + elif root['decision_type'] == '==': + operator = "=" + else: + raise ValueError('Invalid decision type in tree model.') + name = f"split{root['split_index']}" + split_feature = root['split_feature'] + if feature_names is not None: + label = f"<B>{feature_names[split_feature]}</B> {operator}" + else: + label = f"feature <B>{split_feature}</B> {operator} " + direction = None + if example_case is not None: + if root['decision_type'] == '==': + direction = _determine_direction_for_categorical_split( + fval=example_case[split_feature], + thresholds=root['threshold'] + ) + else: + direction = _determine_direction_for_numeric_split( + fval=example_case[split_feature], + threshold=root['threshold'], + missing_type_str=root['missing_type'], + default_left=root['default_left'] + ) + if root['decision_type'] == '==': + category_values = root['threshold'].split('||') + if len(category_values) > max_category_values: + tooltip = root['threshold'] + threshold = '||'.join(category_values[:2]) + '||...||' + category_values[-1] + + label += f"<B>{_float2str(threshold, precision)}</B>" + for info in ['split_gain', 'internal_value', 'internal_weight', "internal_count", "data_percentage"]: + if info in show_info: + output = info.split('_')[-1] + if info in {'split_gain', 'internal_value', 'internal_weight'}: + label += f"<br/>{_float2str(root[info], precision)} {output}" + elif info == 'internal_count': + label += f"<br/>{output}: {root[info]}" + elif info == "data_percentage": + label += f"<br/>{_float2str(root['internal_count'] / total_count * 100, 2)}% of data" + + if constraints: + if constraints[root['split_feature']] == 1: + fillcolor = "#ddffdd" # light green + if constraints[root['split_feature']] == -1: + fillcolor = "#ffdddd" # light red + style = "filled" + label = f"<{label}>" + add( + root=root['left_child'], + total_count=total_count, + parent=name, + decision=l_dec, + highlight=highlight and direction == "left" + ) + add( + root=root['right_child'], + total_count=total_count, + parent=name, + decision=r_dec, + highlight=highlight and direction == "right" + ) + else: # leaf + shape = "ellipse" + name = f"leaf{root['leaf_index']}" + label = f"leaf {root['leaf_index']}: " + label += f"<B>{_float2str(root['leaf_value'], precision)}</B>" + if 'leaf_weight' in show_info: + label += f"<br/>{_float2str(root['leaf_weight'], precision)} weight" + if 'leaf_count' in show_info: + label += f"<br/>count: {root['leaf_count']}" + if "data_percentage" in show_info: + label += f"<br/>{_float2str(root['leaf_count'] / total_count * 100, 2)}% of data" + label = f"<{label}>" + graph.node(name, label=label, shape=shape, style=style, fillcolor=fillcolor, color=color, penwidth=penwidth, tooltip=tooltip) + if parent is not None: + graph.edge(parent, name, decision, color=color, penwidth=penwidth) + + 
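+    # Editor's note: the recursive `add()` above only emits nodes and edges; the
+    # Digraph object, its layout attributes, and the optional monotone-constraint
+    # legend are assembled below.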
graph = Digraph(**kwargs) + rankdir = "LR" if orientation == "horizontal" else "TB" + graph.attr("graph", nodesep="0.05", ranksep="0.3", rankdir=rankdir) + if "internal_count" in tree_info['tree_structure']: + add( + root=tree_info['tree_structure'], + total_count=tree_info['tree_structure']["internal_count"], + parent=None, + decision=None, + highlight=example_case is not None + ) + else: + raise Exception("Cannot plot trees with no split") + + if constraints: + # "#ddffdd" is light green, "#ffdddd" is light red + legend = """< + <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" CELLPADDING="4"> + <TR> + <TD COLSPAN="2"><B>Monotone constraints</B></TD> + </TR> + <TR> + <TD>Increasing</TD> + <TD BGCOLOR="#ddffdd"></TD> + </TR> + <TR> + <TD>Decreasing</TD> + <TD BGCOLOR="#ffdddd"></TD> + </TR> + </TABLE> + >""" + graph.node("legend", label=legend, shape="rectangle", color="white") + return graph + + +def create_tree_digraph( + booster: Union[Booster, LGBMModel], + tree_index: int = 0, + show_info: Optional[List[str]] = None, + precision: Optional[int] = 3, + orientation: str = 'horizontal', + example_case: Optional[Union[np.ndarray, pd_DataFrame]] = None, + max_category_values: int = 10, + **kwargs: Any +) -> Any: + """Create a digraph representation of specified tree. + + Each node in the graph represents a node in the tree. + + Non-leaf nodes have labels like ``Column_10 <= 875.9``, which means + "this node splits on the feature named "Column_10", with threshold 875.9". + + Leaf nodes have labels like ``leaf 2: 0.422``, which means "this node is a + leaf node, and the predicted value for records that fall into this node + is 0.422". The number (``2``) is an internal unique identifier and doesn't + have any special meaning. + + .. note:: + + For more information please visit + https://graphviz.readthedocs.io/en/stable/api.html#digraph. + + Parameters + ---------- + booster : Booster or LGBMModel + Booster or LGBMModel instance to be converted. + tree_index : int, optional (default=0) + The index of a target tree to convert. + show_info : list of str, or None, optional (default=None) + What information should be shown in nodes. + + - ``'split_gain'`` : gain from adding this split to the model + - ``'internal_value'`` : raw predicted value that would be produced by this node if it was a leaf node + - ``'internal_count'`` : number of records from the training data that fall into this non-leaf node + - ``'internal_weight'`` : total weight of all nodes that fall into this non-leaf node + - ``'leaf_count'`` : number of records from the training data that fall into this leaf node + - ``'leaf_weight'`` : total weight (sum of Hessian) of all observations that fall into this leaf node + - ``'data_percentage'`` : percentage of training data that fall into this node + precision : int or None, optional (default=3) + Used to restrict the display of floating point values to a certain precision. + orientation : str, optional (default='horizontal') + Orientation of the tree. + Can be 'horizontal' or 'vertical'. + example_case : numpy 2-D array, pandas DataFrame or None, optional (default=None) + Single row with the same structure as the training data. + If not None, the plot will highlight the path that sample takes through the tree. + + .. versionadded:: 4.0.0 + + max_category_values : int, optional (default=10) + The maximum number of category values to display in tree nodes, if the number of thresholds is greater than this value, thresholds will be collapsed and displayed on the label tooltip instead. 
+ + .. warning:: + + Consider wrapping the SVG string of the tree graph with ``IPython.display.HTML`` when running on JupyterLab to get the `tooltip <https://graphviz.org/docs/attrs/tooltip>`_ working right. + + Example: + + .. code-block:: python + + from IPython.display import HTML + + graph = lgb.create_tree_digraph(clf, max_category_values=5) + HTML(graph._repr_image_svg_xml()) + + .. versionadded:: 4.0.0 + + **kwargs + Other parameters passed to ``Digraph`` constructor. + Check https://graphviz.readthedocs.io/en/stable/api.html#digraph for the full list of supported parameters. + + Returns + ------- + graph : graphviz.Digraph + The digraph representation of specified tree. + """ + if isinstance(booster, LGBMModel): + booster = booster.booster_ + elif not isinstance(booster, Booster): + raise TypeError('booster must be Booster or LGBMModel.') + + model = booster.dump_model() + tree_infos = model['tree_info'] + feature_names = model.get('feature_names', None) + monotone_constraints = model.get('monotone_constraints', None) + + if tree_index < len(tree_infos): + tree_info = tree_infos[tree_index] + else: + raise IndexError('tree_index is out of range.') + + if show_info is None: + show_info = [] + + if example_case is not None: + if not isinstance(example_case, (np.ndarray, pd_DataFrame)) or example_case.ndim != 2: + raise ValueError('example_case must be a numpy 2-D array or a pandas DataFrame') + if example_case.shape[0] != 1: + raise ValueError('example_case must have a single row.') + if isinstance(example_case, pd_DataFrame): + example_case = _data_from_pandas( + data=example_case, + feature_name="auto", + categorical_feature="auto", + pandas_categorical=booster.pandas_categorical + )[0] + example_case = example_case[0] + + return _to_graphviz( + tree_info=tree_info, + show_info=show_info, + feature_names=feature_names, + precision=precision, + orientation=orientation, + constraints=monotone_constraints, + example_case=example_case, + max_category_values=max_category_values, + **kwargs + ) + + +def plot_tree( + booster: Union[Booster, LGBMModel], + ax=None, + tree_index: int = 0, + figsize: Optional[Tuple[float, float]] = None, + dpi: Optional[int] = None, + show_info: Optional[List[str]] = None, + precision: Optional[int] = 3, + orientation: str = 'horizontal', + example_case: Optional[Union[np.ndarray, pd_DataFrame]] = None, + **kwargs: Any +) -> Any: + """Plot specified tree. + + Each node in the graph represents a node in the tree. + + Non-leaf nodes have labels like ``Column_10 <= 875.9``, which means + "this node splits on the feature named "Column_10", with threshold 875.9". + + Leaf nodes have labels like ``leaf 2: 0.422``, which means "this node is a + leaf node, and the predicted value for records that fall into this node + is 0.422". The number (``2``) is an internal unique identifier and doesn't + have any special meaning. + + .. note:: + + It is preferable to use ``create_tree_digraph()`` because of its lossless quality + and returned objects can be also rendered and displayed directly inside a Jupyter notebook. + + Parameters + ---------- + booster : Booster or LGBMModel + Booster or LGBMModel instance to be plotted. + ax : matplotlib.axes.Axes or None, optional (default=None) + Target axes instance. + If None, new figure and axes will be created. + tree_index : int, optional (default=0) + The index of a target tree to plot. + figsize : tuple of 2 elements or None, optional (default=None) + Figure size. 
+ dpi : int or None, optional (default=None) + Resolution of the figure. + show_info : list of str, or None, optional (default=None) + What information should be shown in nodes. + + - ``'split_gain'`` : gain from adding this split to the model + - ``'internal_value'`` : raw predicted value that would be produced by this node if it was a leaf node + - ``'internal_count'`` : number of records from the training data that fall into this non-leaf node + - ``'internal_weight'`` : total weight of all nodes that fall into this non-leaf node + - ``'leaf_count'`` : number of records from the training data that fall into this leaf node + - ``'leaf_weight'`` : total weight (sum of Hessian) of all observations that fall into this leaf node + - ``'data_percentage'`` : percentage of training data that fall into this node + precision : int or None, optional (default=3) + Used to restrict the display of floating point values to a certain precision. + orientation : str, optional (default='horizontal') + Orientation of the tree. + Can be 'horizontal' or 'vertical'. + example_case : numpy 2-D array, pandas DataFrame or None, optional (default=None) + Single row with the same structure as the training data. + If not None, the plot will highlight the path that sample takes through the tree. + + .. versionadded:: 4.0.0 + + **kwargs + Other parameters passed to ``Digraph`` constructor. + Check https://graphviz.readthedocs.io/en/stable/api.html#digraph for the full list of supported parameters. + + Returns + ------- + ax : matplotlib.axes.Axes + The plot with single tree. + """ + if MATPLOTLIB_INSTALLED: + import matplotlib.image as image + import matplotlib.pyplot as plt + else: + raise ImportError('You must install matplotlib and restart your session to plot tree.') + + if ax is None: + if figsize is not None: + _check_not_tuple_of_2_elements(figsize, 'figsize') + _, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi) + + graph = create_tree_digraph(booster=booster, tree_index=tree_index, + show_info=show_info, precision=precision, + orientation=orientation, example_case=example_case, **kwargs) + + s = BytesIO() + s.write(graph.pipe(format='png')) + s.seek(0) + img = image.imread(s) + + ax.imshow(img) + ax.axis('off') + return ax diff --git a/ext/lightgbm/py.typed b/ext/lightgbm/py.typed new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/ext/lightgbm/py.typed diff --git a/ext/lightgbm/sklearn.py b/ext/lightgbm/sklearn.py new file mode 100644 index 0000000..120a666 --- /dev/null +++ b/ext/lightgbm/sklearn.py @@ -0,0 +1,1370 @@ +# coding: utf-8 +"""Scikit-learn wrapper interface for LightGBM.""" +import copy +from inspect import signature +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import scipy.sparse + +from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType, + _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, + _LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning) +from .callback import _EvalResultDict, record_evaluation +from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray, + _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase, + _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, + dt_DataTable, np_random_Generator, pd_DataFrame) +from .engine import train + 
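+# Editor's example (illustrative, not part of the upstream import): the estimators
+# defined below follow scikit-learn conventions, so typical usage looks like
+#
+#     from lightgbm import LGBMClassifier
+#
+#     clf = LGBMClassifier(n_estimators=100, num_leaves=31)
+#     clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
+#     proba = clf.predict_proba(X_valid)
+#
+# where `X_train`, `y_train`, `X_valid`, and `y_valid` are assumed array-likes.
+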
+__all__ = [ + 'LGBMClassifier', + 'LGBMModel', + 'LGBMRanker', + 'LGBMRegressor', +] + +_LGBM_ScikitMatrixLike = Union[ + dt_DataTable, + List[Union[List[float], List[int]]], + np.ndarray, + pd_DataFrame, + scipy.sparse.spmatrix +] +_LGBM_ScikitCustomObjectiveFunction = Union[ + # f(labels, preds) + Callable[ + [Optional[np.ndarray], np.ndarray], + Tuple[np.ndarray, np.ndarray] + ], + # f(labels, preds, weights) + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], + Tuple[np.ndarray, np.ndarray] + ], + # f(labels, preds, weights, group) + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + Tuple[np.ndarray, np.ndarray] + ], +] +_LGBM_ScikitCustomEvalFunction = Union[ + # f(labels, preds) + Callable[ + [Optional[np.ndarray], np.ndarray], + _LGBM_EvalFunctionResultType + ], + Callable[ + [Optional[np.ndarray], np.ndarray], + List[_LGBM_EvalFunctionResultType] + ], + # f(labels, preds, weights) + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], + _LGBM_EvalFunctionResultType + ], + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], + List[_LGBM_EvalFunctionResultType] + ], + # f(labels, preds, weights, group) + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + _LGBM_EvalFunctionResultType + ], + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + List[_LGBM_EvalFunctionResultType] + ] +] +_LGBM_ScikitEvalMetricType = Union[ + str, + _LGBM_ScikitCustomEvalFunction, + List[Union[str, _LGBM_ScikitCustomEvalFunction]] +] +_LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType] + + +def _get_group_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarray]: + group = dataset.get_group() + error_msg = ( + "Estimators in lightgbm.sklearn should only retrieve query groups from a constructed Dataset. " + "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues." + ) + assert (group is None or isinstance(group, np.ndarray)), error_msg + return group + + +def _get_label_from_constructed_dataset(dataset: Dataset) -> np.ndarray: + label = dataset.get_label() + error_msg = ( + "Estimators in lightgbm.sklearn should only retrieve labels from a constructed Dataset. " + "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues." + ) + assert isinstance(label, np.ndarray), error_msg + return label + + +def _get_weight_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarray]: + weight = dataset.get_weight() + error_msg = ( + "Estimators in lightgbm.sklearn should only retrieve weights from a constructed Dataset. " + "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues." + ) + assert (weight is None or isinstance(weight, np.ndarray)), error_msg + return weight + + +class _ObjectiveFunctionWrapper: + """Proxy class for objective function.""" + + def __init__(self, func: _LGBM_ScikitCustomObjectiveFunction): + """Construct a proxy class. + + This class transforms objective function to match objective function with signature ``new_func(preds, dataset)`` + as expected by ``lightgbm.engine.train``. 
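+
+        For example (editor's illustration, not upstream documentation), a custom
+        least-squares objective matching the two-argument signature below would be:
+
+        .. code-block:: python
+
+            import numpy as np
+
+            def l2_objective(y_true, y_pred):
+                # gradient and Hessian of 0.5 * (y_pred - y_true) ** 2
+                grad = y_pred - y_true
+                hess = np.ones_like(y_pred)
+                return grad, hess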
+ + Parameters + ---------- + func : callable + Expects a callable with following signatures: + ``func(y_true, y_pred)``, + ``func(y_true, y_pred, weight)`` + or ``func(y_true, y_pred, weight, group)`` + and returns (grad, hess): + + y_true : numpy 1-D array of shape = [n_samples] + The target values. + y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) + The predicted values. + Predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. + group : numpy 1-D array + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape [n_samples, n_classes] (for multi-class task) + The value of the first order derivative (gradient) of the loss + with respect to the elements of y_pred for each sample point. + hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) + The value of the second order derivative (Hessian) of the loss + with respect to the elements of y_pred for each sample point. + + .. note:: + + For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes], + and grad and hess should be returned in the same format. + """ + self.func = func + + def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np.ndarray]: + """Call passed function with appropriate arguments. + + Parameters + ---------- + preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) + The predicted values. + dataset : Dataset + The training dataset. + + Returns + ------- + grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) + The value of the first order derivative (gradient) of the loss + with respect to the elements of preds for each sample point. + hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) + The value of the second order derivative (Hessian) of the loss + with respect to the elements of preds for each sample point. + """ + labels = _get_label_from_constructed_dataset(dataset) + argc = len(signature(self.func).parameters) + if argc == 2: + grad, hess = self.func(labels, preds) # type: ignore[call-arg] + return grad, hess + + weight = _get_weight_from_constructed_dataset(dataset) + if argc == 3: + grad, hess = self.func(labels, preds, weight) # type: ignore[call-arg] + return grad, hess + + if argc == 4: + group = _get_group_from_constructed_dataset(dataset) + return self.func(labels, preds, weight, group) # type: ignore[call-arg] + + raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}") + + +class _EvalFunctionWrapper: + """Proxy class for evaluation function.""" + + def __init__(self, func: _LGBM_ScikitCustomEvalFunction): + """Construct a proxy class. 
+ + This class transforms evaluation function to match evaluation function with signature ``new_func(preds, dataset)`` + as expected by ``lightgbm.engine.train``. + + Parameters + ---------- + func : callable + Expects a callable with following signatures: + ``func(y_true, y_pred)``, + ``func(y_true, y_pred, weight)`` + or ``func(y_true, y_pred, weight, group)`` + and returns (eval_name, eval_result, is_higher_better) or + list of (eval_name, eval_result, is_higher_better): + + y_true : numpy 1-D array of shape = [n_samples] + The target values. + y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array shape = [n_samples, n_classes] (for multi-class task) + The predicted values. + In case of custom ``objective``, predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task in this case. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. + group : numpy 1-D array + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + eval_name : str + The name of evaluation function (without whitespace). + eval_result : float + The eval result. + is_higher_better : bool + Is eval result higher better, e.g. AUC is ``is_higher_better``. + """ + self.func = func + + def __call__( + self, + preds: np.ndarray, + dataset: Dataset + ) -> Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]: + """Call passed function with appropriate arguments. + + Parameters + ---------- + preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) + The predicted values. + dataset : Dataset + The training dataset. + + Returns + ------- + eval_name : str + The name of evaluation function (without whitespace). + eval_result : float + The eval result. + is_higher_better : bool + Is eval result higher better, e.g. AUC is ``is_higher_better``. + """ + labels = _get_label_from_constructed_dataset(dataset) + argc = len(signature(self.func).parameters) + if argc == 2: + return self.func(labels, preds) # type: ignore[call-arg] + + weight = _get_weight_from_constructed_dataset(dataset) + if argc == 3: + return self.func(labels, preds, weight) # type: ignore[call-arg] + + if argc == 4: + group = _get_group_from_constructed_dataset(dataset) + return self.func(labels, preds, weight, group) # type: ignore[call-arg] + + raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}") + + +# documentation templates for LGBMModel methods are shared between the classes in +# this module and those in the ``dask`` module + +_lgbmmodel_doc_fit = ( + """ + Build a gradient boosting model from the training set (X, y). + + Parameters + ---------- + X : {X_shape} + Input feature matrix. + y : {y_shape} + The target values (class labels in classification, real numbers in regression). + sample_weight : {sample_weight_shape} + Weights of training data. Weights should be non-negative. + init_score : {init_score_shape} + Init score of training data. + group : {group_shape} + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. 
+ For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + eval_set : list or None, optional (default=None) + A list of (X, y) tuple pairs to use as validation sets. + eval_names : list of str, or None, optional (default=None) + Names of eval_set. + eval_sample_weight : {eval_sample_weight_shape} + Weights of eval data. Weights should be non-negative. + eval_class_weight : list or None, optional (default=None) + Class weights of eval data. + eval_init_score : {eval_init_score_shape} + Init score of eval data. + eval_group : {eval_group_shape} + Group data of eval data. + eval_metric : str, callable, list or None, optional (default=None) + If str, it should be a built-in evaluation metric to use. + If callable, it should be a custom evaluation metric, see note below for more details. + If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both. + In either case, the ``metric`` from the model parameters will be evaluated and used as well. + Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker. + feature_name : list of str, or 'auto', optional (default='auto') + Feature names. + If 'auto' and data is pandas DataFrame, data columns names are used. + categorical_feature : list of str or int, or 'auto', optional (default='auto') + Categorical features. + If list of int, interpreted as indices. + If list of str, interpreted as feature names (need to specify ``feature_name`` as well). + If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. + All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647). + Large values could be memory consuming. Consider using consecutive integers starting from zero. + All negative values in categorical features will be treated as missing values. + The output cannot be monotonically constrained with respect to a categorical feature. + Floating point numbers in categorical features will be rounded towards 0. + callbacks : list of callable, or None, optional (default=None) + List of callback functions that are applied at each iteration. + See Callbacks in Python API for more information. + init_model : str, pathlib.Path, Booster, LGBMModel or None, optional (default=None) + Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training. + + Returns + ------- + self : LGBMModel + Returns self. + """ +) + +_lgbmmodel_doc_custom_eval_note = """ + Note + ---- + Custom eval function expects a callable with following signatures: + ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or + ``func(y_true, y_pred, weight, group)`` + and returns (eval_name, eval_result, is_higher_better) or + list of (eval_name, eval_result, is_higher_better): + + y_true : numpy 1-D array of shape = [n_samples] + The target values. + y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task) + The predicted values. + In case of custom ``objective``, predicted values are returned before any transformation, + e.g. they are raw margin instead of probability of positive class for binary task in this case. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. 
+ group : numpy 1-D array + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + eval_name : str + The name of evaluation function (without whitespace). + eval_result : float + The eval result. + is_higher_better : bool + Is eval result higher better, e.g. AUC is ``is_higher_better``. +""" + +_lgbmmodel_doc_predict = ( + """ + {description} + + Parameters + ---------- + X : {X_shape} + Input features matrix. + raw_score : bool, optional (default=False) + Whether to predict raw scores. + start_iteration : int, optional (default=0) + Start index of the iteration to predict. + If <= 0, starts from the first iteration. + num_iteration : int or None, optional (default=None) + Total number of iterations used in the prediction. + If None, if the best iteration exists and start_iteration <= 0, the best iteration is used; + otherwise, all iterations from ``start_iteration`` are used (no limits). + If <= 0, all iterations from ``start_iteration`` are used (no limits). + pred_leaf : bool, optional (default=False) + Whether to predict leaf index. + pred_contrib : bool, optional (default=False) + Whether to predict feature contributions. + + .. note:: + + If you want to get more explanations for your model's predictions using SHAP values, + like SHAP interaction values, + you can install the shap package (https://github.com/slundberg/shap). + Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra + column, where the last column is the expected value. + + validate_features : bool, optional (default=False) + If True, ensure that the features used to predict match the ones used to train. + Used only if data is pandas DataFrame. + **kwargs + Other parameters for the prediction. + + Returns + ------- + {output_name} : {predicted_result_shape} + The predicted values. + X_leaves : {X_leaves_shape} + If ``pred_leaf=True``, the predicted leaf of every tree for each sample. + X_SHAP_values : {X_SHAP_values_shape} + If ``pred_contrib=True``, the feature contributions for each sample. + """ +) + + +class LGBMModel(_LGBMModelBase): + """Implementation of the scikit-learn API for LightGBM.""" + + def __init__( + self, + boosting_type: str = 'gbdt', + num_leaves: int = 31, + max_depth: int = -1, + learning_rate: float = 0.1, + n_estimators: int = 100, + subsample_for_bin: int = 200000, + objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, + class_weight: Optional[Union[Dict, str]] = None, + min_split_gain: float = 0., + min_child_weight: float = 1e-3, + min_child_samples: int = 20, + subsample: float = 1., + subsample_freq: int = 0, + colsample_bytree: float = 1., + reg_alpha: float = 0., + reg_lambda: float = 0., + random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, + n_jobs: Optional[int] = None, + importance_type: str = 'split', + **kwargs + ): + r"""Construct a gradient boosting model. + + Parameters + ---------- + boosting_type : str, optional (default='gbdt') + 'gbdt', traditional Gradient Boosting Decision Tree. + 'dart', Dropouts meet Multiple Additive Regression Trees. + 'rf', Random Forest. + num_leaves : int, optional (default=31) + Maximum tree leaves for base learners. 
+    max_depth : int, optional (default=-1)
+        Maximum tree depth for base learners, <=0 means no limit.
+    learning_rate : float, optional (default=0.1)
+        Boosting learning rate.
+        You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
+        in training using ``reset_parameter`` callback.
+        Note, that this will ignore the ``learning_rate`` argument in training.
+    n_estimators : int, optional (default=100)
+        Number of boosted trees to fit.
+    subsample_for_bin : int, optional (default=200000)
+        Number of samples for constructing bins.
+    objective : str, callable or None, optional (default=None)
+        Specify the learning task and the corresponding learning objective or
+        a custom objective function to be used (see note below).
+        Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
+    class_weight : dict, 'balanced' or None, optional (default=None)
+        Weights associated with classes in the form ``{class_label: weight}``.
+        Use this parameter only for multi-class classification task;
+        for binary classification task you may use ``is_unbalance`` or ``scale_pos_weight`` parameters.
+        Note, that the usage of all these parameters will result in poor estimates of the individual class probabilities.
+        You may want to consider performing probability calibration
+        (https://scikit-learn.org/stable/modules/calibration.html) of your model.
+        The 'balanced' mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.
+        If None, all classes are supposed to have weight one.
+        Note, that these weights will be multiplied with ``sample_weight`` (passed through the ``fit`` method)
+        if ``sample_weight`` is specified.
+    min_split_gain : float, optional (default=0.)
+        Minimum loss reduction required to make a further partition on a leaf node of the tree.
+    min_child_weight : float, optional (default=1e-3)
+        Minimum sum of instance weight (Hessian) needed in a child (leaf).
+    min_child_samples : int, optional (default=20)
+        Minimum number of data needed in a child (leaf).
+    subsample : float, optional (default=1.)
+        Subsample ratio of the training instance.
+    subsample_freq : int, optional (default=0)
+        Frequency of subsample, <=0 means disabled.
+    colsample_bytree : float, optional (default=1.)
+        Subsample ratio of columns when constructing each tree.
+    reg_alpha : float, optional (default=0.)
+        L1 regularization term on weights.
+    reg_lambda : float, optional (default=0.)
+        L2 regularization term on weights.
+    random_state : int, RandomState object or None, optional (default=None)
+        Random number seed.
+        If int, this number is used to seed the C++ code.
+        If RandomState or Generator object (numpy), a random integer is picked based on its state to seed the C++ code.
+        If None, default seeds in C++ code are used.
+    n_jobs : int or None, optional (default=None)
+        Number of parallel threads to use for training (can be changed at prediction time by
+        passing it as an extra keyword argument).
+
+        For better performance, it is recommended to set this to the number of physical cores
+        in the CPU.
+
+        Negative integers are interpreted as following joblib's formula (n_cpus + 1 + n_jobs), just like
+        scikit-learn (so e.g. -1 means using all threads). A value of zero corresponds to the default number of
+        threads configured for OpenMP in the system.
+        A value of ``None`` (the default) corresponds
+        to using the number of physical cores in the system (its correct detection requires
+        either the ``joblib`` or the ``psutil`` util libraries to be installed).
+
+        .. versionchanged:: 4.0.0
+
+    importance_type : str, optional (default='split')
+        The type of feature importance to be filled into ``feature_importances_``.
+        If 'split', result contains numbers of times the feature is used in a model.
+        If 'gain', result contains total gains of splits which use the feature.
+    **kwargs
+        Other parameters for the model.
+        Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
+
+        .. warning::
+
+            \*\*kwargs is not supported in sklearn, so it may cause unexpected issues.
+
+    Note
+    ----
+    A custom objective function can be provided for the ``objective`` parameter.
+    In this case, it should have the signature
+    ``objective(y_true, y_pred) -> grad, hess``,
+    ``objective(y_true, y_pred, weight) -> grad, hess``
+    or ``objective(y_true, y_pred, weight, group) -> grad, hess``:
+
+        y_true : numpy 1-D array of shape = [n_samples]
+            The target values.
+        y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+            The predicted values.
+            Predicted values are returned before any transformation,
+            e.g. they are raw margin instead of probability of positive class for binary task.
+        weight : numpy 1-D array of shape = [n_samples]
+            The weight of samples. Weights should be non-negative.
+        group : numpy 1-D array
+            Group/query data.
+            Only used in the learning-to-rank task.
+            sum(group) = n_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+        grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+            The value of the first order derivative (gradient) of the loss
+            with respect to the elements of y_pred for each sample point.
+        hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
+            The value of the second order derivative (Hessian) of the loss
+            with respect to the elements of y_pred for each sample point.
+
+    For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
+    and grad and hess should be returned in the same format.
+    """
+    if not SKLEARN_INSTALLED:
+        raise LightGBMError('scikit-learn is required for lightgbm.sklearn. '
+                            'You must install scikit-learn and restart your session to use this module.')
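+
+        # Editor's note: each constructor argument below is stored on `self` unmodified
+        # so that scikit-learn's get_params()/set_params() and estimator cloning work;
+        # validation and alias resolution are deferred to fit()/_process_params().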
+
+        self.boosting_type = boosting_type
+        self.objective = objective
+        self.num_leaves = num_leaves
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+        self.n_estimators = n_estimators
+        self.subsample_for_bin = subsample_for_bin
+        self.min_split_gain = min_split_gain
+        self.min_child_weight = min_child_weight
+        self.min_child_samples = min_child_samples
+        self.subsample = subsample
+        self.subsample_freq = subsample_freq
+        self.colsample_bytree = colsample_bytree
+        self.reg_alpha = reg_alpha
+        self.reg_lambda = reg_lambda
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+        self.importance_type = importance_type
+        self._Booster: Optional[Booster] = None
+        self._evals_result: _EvalResultDict = {}
+        self._best_score: _LGBM_BoosterBestScoreType = {}
+        self._best_iteration: int = -1
+        self._other_params: Dict[str, Any] = {}
+        self._objective = objective
+        self.class_weight = class_weight
+        self._class_weight: Optional[Union[Dict, str]] = None
+        self._class_map: Optional[Dict[int, int]] = None
+        self._n_features: int = -1
+        self._n_features_in: int = -1
+        self._classes: Optional[np.ndarray] = None
+        self._n_classes: int = -1
+        self.set_params(**kwargs)
+
+    def _more_tags(self) -> Dict[str, Any]:
+        return {
+            'allow_nan': True,
+            'X_types': ['2darray', 'sparse', '1dlabels'],
+            '_xfail_checks': {
+                'check_no_attributes_set_in_init':
+                'scikit-learn incorrectly asserts that private attributes '
+                'cannot be set in __init__: '
+                '(see https://github.com/microsoft/LightGBM/issues/2628)'
+            }
+        }
+
+    def __sklearn_is_fitted__(self) -> bool:
+        return getattr(self, "fitted_", False)
+
+    def get_params(self, deep: bool = True) -> Dict[str, Any]:
+        """Get parameters for this estimator.
+
+        Parameters
+        ----------
+        deep : bool, optional (default=True)
+            If True, will return the parameters for this estimator and
+            contained subobjects that are estimators.
+
+        Returns
+        -------
+        params : dict
+            Parameter names mapped to their values.
+        """
+        params = super().get_params(deep=deep)
+        params.update(self._other_params)
+        return params
+
+    def set_params(self, **params: Any) -> "LGBMModel":
+        """Set the parameters of this estimator.
+
+        Parameters
+        ----------
+        **params
+            Parameter names with their new values.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        for key, value in params.items():
+            setattr(self, key, value)
+            if hasattr(self, f"_{key}"):
+                setattr(self, f"_{key}", value)
+            self._other_params[key] = value
+        return self
+
+    def _process_params(self, stage: str) -> Dict[str, Any]:
+        """Process the parameters of this estimator based on its type, parameter aliases, etc.
+
+        Parameters
+        ----------
+        stage : str
+            Name of the stage (can be ``fit`` or ``predict``) this method is called from.
+
+        Returns
+        -------
+        processed_params : dict
+            Processed parameter names mapped to their values.
+        """
+        assert stage in {"fit", "predict"}
+        params = self.get_params()
+
+        params.pop('objective', None)
+        for alias in _ConfigAliases.get('objective'):
+            if alias in params:
+                obj = params.pop(alias)
+                _log_warning(f"Found '{alias}' in params. Will use it instead of 'objective' argument")
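+                # Editor's note: an alias passed via **kwargs takes precedence over the
+                # `objective` constructor argument at this point.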
+                if stage == "fit":
+                    self._objective = obj
+        if stage == "fit":
+            if self._objective is None:
+                if isinstance(self, LGBMRegressor):
+                    self._objective = "regression"
+                elif isinstance(self, LGBMClassifier):
+                    if self._n_classes > 2:
+                        self._objective = "multiclass"
+                    else:
+                        self._objective = "binary"
+                elif isinstance(self, LGBMRanker):
+                    self._objective = "lambdarank"
+                else:
+                    raise ValueError("Unknown LGBMModel type.")
+        if callable(self._objective):
+            if stage == "fit":
+                params['objective'] = _ObjectiveFunctionWrapper(self._objective)
+            else:
+                params['objective'] = 'None'
+        else:
+            params['objective'] = self._objective
+
+        params.pop('importance_type', None)
+        params.pop('n_estimators', None)
+        params.pop('class_weight', None)
+
+        if isinstance(params['random_state'], np.random.RandomState):
+            params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max)
+        elif isinstance(params['random_state'], np_random_Generator):
+            params['random_state'] = int(
+                params['random_state'].integers(np.iinfo(np.int32).max)
+            )
+        if self._n_classes > 2:
+            for alias in _ConfigAliases.get('num_class'):
+                params.pop(alias, None)
+            params['num_class'] = self._n_classes
+        if hasattr(self, '_eval_at'):
+            eval_at = self._eval_at
+            for alias in _ConfigAliases.get('eval_at'):
+                if alias in params:
+                    _log_warning(f"Found '{alias}' in params. Will use it instead of 'eval_at' argument")
+                    eval_at = params.pop(alias)
+            params['eval_at'] = eval_at
+
+        # register default metric for consistency with callable eval_metric case
+        original_metric = self._objective if isinstance(self._objective, str) else None
+        if original_metric is None:
+            # try to deduce from class instance
+            if isinstance(self, LGBMRegressor):
+                original_metric = "l2"
+            elif isinstance(self, LGBMClassifier):
+                original_metric = "multi_logloss" if self._n_classes > 2 else "binary_logloss"
+            elif isinstance(self, LGBMRanker):
+                original_metric = "ndcg"
+
+        # overwrite the default metric with an explicitly set metric
+        params = _choose_param_value("metric", params, original_metric)
+
+        # use joblib conventions for negative n_jobs, just like scikit-learn
+        # at predict time, this is handled later due to the order of parameter updates
+        if stage == "fit":
+            params = _choose_param_value("num_threads", params, self.n_jobs)
+            params["num_threads"] = self._process_n_jobs(params["num_threads"])
+
+        return params
+
+    def _process_n_jobs(self, n_jobs: Optional[int]) -> int:
+        """Convert special values of n_jobs to their actual values according to the formulas that apply.
+
+        Parameters
+        ----------
+        n_jobs : int or None
+            The original value of n_jobs, potentially having special values such as 'None' or
+            negative integers.
+
+        Returns
+        -------
+        n_jobs : int
+            The value of n_jobs with special values converted to actual number of threads.
+ """ + if n_jobs is None: + n_jobs = _LGBMCpuCount(only_physical_cores=True) + elif n_jobs < 0: + n_jobs = max(_LGBMCpuCount(only_physical_cores=False) + 1 + n_jobs, 1) + return n_jobs + + def fit( + self, + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + sample_weight: Optional[_LGBM_WeightType] = None, + init_score: Optional[_LGBM_InitScoreType] = None, + group: Optional[_LGBM_GroupType] = None, + eval_set: Optional[List[_LGBM_ScikitValidSet]] = None, + eval_names: Optional[List[str]] = None, + eval_sample_weight: Optional[List[_LGBM_WeightType]] = None, + eval_class_weight: Optional[List[float]] = None, + eval_init_score: Optional[List[_LGBM_InitScoreType]] = None, + eval_group: Optional[List[_LGBM_GroupType]] = None, + eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, + feature_name: _LGBM_FeatureNameConfiguration = 'auto', + categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + callbacks: Optional[List[Callable]] = None, + init_model: Optional[Union[str, Path, Booster, "LGBMModel"]] = None + ) -> "LGBMModel": + """Docstring is set after definition, using a template.""" + params = self._process_params(stage="fit") + + # Do not modify original args in fit function + # Refer to https://github.com/microsoft/LightGBM/pull/2619 + eval_metric_list: List[Union[str, _LGBM_ScikitCustomEvalFunction]] + if eval_metric is None: + eval_metric_list = [] + elif isinstance(eval_metric, list): + eval_metric_list = copy.deepcopy(eval_metric) + else: + eval_metric_list = [copy.deepcopy(eval_metric)] + + # Separate built-in from callable evaluation metrics + eval_metrics_callable = [_EvalFunctionWrapper(f) for f in eval_metric_list if callable(f)] + eval_metrics_builtin = [m for m in eval_metric_list if isinstance(m, str)] + + # concatenate metric from params (or default if not provided in params) and eval_metric + params['metric'] = [params['metric']] if isinstance(params['metric'], (str, type(None))) else params['metric'] + params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric'] + params['metric'] = [metric for metric in params['metric'] if metric is not None] + + if not isinstance(X, (pd_DataFrame, dt_DataTable)): + _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2) + if sample_weight is not None: + sample_weight = _LGBMCheckSampleWeight(sample_weight, _X) + else: + _X, _y = X, y + + if self._class_weight is None: + self._class_weight = self.class_weight + if self._class_weight is not None: + class_sample_weight = _LGBMComputeSampleWeight(self._class_weight, y) + if sample_weight is None or len(sample_weight) == 0: + sample_weight = class_sample_weight + else: + sample_weight = np.multiply(sample_weight, class_sample_weight) + + self._n_features = _X.shape[1] + # copy for consistency + self._n_features_in = self._n_features + + train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group, + init_score=init_score, categorical_feature=categorical_feature, + params=params) + + valid_sets: List[Dataset] = [] + if eval_set is not None: + + def _get_meta_data(collection, name, i): + if collection is None: + return None + elif isinstance(collection, list): + return collection[i] if len(collection) > i else None + elif isinstance(collection, dict): + return collection.get(i, None) + else: + raise TypeError(f"{name} should be dict or list") + + if isinstance(eval_set, tuple): + eval_set = [eval_set] + for i, valid_data in enumerate(eval_set): + # reduce cost for prediction training 
+                if valid_data[0] is X and valid_data[1] is y:
+                    valid_set = train_set
+                else:
+                    valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
+                    valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
+                    if valid_class_weight is not None:
+                        if isinstance(valid_class_weight, dict) and self._class_map is not None:
+                            valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
+                        valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
+                        if valid_weight is None or len(valid_weight) == 0:
+                            valid_weight = valid_class_sample_weight
+                        else:
+                            valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
+                    valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
+                    valid_group = _get_meta_data(eval_group, 'eval_group', i)
+                    valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
+                                        group=valid_group, init_score=valid_init_score,
+                                        categorical_feature='auto', params=params)
+
+                valid_sets.append(valid_set)
+
+        if isinstance(init_model, LGBMModel):
+            init_model = init_model.booster_
+
+        if callbacks is None:
+            callbacks = []
+        else:
+            callbacks = copy.copy(callbacks)  # don't use deepcopy here to allow non-serializable objects
+
+        evals_result: _EvalResultDict = {}
+        callbacks.append(record_evaluation(evals_result))
+
+        self._Booster = train(
+            params=params,
+            train_set=train_set,
+            num_boost_round=self.n_estimators,
+            valid_sets=valid_sets,
+            valid_names=eval_names,
+            feval=eval_metrics_callable,  # type: ignore[arg-type]
+            init_model=init_model,
+            feature_name=feature_name,
+            callbacks=callbacks
+        )
+
+        self._evals_result = evals_result
+        self._best_iteration = self._Booster.best_iteration
+        self._best_score = self._Booster.best_score
+
+        self.fitted_ = True
+
+        # free dataset
+        self._Booster.free_dataset()
+        del train_set, valid_sets
+        return self
+
+    fit.__doc__ = _lgbmmodel_doc_fit.format(
+        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+        y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
+        sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
+        init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
+        group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
+        eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)",
+        eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)",
+        eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)"
+    ) + "\n\n" + _lgbmmodel_doc_custom_eval_note
+
+    def predict(
+        self,
+        X: _LGBM_ScikitMatrixLike,
+        raw_score: bool = False,
+        start_iteration: int = 0,
+        num_iteration: Optional[int] = None,
+        pred_leaf: bool = False,
+        pred_contrib: bool = False,
+        validate_features: bool = False,
+        **kwargs: Any
+    ):
+        """Docstring is set after definition, using a template."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
+        if not isinstance(X, (pd_DataFrame, dt_DataTable)):
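+            # Editor's note (interpretation, not an upstream comment): pandas/datatable
+            # inputs bypass scikit-learn's check_array so LightGBM's own converters can
+            # keep column names and categorical dtypes intact.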
+            X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
+        n_features = X.shape[1]
+        if self._n_features != n_features:
+            raise ValueError("Number of features of the model must "
+                             f"match the input. Model n_features_ is {self._n_features} and "
+                             f"input n_features is {n_features}")
+        # retrieve original params that may be used in both training and prediction
+        # and then overwrite them (considering aliases) with params that were passed directly in prediction
+        predict_params = self._process_params(stage="predict")
+        for alias in _ConfigAliases.get_by_alias(
+            "data",
+            "X",
+            "raw_score",
+            "start_iteration",
+            "num_iteration",
+            "pred_leaf",
+            "pred_contrib",
+            *kwargs.keys()
+        ):
+            predict_params.pop(alias, None)
+        predict_params.update(kwargs)
+
+        # number of threads can have values with special meaning which is only applied
+        # in the scikit-learn interface, these should not reach the c++ side as-is
+        predict_params = _choose_param_value("num_threads", predict_params, self.n_jobs)
+        predict_params["num_threads"] = self._process_n_jobs(predict_params["num_threads"])
+
+        return self._Booster.predict(  # type: ignore[union-attr]
+            X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration,
+            pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features,
+            **predict_params
+        )
+
+    predict.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted value for each sample.",
+        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+        output_name="predicted_result",
+        predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
+        X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
+    )
+
+    @property
+    def n_features_(self) -> int:
+        """:obj:`int`: The number of features of fitted model."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.')
+        return self._n_features
+
+    @property
+    def n_features_in_(self) -> int:
+        """:obj:`int`: The number of features of fitted model."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.')
+        return self._n_features_in
+
+    @property
+    def best_score_(self) -> _LGBM_BoosterBestScoreType:
+        """:obj:`dict`: The best score of fitted model."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError('No best_score found. Need to call fit beforehand.')
+        return self._best_score
+
+    @property
+    def best_iteration_(self) -> int:
+        """:obj:`int`: The best iteration of fitted model if ``early_stopping()`` callback has been specified."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError('No best_iteration found. Need to call fit with early_stopping callback beforehand.')
+        return self._best_iteration
+
+    @property
+    def objective_(self) -> Union[str, _LGBM_ScikitCustomObjectiveFunction]:
+        """:obj:`str` or :obj:`callable`: The concrete objective used while fitting this model."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError('No objective found. Need to call fit beforehand.')
Need to call fit beforehand.') + return self._objective # type: ignore[return-value] + + @property + def n_estimators_(self) -> int: + """:obj:`int`: True number of boosting iterations performed. + + This might be less than parameter ``n_estimators`` if early stopping was enabled or + if boosting stopped early due to limits on complexity like ``min_gain_to_split``. + + .. versionadded:: 4.0.0 + """ + if not self.__sklearn_is_fitted__(): + raise LGBMNotFittedError('No n_estimators found. Need to call fit beforehand.') + return self._Booster.current_iteration() # type: ignore + + @property + def n_iter_(self) -> int: + """:obj:`int`: True number of boosting iterations performed. + + This might be less than parameter ``n_estimators`` if early stopping was enabled or + if boosting stopped early due to limits on complexity like ``min_gain_to_split``. + + .. versionadded:: 4.0.0 + """ + if not self.__sklearn_is_fitted__(): + raise LGBMNotFittedError('No n_iter found. Need to call fit beforehand.') + return self._Booster.current_iteration() # type: ignore + + @property + def booster_(self) -> Booster: + """Booster: The underlying Booster of this model.""" + if not self.__sklearn_is_fitted__(): + raise LGBMNotFittedError('No booster found. Need to call fit beforehand.') + return self._Booster # type: ignore[return-value] + + @property + def evals_result_(self) -> _EvalResultDict: + """:obj:`dict`: The evaluation results if validation sets have been specified.""" + if not self.__sklearn_is_fitted__(): + raise LGBMNotFittedError('No results found. Need to call fit with eval_set beforehand.') + return self._evals_result + + @property + def feature_importances_(self) -> np.ndarray: + """:obj:`array` of shape = [n_features]: The feature importances (the higher, the more important). + + .. note:: + + ``importance_type`` attribute is passed to the function + to configure the type of importance values to be extracted. + """ + if not self.__sklearn_is_fitted__(): + raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.') + return self._Booster.feature_importance(importance_type=self.importance_type) # type: ignore[union-attr] + + @property + def feature_name_(self) -> List[str]: + """:obj:`list` of shape = [n_features]: The names of features.""" + if not self.__sklearn_is_fitted__(): + raise LGBMNotFittedError('No feature_name found. 
Need to call fit beforehand.') + return self._Booster.feature_name() # type: ignore[union-attr] + + +class LGBMRegressor(_LGBMRegressorBase, LGBMModel): + """LightGBM regressor.""" + + def fit( # type: ignore[override] + self, + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + sample_weight: Optional[_LGBM_WeightType] = None, + init_score: Optional[_LGBM_InitScoreType] = None, + eval_set: Optional[List[_LGBM_ScikitValidSet]] = None, + eval_names: Optional[List[str]] = None, + eval_sample_weight: Optional[List[_LGBM_WeightType]] = None, + eval_init_score: Optional[List[_LGBM_InitScoreType]] = None, + eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, + feature_name: _LGBM_FeatureNameConfiguration = 'auto', + categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + callbacks: Optional[List[Callable]] = None, + init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None + ) -> "LGBMRegressor": + """Docstring is inherited from the LGBMModel.""" + super().fit( + X, + y, + sample_weight=sample_weight, + init_score=init_score, + eval_set=eval_set, + eval_names=eval_names, + eval_sample_weight=eval_sample_weight, + eval_init_score=eval_init_score, + eval_metric=eval_metric, + feature_name=feature_name, + categorical_feature=categorical_feature, + callbacks=callbacks, + init_model=init_model + ) + return self + + _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRegressor") # type: ignore + _base_doc = (_base_doc[:_base_doc.find('group :')] # type: ignore + + _base_doc[_base_doc.find('eval_set :'):]) # type: ignore + _base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')] + + _base_doc[_base_doc.find('eval_init_score :'):]) + fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')] + + _base_doc[_base_doc.find('eval_metric :'):]) + + +class LGBMClassifier(_LGBMClassifierBase, LGBMModel): + """LightGBM classifier.""" + + def fit( # type: ignore[override] + self, + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + sample_weight: Optional[_LGBM_WeightType] = None, + init_score: Optional[_LGBM_InitScoreType] = None, + eval_set: Optional[List[_LGBM_ScikitValidSet]] = None, + eval_names: Optional[List[str]] = None, + eval_sample_weight: Optional[List[_LGBM_WeightType]] = None, + eval_class_weight: Optional[List[float]] = None, + eval_init_score: Optional[List[_LGBM_InitScoreType]] = None, + eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, + feature_name: _LGBM_FeatureNameConfiguration = 'auto', + categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + callbacks: Optional[List[Callable]] = None, + init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None + ) -> "LGBMClassifier": + """Docstring is inherited from the LGBMModel.""" + _LGBMAssertAllFinite(y) + _LGBMCheckClassificationTargets(y) + self._le = _LGBMLabelEncoder().fit(y) + _y = self._le.transform(y) + self._class_map = dict(zip(self._le.classes_, self._le.transform(self._le.classes_))) + if isinstance(self.class_weight, dict): + self._class_weight = {self._class_map[k]: v for k, v in self.class_weight.items()} + + self._classes = self._le.classes_ + self._n_classes = len(self._classes) # type: ignore[arg-type] + if self.objective is None: + self._objective = None + + # adjust eval metrics to match whether binary or multiclass + # classification is being performed + if not callable(eval_metric): + if isinstance(eval_metric, list): + eval_metric_list = eval_metric + elif isinstance(eval_metric, str): + eval_metric_list = [eval_metric] + else: + 
eval_metric_list = []
+            if self._n_classes > 2:
+                for index, metric in enumerate(eval_metric_list):
+                    if metric in {'logloss', 'binary_logloss'}:
+                        eval_metric_list[index] = "multi_logloss"
+                    elif metric in {'error', 'binary_error'}:
+                        eval_metric_list[index] = "multi_error"
+            else:
+                for index, metric in enumerate(eval_metric_list):
+                    if metric in {'logloss', 'multi_logloss'}:
+                        eval_metric_list[index] = 'binary_logloss'
+                    elif metric in {'error', 'multi_error'}:
+                        eval_metric_list[index] = 'binary_error'
+            eval_metric = eval_metric_list
+
+        # do not modify args, as it causes errors in model selection tools
+        valid_sets: Optional[List[_LGBM_ScikitValidSet]] = None
+        if eval_set is not None:
+            if isinstance(eval_set, tuple):
+                eval_set = [eval_set]
+            valid_sets = []
+            for valid_x, valid_y in eval_set:
+                if valid_x is X and valid_y is y:
+                    valid_sets.append((valid_x, _y))
+                else:
+                    valid_sets.append((valid_x, self._le.transform(valid_y)))
+
+        super().fit(
+            X,
+            _y,
+            sample_weight=sample_weight,
+            init_score=init_score,
+            eval_set=valid_sets,
+            eval_names=eval_names,
+            eval_sample_weight=eval_sample_weight,
+            eval_class_weight=eval_class_weight,
+            eval_init_score=eval_init_score,
+            eval_metric=eval_metric,
+            feature_name=feature_name,
+            categorical_feature=categorical_feature,
+            callbacks=callbacks,
+            init_model=init_model
+        )
+        return self
+
+    _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMClassifier")  # type: ignore
+    _base_doc = (_base_doc[:_base_doc.find('group :')]  # type: ignore
+                 + _base_doc[_base_doc.find('eval_set :'):])  # type: ignore
+    fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')]
+                   + _base_doc[_base_doc.find('eval_metric :'):])
+
+    def predict(
+        self,
+        X: _LGBM_ScikitMatrixLike,
+        raw_score: bool = False,
+        start_iteration: int = 0,
+        num_iteration: Optional[int] = None,
+        pred_leaf: bool = False,
+        pred_contrib: bool = False,
+        validate_features: bool = False,
+        **kwargs: Any
+    ):
+        """Docstring is inherited from the LGBMModel."""
+        result = self.predict_proba(
+            X=X,
+            raw_score=raw_score,
+            start_iteration=start_iteration,
+            num_iteration=num_iteration,
+            pred_leaf=pred_leaf,
+            pred_contrib=pred_contrib,
+            validate_features=validate_features,
+            **kwargs
+        )
+        if callable(self._objective) or raw_score or pred_leaf or pred_contrib:
+            return result
+        else:
+            class_index = np.argmax(result, axis=1)
+            return self._le.inverse_transform(class_index)
+
+    predict.__doc__ = LGBMModel.predict.__doc__
+
+    def predict_proba(
+        self,
+        X: _LGBM_ScikitMatrixLike,
+        raw_score: bool = False,
+        start_iteration: int = 0,
+        num_iteration: Optional[int] = None,
+        pred_leaf: bool = False,
+        pred_contrib: bool = False,
+        validate_features: bool = False,
+        **kwargs: Any
+    ):
+        """Docstring is set after definition, using a template."""
+        result = super().predict(
+            X=X,
+            raw_score=raw_score,
+            start_iteration=start_iteration,
+            num_iteration=num_iteration,
+            pred_leaf=pred_leaf,
+            pred_contrib=pred_contrib,
+            validate_features=validate_features,
+            **kwargs
+        )
+        if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib):
+            _log_warning("Cannot compute class probabilities or labels "
+                         "due to the usage of a customized objective function.\n"
+                         "Returning raw scores instead.")
+            return result
+        elif self._n_classes > 2 or raw_score or pred_leaf or pred_contrib:  # type: ignore [operator]
+            return result
+        else:
+            return np.vstack((1. - result, result)).transpose()
+
+    predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
+        description="Return the predicted probability for each class for each sample.",
+        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+        output_name="predicted_probability",
+        predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
+        X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
+        X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
+    )
+
+    @property
+    def classes_(self) -> np.ndarray:
+        """:obj:`array` of shape = [n_classes]: The class label array."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
+        return self._classes  # type: ignore[return-value]
+
+    @property
+    def n_classes_(self) -> int:
+        """:obj:`int`: The number of classes."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
+        return self._n_classes
+
+
+class LGBMRanker(LGBMModel):
+    """LightGBM ranker.
+
+    .. warning::
+
+        scikit-learn doesn't support ranking applications yet,
+        therefore this class is not really compatible with the sklearn ecosystem.
+        Please use this class mainly for training and applying ranking models in the common scikit-learn way.
+    """
+
+    def fit(  # type: ignore[override]
+        self,
+        X: _LGBM_ScikitMatrixLike,
+        y: _LGBM_LabelType,
+        sample_weight: Optional[_LGBM_WeightType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
+        group: Optional[_LGBM_GroupType] = None,
+        eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
+        eval_names: Optional[List[str]] = None,
+        eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+        eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
+        eval_group: Optional[List[_LGBM_GroupType]] = None,
+        eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
+        eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5),
+        feature_name: _LGBM_FeatureNameConfiguration = 'auto',
+        categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
+        callbacks: Optional[List[Callable]] = None,
+        init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
+    ) -> "LGBMRanker":
+        """Docstring is inherited from the LGBMModel."""
+        # check group data
+        if group is None:
+            raise ValueError("Should set group for ranking task")
+
+        if eval_set is not None:
+            if eval_group is None:
+                raise ValueError("Eval_group cannot be None when eval_set is not None")
+            elif len(eval_group) != len(eval_set):
+                raise ValueError("Length of eval_group should be equal to length of eval_set")
+            elif (isinstance(eval_group, dict)
+                  and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))
+                  or isinstance(eval_group, list)
+                  and any(group is None for group in eval_group)):
+                raise ValueError("Should set group for all eval datasets for ranking task; "
+                                 "if you use dict, the index should start from 0")
+
+        self._eval_at = eval_at
+        super().fit(
+            X,
+            y,
+            sample_weight=sample_weight,
+            init_score=init_score,
+            group=group,
+            eval_set=eval_set,
+            eval_names=eval_names,
+            eval_sample_weight=eval_sample_weight,
+            eval_init_score=eval_init_score,
+            eval_group=eval_group,
+            eval_metric=eval_metric,
+            feature_name=feature_name,
+            categorical_feature=categorical_feature,
+            callbacks=callbacks,
+            init_model=init_model
+        )
+        return self
+
+    _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRanker")  # type: ignore
+    fit.__doc__ = (_base_doc[:_base_doc.find('eval_class_weight :')]  # type: ignore
+                   + _base_doc[_base_doc.find('eval_init_score :'):])  # type: ignore
+    _base_doc = fit.__doc__
+    _before_feature_name, _feature_name, _after_feature_name = _base_doc.partition('feature_name :')
+    fit.__doc__ = f"""{_before_feature_name}eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))
+        The evaluation positions of the specified metric.
+    {_feature_name}{_after_feature_name}"""
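
The vendored scikit-learn interface above wraps the low-level train() entry point: LGBMModel.fit turns each eval_set pair into a Dataset, captures per-iteration metrics through the record_evaluation callback, and stores the resulting Booster together with its best_iteration and best_score. A minimal usage sketch against this module; the synthetic data and parameter values are illustrative only, not part of the commit:

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(500, 10))
    y = 2.0 * X[:, 0] + rng.normal(scale=0.1, size=500)
    X_train, X_valid, y_train, y_valid = X[:400], X[400:], y[:400], y[400:]

    reg = lgb.LGBMRegressor(n_estimators=200)
    reg.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],                    # becomes a Dataset internally
        eval_metric="l1",
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )
    print(reg.best_iteration_)                            # filled in from the Booster
    print(reg.evals_result_["valid_0"]["l1"][-1])         # recorded by record_evaluation
    pred = reg.predict(X_valid, num_iteration=reg.best_iteration_)

Note that predict() strips prediction-time aliases from the stored params and resolves num_threads before delegating to Booster.predict, so any extra keyword arguments passed to it reach the C API as prediction parameters.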
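
LGBMClassifier.fit label-encodes y with _LGBMLabelEncoder before delegating to LGBMModel.fit; for a binary task, predict_proba stacks 1 - result against result into an (n_samples, 2) matrix, and predict takes the argmax and inverse-transforms it back to the original labels. A small sketch, again with made-up data:

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(1)
    X = rng.normal(size=(300, 5))
    y = np.where(X[:, 0] > 0, "pos", "neg")   # string labels are encoded internally

    clf = lgb.LGBMClassifier(n_estimators=50)
    clf.fit(X, y)

    proba = clf.predict_proba(X[:4])          # shape (4, 2): one column per class
    labels = clf.predict(X[:4])               # argmax + inverse_transform -> "neg"/"pos"
    print(clf.classes_)                       # ['neg' 'pos'], from the label encoder
    assert np.allclose(proba.sum(axis=1), 1.0)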
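
LGBMRanker.fit refuses to run without group and requires one eval_group entry per eval_set entry; eval_at sets the positions at which the ranking metric is evaluated. A sketch with five synthetic queries of twenty documents each (sizes and values are illustrative):

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(2)
    X = rng.normal(size=(100, 4))
    y = rng.integers(0, 4, size=100)          # integer relevance grades
    group = [20] * 5                          # five queries, 20 documents each

    rnk = lgb.LGBMRanker(n_estimators=30)
    rnk.fit(
        X, y,
        group=group,                          # required, see the ValueError above
        eval_set=[(X, y)],
        eval_group=[group],                   # one group spec per eval set
        eval_at=(1, 3),                       # evaluate the ranking metric at positions 1 and 3
    )
    scores = rnk.predict(X[:20])              # one relevance score per document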
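
As the warning branch in predict_proba shows, a callable objective makes the class-probability transformation unavailable: predict_proba logs a warning and returns raw scores, leaving the link function to the caller. A sketch with a hand-written binary objective; the function name and its gradient/hessian math are illustrative, not part of the commit:

    import numpy as np
    import lightgbm as lgb

    def logistic_obj(y_true, y_pred):
        # gradient and hessian of binary logloss with respect to the raw score
        p = 1.0 / (1.0 + np.exp(-y_pred))
        return p - y_true, p * (1.0 - p)

    rng = np.random.default_rng(3)
    X = rng.normal(size=(200, 5))
    y = (X[:, 0] > 0).astype(int)

    clf = lgb.LGBMClassifier(objective=logistic_obj, n_estimators=20)
    clf.fit(X, y)
    raw = clf.predict_proba(X[:4])        # warning is logged; raw scores, not probabilities
    proba = 1.0 / (1.0 + np.exp(-raw))    # caller applies the sigmoid link explicitly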