Diffstat (limited to 'lib/lineartree/lineartree.py')
-rw-r--r--  lib/lineartree/lineartree.py | 1589
1 file changed, 1589 insertions, 0 deletions
diff --git a/lib/lineartree/lineartree.py b/lib/lineartree/lineartree.py
new file mode 100644
index 0000000..0c8f2d9
--- /dev/null
+++ b/lib/lineartree/lineartree.py
@@ -0,0 +1,1589 @@
+#!/usr/bin/env python3
+# Copyright (c) 2021 Marco Cerliani, MIT License <https://github.com/cerlymarco/linear-tree>
+
+import numpy as np
+
+from sklearn.base import ClassifierMixin, RegressorMixin
+
+from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted, _check_sample_weight
+
+from ._classes import _predict_branch
+from ._classes import _LinearTree, _LinearBoosting, _LinearForest
+
+
+class LinearTreeRegressor(_LinearTree, RegressorMixin):
+    """A Linear Tree Regressor.
+
+    A Linear Tree Regressor is a meta-estimator that combines the learning
+    ability of Decision Trees and the predictive power of Linear Models.
+    As in tree-based algorithms, the received data are split according to
+    simple decision rules. The goodness of splits is evaluated by fitting
+    linear models in each node and measuring the gain. This implies that
+    the models in the leaves are linear instead of constant approximations
+    as in classical Decision Trees.
+
+    Parameters
+    ----------
+    base_estimator : object
+        The base estimator to fit on dataset splits.
+        The base estimator must be a sklearn.linear_model.
+
+    criterion : {"mse", "rmse", "mae", "poisson"}, default="mse"
+        The function to measure the quality of a split. "poisson"
+        requires ``y >= 0``.
+
+    max_depth : int, default=5
+        The maximum depth of the tree considering only the splitting nodes.
+        A higher value implies a higher training time.
+
+    min_samples_split : int or float, default=6
+        The minimum number of samples required to split an internal node.
+        The minimum valid number of samples in each node is 6.
+        A lower value implies a higher training time.
+        - If int, then consider `min_samples_split` as the minimum number.
+        - If float, then `min_samples_split` is a fraction and
+          `ceil(min_samples_split * n_samples)` are the minimum
+          number of samples for each split.
+
+    min_samples_leaf : int or float, default=0.1
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at
+        least `min_samples_leaf` training samples in each of the left and
+        right branches.
+        The minimum valid number of samples in each leaf is 3.
+        A lower value implies a higher training time.
+        - If int, then consider `min_samples_leaf` as the minimum number.
+        - If float, then `min_samples_leaf` is a fraction and
+          `ceil(min_samples_leaf * n_samples)` are the minimum
+          number of samples for each node.
+
+    max_bins : int, default=25
+        The maximum number of bins to use to search the optimal split in each
+        feature. Features with a small number of unique values may use less
+        than ``max_bins`` bins. Must be greater than 10 and lower than 120.
+        A higher value implies a higher training time.
+
+    categorical_features : int or array-like of int, default=None
+        Indicates the categorical features.
+        All categorical indices must be in `[0, n_features)`.
+        Categorical features are used for splits but are not used in
+        model fitting.
+        More categorical features imply a higher training time.
+        - None : no feature will be considered categorical.
+        - integer array-like : integer indices indicating categorical
+          features.
+        - integer : integer index indicating a categorical
+          feature.
+
+    split_features : int or array-like of int, default=None
+        Defines which features can be used to split on.
+        All split feature indices must be in `[0, n_features)`.
+        - None : All features will be used for splitting.
+        - integer array-like : integer indices indicating splitting features.
+        - integer : integer index indicating a single splitting feature.
+
+    linear_features : int or array-like of int, default=None
+        Defines which features are used for the linear model in the leaves.
+        All linear feature indices must be in `[0, n_features)`.
+        - None : All features except those in `categorical_features`
+          will be used in the leaf models.
+        - integer array-like : integer indices indicating features to
+          be used in the leaf models.
+        - integer : integer index indicating a single feature to be used
+          in the leaf models.
+
+    n_jobs : int, default=None
+        The number of jobs to run in parallel for model fitting.
+        ``None`` means 1, i.e. one processor. ``-1`` means using all
+        processors.
+
+    Attributes
+    ----------
+    n_features_in_ : int
+        The number of features when :meth:`fit` is performed.
+
+    feature_importances_ : ndarray of shape (n_features, )
+        Normalized total reduction of criteria by splitting features.
+
+    n_targets_ : int
+        The number of targets when :meth:`fit` is performed.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import LinearRegression
+    >>> from lineartree import LinearTreeRegressor
+    >>> from sklearn.datasets import make_regression
+    >>> X, y = make_regression(n_samples=100, n_features=4,
+    ...                        n_informative=2, n_targets=1,
+    ...                        random_state=0, shuffle=False)
+    >>> regr = LinearTreeRegressor(base_estimator=LinearRegression())
+    >>> regr.fit(X, y)
+    >>> regr.predict([[0, 0, 0, 0]])
+    array([8.8817842e-16])
+    """
+
+    def __init__(
+        self,
+        base_estimator,
+        *,
+        criterion="mse",
+        max_depth=5,
+        min_samples_split=6,
+        min_samples_leaf=0.1,
+        max_bins=25,
+        categorical_features=None,
+        split_features=None,
+        linear_features=None,
+        n_jobs=None
+    ):
+
+        self.base_estimator = base_estimator
+        self.criterion = criterion
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.max_bins = max_bins
+        self.categorical_features = categorical_features
+        self.split_features = split_features
+        self.linear_features = linear_features
+        self.n_jobs = n_jobs
+
+    def fit(self, X, y, sample_weight=None):
+        """Build a Linear Tree of a linear estimator from the training
+        set (X, y).
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples, ) or (n_samples, n_targets)
+            Target values.
+
+        sample_weight : array-like of shape (n_samples, ), default=None
+            Sample weights. If None, then samples are equally weighted.
+            Note that if the base estimator does not support sample weighting,
+            the sample weights are still used to evaluate the splits.
+
+        Returns
+        -------
+        self : object
+        """
+        reg_criterions = ("mse", "rmse", "mae", "poisson")
+
+        if self.criterion not in reg_criterions:
+            raise ValueError(
+                "Regression tasks support only criterion in {}, "
+                "got '{}'.".format(reg_criterions, self.criterion)
+            )
+
+        # Convert data (X is required to be 2d and indexable)
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse=False,
+            dtype=None,
+            force_all_finite=False,
+            multi_output=True,
+        )
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
+
+        y_shape = np.shape(y)
+        self.n_targets_ = y_shape[1] if len(y_shape) > 1 else 1
+        if self.n_targets_ < 2:
+            y = y.ravel()
+        self._fit(X, y, sample_weight)
+
+        return self
+
+    def predict(self, X):
+        """Predict regression target for X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, ) or (n_samples, n_targets) for
+            multitarget regression.
+            The predicted values.
+        """
+        check_is_fitted(self, attributes="_nodes")
+
+        X = check_array(X, accept_sparse=False, dtype=None, force_all_finite=False)
+        self._check_n_features(X, reset=False)
+
+        if self.n_targets_ > 1:
+            pred = np.zeros((X.shape[0], self.n_targets_))
+        else:
+            pred = np.zeros(X.shape[0])
+
+        for L in self._leaves.values():
+
+            mask = _predict_branch(X, L.threshold)
+            if (~mask).all():
+                continue
+
+            pred[mask] = L.model.predict(X[np.ix_(mask, self._linear_features)])
+
+        return pred
+
+
+class LinearTreeClassifier(_LinearTree, ClassifierMixin):
+    """A Linear Tree Classifier.
+
+    A Linear Tree Classifier is a meta-estimator that combines the learning
+    ability of Decision Trees and the predictive power of Linear Models.
+    As in tree-based algorithms, the received data are split according to
+    simple decision rules. The goodness of splits is evaluated by fitting
+    linear models in each node and measuring the gain. This implies that
+    the models in the leaves are linear instead of constant approximations
+    as in classical Decision Trees.
+
+    Parameters
+    ----------
+    base_estimator : object
+        The base estimator to fit on dataset splits.
+        The base estimator must be a sklearn.linear_model.
+        The selected base estimator is automatically substituted by a
+        `~sklearn.dummy.DummyClassifier` when a dataset split
+        is composed of unique labels.
+
+    criterion : {"hamming", "crossentropy"}, default="hamming"
+        The function to measure the quality of a split. `"crossentropy"`
+        can be used only if `base_estimator` has a `predict_proba` method.
+
+    max_depth : int, default=5
+        The maximum depth of the tree considering only the splitting nodes.
+        A higher value implies a higher training time.
+
+    min_samples_split : int or float, default=6
+        The minimum number of samples required to split an internal node.
+        The minimum valid number of samples in each node is 6.
+        A lower value implies a higher training time.
+        - If int, then consider `min_samples_split` as the minimum number.
+        - If float, then `min_samples_split` is a fraction and
+          `ceil(min_samples_split * n_samples)` are the minimum
+          number of samples for each split.
+
+    min_samples_leaf : int or float, default=0.1
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at
+        least `min_samples_leaf` training samples in each of the left and
+        right branches.
+        The minimum valid number of samples in each leaf is 3.
+        A lower value implies a higher training time.
+        - If int, then consider `min_samples_leaf` as the minimum number.
+        - If float, then `min_samples_leaf` is a fraction and
+          `ceil(min_samples_leaf * n_samples)` are the minimum
+          number of samples for each node.
+
+    max_bins : int, default=25
+        The maximum number of bins to use to search the optimal split in each
+        feature. Features with a small number of unique values may use less
+        than ``max_bins`` bins. Must be greater than 10 and lower than 120.
+        A higher value implies a higher training time.
+
+    categorical_features : int or array-like of int, default=None
+        Indicates the categorical features.
+        All categorical indices must be in `[0, n_features)`.
+        Categorical features are used for splits but are not used in
+        model fitting.
+        More categorical features imply a higher training time.
+        - None : no feature will be considered categorical.
+        - integer array-like : integer indices indicating categorical
+          features.
+        - integer : integer index indicating a categorical
+          feature.
+
+    split_features : int or array-like of int, default=None
+        Defines which features can be used to split on.
+        All split feature indices must be in `[0, n_features)`.
+        - None : All features will be used for splitting.
+        - integer array-like : integer indices indicating splitting features.
+        - integer : integer index indicating a single splitting feature.
+
+    linear_features : int or array-like of int, default=None
+        Defines which features are used for the linear model in the leaves.
+        All linear feature indices must be in `[0, n_features)`.
+        - None : All features except those in `categorical_features`
+          will be used in the leaf models.
+        - integer array-like : integer indices indicating features to
+          be used in the leaf models.
+        - integer : integer index indicating a single feature to be used
+          in the leaf models.
+
+    n_jobs : int, default=None
+        The number of jobs to run in parallel for model fitting.
+        ``None`` means 1, i.e. one processor. ``-1`` means using all
+        processors.
+
+    Attributes
+    ----------
+    n_features_in_ : int
+        The number of features when :meth:`fit` is performed.
+
+    feature_importances_ : ndarray of shape (n_features, )
+        Normalized total reduction of criteria by splitting features.
+
+    classes_ : ndarray of shape (n_classes, )
+        A list of class labels known to the classifier.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import RidgeClassifier
+    >>> from lineartree import LinearTreeClassifier
+    >>> from sklearn.datasets import make_classification
+    >>> X, y = make_classification(n_samples=100, n_features=4,
+    ...                            n_informative=2, n_redundant=0,
+    ...                            random_state=0, shuffle=False)
+    >>> clf = LinearTreeClassifier(base_estimator=RidgeClassifier())
+    >>> clf.fit(X, y)
+    >>> clf.predict([[0, 0, 0, 0]])
+    array([1])
+    """
+
+    def __init__(
+        self,
+        base_estimator,
+        *,
+        criterion="hamming",
+        max_depth=5,
+        min_samples_split=6,
+        min_samples_leaf=0.1,
+        max_bins=25,
+        categorical_features=None,
+        split_features=None,
+        linear_features=None,
+        n_jobs=None
+    ):
+
+        self.base_estimator = base_estimator
+        self.criterion = criterion
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.max_bins = max_bins
+        self.categorical_features = categorical_features
+        self.split_features = split_features
+        self.linear_features = linear_features
+        self.n_jobs = n_jobs
+
+    def fit(self, X, y, sample_weight=None):
+        """Build a Linear Tree of a linear estimator from the training
+        set (X, y).
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples, )
+            Target values.
+
+        sample_weight : array-like of shape (n_samples, ), default=None
+            Sample weights. If None, then samples are equally weighted.
+            Note that if the base estimator does not support sample weighting,
+            the sample weights are still used to evaluate the splits.
+
+        Returns
+        -------
+        self : object
+        """
+        clas_criterions = ("hamming", "crossentropy")
+
+        if self.criterion not in clas_criterions:
+            raise ValueError(
+                "Classification tasks support only criterion in {}, "
+                "got '{}'.".format(clas_criterions, self.criterion)
+            )
+
+        if (
+            not hasattr(self.base_estimator, "predict_proba")
+            and self.criterion == "crossentropy"
+        ):
+            raise ValueError(
+                "The 'crossentropy' criterion requires a base_estimator "
+                "with predict_proba method."
+            )
+
+        # Convert data (X is required to be 2d and indexable)
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse=False,
+            dtype=None,
+            force_all_finite=False,
+            multi_output=False,
+        )
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
+
+        self.classes_ = np.unique(y)
+        self._fit(X, y, sample_weight)
+
+        return self
+
+    def predict(self, X):
+        """Predict class for X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, )
+            The predicted classes.
+        """
+        check_is_fitted(self, attributes="_nodes")
+
+        X = check_array(X, accept_sparse=False, dtype=None, force_all_finite=False)
+        self._check_n_features(X, reset=False)
+
+        pred = np.empty(X.shape[0], dtype=self.classes_.dtype)
+
+        for L in self._leaves.values():
+
+            mask = _predict_branch(X, L.threshold)
+            if (~mask).all():
+                continue
+
+            pred[mask] = L.model.predict(X[np.ix_(mask, self._linear_features)])
+
+        return pred
+
+    def predict_proba(self, X):
+        """Predict class probabilities for X.
+
+        If base estimators do not implement a ``predict_proba`` method,
+        then the one-hot encoding of the predicted class is returned.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, n_classes)
+            The class probabilities of the input samples. The order of the
+            classes corresponds to that in the attribute :term:`classes_`.
+        """
+        check_is_fitted(self, attributes="_nodes")
+
+        X = check_array(X, accept_sparse=False, dtype=None, force_all_finite=False)
+        self._check_n_features(X, reset=False)
+
+        pred = np.zeros((X.shape[0], len(self.classes_)))
+
+        if hasattr(self.base_estimator, "predict_proba"):
+            for L in self._leaves.values():
+
+                mask = _predict_branch(X, L.threshold)
+                if (~mask).all():
+                    continue
+
+                pred[
+                    np.ix_(mask, np.isin(self.classes_, L.classes))
+                ] = L.model.predict_proba(X[np.ix_(mask, self._linear_features)])
+
+        else:
+            pred_class = self.predict(X)
+            class_to_int = dict(map(reversed, enumerate(self.classes_)))
+            pred_class = np.array([class_to_int[i] for i in pred_class])
+            pred[np.arange(X.shape[0]), pred_class] = 1
+
+        return pred
+
+    def predict_log_proba(self, X):
+        """Predict class log-probabilities for X.
+
+        If base estimators do not implement a ``predict_log_proba`` method,
+        then the logarithm of the one-hot encoded predicted class is returned.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, n_classes)
+            The class log-probabilities of the input samples. The order of the
+            classes corresponds to that in the attribute :term:`classes_`.
+        """
+        return np.log(self.predict_proba(X))
+
+
+class LinearBoostRegressor(_LinearBoosting, RegressorMixin):
+    """A Linear Boosting Regressor.
+
+    A Linear Boosting Regressor is an iterative meta-estimator that starts
+    with a linear regressor and models the residuals through decision trees.
+    At each iteration, the path leading to the highest error (i.e. the worst
+    leaf) is added as a new binary variable to the base model. This kind of
+    Linear Boosting can be considered an improvement over general linear
+    models, since it enables incorporating non-linear features through
+    residual modeling.
+
+    Parameters
+    ----------
+    base_estimator : object
+        The base estimator iteratively fitted.
+        The base estimator must be a sklearn.linear_model.
+
+    loss : {"linear", "square", "absolute", "exponential"}, default="linear"
+        The function used to calculate the residuals of each sample.
+
+    n_estimators : int, default=10
+        The number of boosting stages to perform. It corresponds to the number
+        of new features generated.
+
+    max_depth : int, default=3
+        The maximum depth of the tree. If None, then nodes are expanded until
+        all leaves are pure or until all leaves contain less than
+        min_samples_split samples.
+
+    min_samples_split : int or float, default=2
+        The minimum number of samples required to split an internal node:
+
+        - If int, then consider `min_samples_split` as the minimum number.
+        - If float, then `min_samples_split` is a fraction and
+          `ceil(min_samples_split * n_samples)` are the minimum
+          number of samples for each split.
+
+    min_samples_leaf : int or float, default=1
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at
+        least ``min_samples_leaf`` training samples in each of the left and
+        right branches. This may have the effect of smoothing the model,
+        especially in regression.
+
+        - If int, then consider `min_samples_leaf` as the minimum number.
+        - If float, then `min_samples_leaf` is a fraction and
+          `ceil(min_samples_leaf * n_samples)` are the minimum
+          number of samples for each node.
+
+    min_weight_fraction_leaf : float, default=0.0
+        The minimum weighted fraction of the sum total of weights (of all
+        the input samples) required to be at a leaf node. Samples have
+        equal weight when sample_weight is not provided.
+
+    max_features : int, float or {"auto", "sqrt", "log2"}, default=None
+        The number of features to consider when looking for the best split:
+
+        - If int, then consider `max_features` features at each split.
+        - If float, then `max_features` is a fraction and
+          `int(max_features * n_features)` features are considered at each
+          split.
+        - If "auto", then `max_features=n_features`.
+        - If "sqrt", then `max_features=sqrt(n_features)`.
+        - If "log2", then `max_features=log2(n_features)`.
+        - If None, then `max_features=n_features`.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires to
+        effectively inspect more than ``max_features`` features.
+
+    random_state : int, RandomState instance or None, default=None
+        Controls the randomness of the estimator.
+
+    max_leaf_nodes : int, default=None
+        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+
+    min_impurity_decrease : float, default=0.0
+        A node will be split if this split induces a decrease of the impurity
+        greater than or equal to this value.
+
+    ccp_alpha : non-negative float, default=0.0
+        Complexity parameter used for Minimal Cost-Complexity Pruning. The
+        subtree with the largest cost complexity that is smaller than
+        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
+        :ref:`minimal_cost_complexity_pruning` for details.
+
+    Attributes
+    ----------
+    n_features_in_ : int
+        The number of features when :meth:`fit` is performed.
+
+    n_features_out_ : int
+        The total number of features used to fit the base estimator in the
+        last iteration. The number of output features is equal to the sum
+        of n_features_in_ and n_estimators.
+
+    coef_ : array of shape (n_features_out_, ) or (n_targets, n_features_out_)
+        Estimated coefficients for the linear regression problem.
+        If multiple targets are passed during the fit (y 2D), this is a
+        2D array of shape (n_targets, n_features_out_), while if only one
+        target is passed, this is a 1D array of length n_features_out_.
+
+    intercept_ : float or array of shape (n_targets, )
+        Independent term in the linear model. Set to 0 if
+        `fit_intercept = False` in `base_estimator`.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import LinearRegression
+    >>> from lineartree import LinearBoostRegressor
+    >>> from sklearn.datasets import make_regression
+    >>> X, y = make_regression(n_samples=100, n_features=4,
+    ...                        n_informative=2, n_targets=1,
+    ...                        random_state=0, shuffle=False)
+    >>> regr = LinearBoostRegressor(base_estimator=LinearRegression())
+    >>> regr.fit(X, y)
+    >>> regr.predict([[0, 0, 0, 0]])
+    array([8.8817842e-16])
+
+    References
+    ----------
+    Explainable boosted linear regression for time series forecasting.
+    Authors: Igor Ilic, Berk Gorgulu, Mucahit Cevik, Mustafa Gokce Baydogan.
+    (https://arxiv.org/abs/2009.09110)
+    """
+
+    def __init__(
+        self,
+        base_estimator,
+        *,
+        loss="linear",
+        n_estimators=10,
+        max_depth=3,
+        min_samples_split=2,
+        min_samples_leaf=1,
+        min_weight_fraction_leaf=0.0,
+        max_features=None,
+        random_state=None,
+        max_leaf_nodes=None,
+        min_impurity_decrease=0.0,
+        ccp_alpha=0.0
+    ):
+
+        self.base_estimator = base_estimator
+        self.loss = loss
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.random_state = random_state
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.ccp_alpha = ccp_alpha
+
+    def fit(self, X, y, sample_weight=None):
+        """Build a Linear Boosting from the training set (X, y).
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples, ) or (n_samples, n_targets)
+            Target values.
+
+        sample_weight : array-like of shape (n_samples, ), default=None
+            Sample weights.
+
+        Returns
+        -------
+        self : object
+        """
+        reg_losses = ("linear", "square", "absolute", "exponential")
+
+        if self.loss not in reg_losses:
+            raise ValueError(
+                "Regression tasks support only loss in {}, "
+                "got '{}'.".format(reg_losses, self.loss)
+            )
+
+        # Convert data (X is required to be 2d and indexable)
+        X, y = self._validate_data(
+            X, y, accept_sparse=False, dtype=np.float32, multi_output=True
+        )
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
+
+        y_shape = np.shape(y)
+        n_targets = y_shape[1] if len(y_shape) > 1 else 1
+        if n_targets < 2:
+            y = y.ravel()
+        self._fit(X, y, sample_weight)
+
+        return self
+
+    def predict(self, X):
+        """Predict regression target for X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, ) or (n_samples, n_targets) for
+            multitarget regression.
+            The predicted values.
+        """
+        check_is_fitted(self, attributes="base_estimator_")
+
+        return self.base_estimator_.predict(self.transform(X))
+
+
+class LinearBoostClassifier(_LinearBoosting, ClassifierMixin):
+    """A Linear Boosting Classifier.
+
+    A Linear Boosting Classifier is an iterative meta-estimator that starts
+    with a linear classifier and models the residuals through decision trees.
+    At each iteration, the path leading to the highest error (i.e. the worst
+    leaf) is added as a new binary variable to the base model. This kind of
+    Linear Boosting can be considered an improvement over general linear
+    models, since it enables incorporating non-linear features through
+    residual modeling.
+
+    Parameters
+    ----------
+    base_estimator : object
+        The base estimator iteratively fitted.
+        The base estimator must be a sklearn.linear_model.
+
+    loss : {"hamming", "entropy"}, default="hamming"
+        The function used to calculate the residuals of each sample.
+        `"entropy"` can be used only if `base_estimator` has a
+        `predict_proba` method.
+
+    n_estimators : int, default=10
+        The number of boosting stages to perform. It corresponds to the number
+        of new features generated.
+
+    max_depth : int, default=3
+        The maximum depth of the tree. If None, then nodes are expanded until
+        all leaves are pure or until all leaves contain less than
+        min_samples_split samples.
+
+    min_samples_split : int or float, default=2
+        The minimum number of samples required to split an internal node:
+
+        - If int, then consider `min_samples_split` as the minimum number.
+        - If float, then `min_samples_split` is a fraction and
+          `ceil(min_samples_split * n_samples)` are the minimum
+          number of samples for each split.
+
+    min_samples_leaf : int or float, default=1
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at
+        least ``min_samples_leaf`` training samples in each of the left and
+        right branches. This may have the effect of smoothing the model,
+        especially in regression.
+
+        - If int, then consider `min_samples_leaf` as the minimum number.
+        - If float, then `min_samples_leaf` is a fraction and
+          `ceil(min_samples_leaf * n_samples)` are the minimum
+          number of samples for each node.
+
+    min_weight_fraction_leaf : float, default=0.0
+        The minimum weighted fraction of the sum total of weights (of all
+        the input samples) required to be at a leaf node. Samples have
+        equal weight when sample_weight is not provided.
+
+    max_features : int, float or {"auto", "sqrt", "log2"}, default=None
+        The number of features to consider when looking for the best split:
+
+        - If int, then consider `max_features` features at each split.
+        - If float, then `max_features` is a fraction and
+          `int(max_features * n_features)` features are considered at each
+          split.
+        - If "auto", then `max_features=n_features`.
+        - If "sqrt", then `max_features=sqrt(n_features)`.
+        - If "log2", then `max_features=log2(n_features)`.
+        - If None, then `max_features=n_features`.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires to
+        effectively inspect more than ``max_features`` features.
+
+    random_state : int, RandomState instance or None, default=None
+        Controls the randomness of the estimator.
+
+    max_leaf_nodes : int, default=None
+        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+
+    min_impurity_decrease : float, default=0.0
+        A node will be split if this split induces a decrease of the impurity
+        greater than or equal to this value.
+
+    ccp_alpha : non-negative float, default=0.0
+        Complexity parameter used for Minimal Cost-Complexity Pruning. The
+        subtree with the largest cost complexity that is smaller than
+        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
+        :ref:`minimal_cost_complexity_pruning` for details.
+
+    Attributes
+    ----------
+    n_features_in_ : int
+        The number of features when :meth:`fit` is performed.
+
+    n_features_out_ : int
+        The total number of features used to fit the base estimator in the
+        last iteration. The number of output features is equal to the sum
+        of n_features_in_ and n_estimators.
+
+    coef_ : ndarray of shape (1, n_features_out_) or (n_classes, n_features_out_)
+        Coefficient of the features in the decision function.
+
+    intercept_ : float or array of shape (n_classes, )
+        Independent term in the linear model. Set to 0 if
+        `fit_intercept = False` in `base_estimator`.
+
+    classes_ : ndarray of shape (n_classes, )
+        A list of class labels known to the classifier.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import RidgeClassifier
+    >>> from lineartree import LinearBoostClassifier
+    >>> from sklearn.datasets import make_classification
+    >>> X, y = make_classification(n_samples=100, n_features=4,
+    ...                            n_informative=2, n_redundant=0,
+    ...                            random_state=0, shuffle=False)
+    >>> clf = LinearBoostClassifier(base_estimator=RidgeClassifier())
+    >>> clf.fit(X, y)
+    >>> clf.predict([[0, 0, 0, 0]])
+    array([1])
+
+    References
+    ----------
+    Explainable boosted linear regression for time series forecasting.
+    Authors: Igor Ilic, Berk Gorgulu, Mucahit Cevik, Mustafa Gokce Baydogan.
+    (https://arxiv.org/abs/2009.09110)
+    """
+
+    def __init__(
+        self,
+        base_estimator,
+        *,
+        loss="hamming",
+        n_estimators=10,
+        max_depth=3,
+        min_samples_split=2,
+        min_samples_leaf=1,
+        min_weight_fraction_leaf=0.0,
+        max_features=None,
+        random_state=None,
+        max_leaf_nodes=None,
+        min_impurity_decrease=0.0,
+        ccp_alpha=0.0
+    ):
+
+        self.base_estimator = base_estimator
+        self.loss = loss
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.random_state = random_state
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.ccp_alpha = ccp_alpha
+
+    def fit(self, X, y, sample_weight=None):
+        """Build a Linear Boosting from the training set (X, y).
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples, )
+            Target values.
+
+        sample_weight : array-like of shape (n_samples, ), default=None
+            Sample weights.
+
+        Returns
+        -------
+        self : object
+        """
+        clas_losses = ("hamming", "entropy")
+
+        if self.loss not in clas_losses:
+            raise ValueError(
+                "Classification tasks support only loss in {}, "
+                "got '{}'.".format(clas_losses, self.loss)
+            )
+
+        if not hasattr(self.base_estimator, "predict_proba") and self.loss == "entropy":
+            raise ValueError(
+                "The 'entropy' loss requires a base_estimator "
+                "with predict_proba method."
+            )
+
+        # Convert data (X is required to be 2d and indexable)
+        X, y = self._validate_data(
+            X, y, accept_sparse=False, dtype=np.float32, multi_output=False
+        )
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
+
+        self.classes_ = np.unique(y)
+        self._fit(X, y, sample_weight)
+
+        return self
+
+    def predict(self, X):
+        """Predict class for X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, )
+            The predicted classes.
+        """
+        check_is_fitted(self, attributes="base_estimator_")
+
+        return self.base_estimator_.predict(self.transform(X))
+
+    def predict_proba(self, X):
+        """Predict class probabilities for X.
+
+        If base estimators do not implement a ``predict_proba`` method,
+        then the one-hot encoding of the predicted class is returned.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, n_classes)
+            The class probabilities of the input samples. The order of the
+            classes corresponds to that in the attribute :term:`classes_`.
+        """
+        if hasattr(self.base_estimator, "predict_proba"):
+            check_is_fitted(self, attributes="base_estimator_")
+            pred = self.base_estimator_.predict_proba(self.transform(X))
+
+        else:
+            pred_class = self.predict(X)
+            pred = np.zeros((pred_class.shape[0], len(self.classes_)))
+            class_to_int = dict(map(reversed, enumerate(self.classes_)))
+            pred_class = np.array([class_to_int[v] for v in pred_class])
+            pred[np.arange(pred_class.shape[0]), pred_class] = 1
+
+        return pred
+
+    def predict_log_proba(self, X):
+        """Predict class log-probabilities for X.
+
+        If base estimators do not implement a ``predict_log_proba`` method,
+        then the logarithm of the one-hot encoded predicted class is returned.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, n_classes)
+            The class log-probabilities of the input samples. The order of the
+            classes corresponds to that in the attribute :term:`classes_`.
+        """
+        return np.log(self.predict_proba(X))
+
+
+class LinearForestClassifier(_LinearForest, ClassifierMixin):
+    """A Linear Forest Classifier.
+
+    Linear forests generalize the well-known random forests by combining
+    linear models with random forests.
+    The key idea of linear forests is to use the strength of linear models
+    to improve the nonparametric learning ability of tree-based algorithms.
+    First, a linear model is fitted on the whole dataset; then a random
+    forest is trained on the same dataset, using the residuals of the
+    previous step as target. The final predictions are the sum of the raw
+    linear predictions and the residuals modeled by the random forest.
+
+    For classification tasks the same approach used in the regression
+    context is adopted. The binary targets are transformed into logits
+    using the inverse sigmoid function. A linear regression is fitted.
+    A random forest regressor is trained to approximate the residuals
+    between the logits and the linear predictions. Finally, the sigmoid
+    of the combined predictions is taken to obtain probabilities.
+    The multi-label scenario is carried out using OneVsRestClassifier.
+
+    Parameters
+    ----------
+    base_estimator : object
+        The linear estimator fitted on the raw target.
+        The linear estimator must be a regressor from sklearn.linear_model.
+
+    n_estimators : int, default=100
+        The number of trees in the forest.
+
+    max_depth : int, default=None
+        The maximum depth of the tree. If None, then nodes are expanded until
+        all leaves are pure or until all leaves contain less than
+        min_samples_split samples.
+
+    min_samples_split : int or float, default=2
+        The minimum number of samples required to split an internal node:
+
+        - If int, then consider `min_samples_split` as the minimum number.
+        - If float, then `min_samples_split` is a fraction and
+          `ceil(min_samples_split * n_samples)` are the minimum
+          number of samples for each split.
+
+    min_samples_leaf : int or float, default=1
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at
+        least ``min_samples_leaf`` training samples in each of the left and
+        right branches. This may have the effect of smoothing the model,
+        especially in regression.
+
+        - If int, then consider `min_samples_leaf` as the minimum number.
+        - If float, then `min_samples_leaf` is a fraction and
+          `ceil(min_samples_leaf * n_samples)` are the minimum
+          number of samples for each node.
+
+    min_weight_fraction_leaf : float, default=0.0
+        The minimum weighted fraction of the sum total of weights (of all
+        the input samples) required to be at a leaf node. Samples have
+        equal weight when sample_weight is not provided.
+
+    max_features : {"auto", "sqrt", "log2"}, int or float, default="auto"
+        The number of features to consider when looking for the best split:
+
+        - If int, then consider `max_features` features at each split.
+        - If float, then `max_features` is a fraction and
+          `round(max_features * n_features)` features are considered at each
+          split.
+        - If "auto", then `max_features=n_features`.
+        - If "sqrt", then `max_features=sqrt(n_features)`.
+        - If "log2", then `max_features=log2(n_features)`.
+        - If None, then `max_features=n_features`.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires to
+        effectively inspect more than ``max_features`` features.
+
+    max_leaf_nodes : int, default=None
+        Grow trees with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+
+    min_impurity_decrease : float, default=0.0
+        A node will be split if this split induces a decrease of the impurity
+        greater than or equal to this value.
+
+    bootstrap : bool, default=True
+        Whether bootstrap samples are used when building trees. If False, the
+        whole dataset is used to build each tree.
+
+    oob_score : bool, default=False
+        Whether to use out-of-bag samples to estimate the generalization score.
+        Only available if bootstrap=True.
+
+    n_jobs : int, default=None
+        The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
+        :meth:`decision_path` and :meth:`apply` are all parallelized over the
+        trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
+        context. ``-1`` means using all processors.
+
+    random_state : int, RandomState instance or None, default=None
+        Controls both the randomness of the bootstrapping of the samples used
+        when building trees (if ``bootstrap=True``) and the sampling of the
+        features to consider when looking for the best split at each node
+        (if ``max_features < n_features``).
+
+    ccp_alpha : non-negative float, default=0.0
+        Complexity parameter used for Minimal Cost-Complexity Pruning. The
+        subtree with the largest cost complexity that is smaller than
+        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
+        :ref:`minimal_cost_complexity_pruning` for details.
+
+    max_samples : int or float, default=None
+        If bootstrap is True, the number of samples to draw from X
+        to train each base estimator.
+
+        - If None (default), then draw `X.shape[0]` samples.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples. Thus,
+          `max_samples` should be in the interval `(0, 1]`.
+
+    Attributes
+    ----------
+    n_features_in_ : int
+        The number of features when :meth:`fit` is performed.
+
+    feature_importances_ : ndarray of shape (n_features, )
+        The impurity-based feature importances.
+        The higher, the more important the feature.
+        The importance of a feature is computed as the (normalized)
+        total reduction of the criterion brought by that feature. It is also
+        known as the Gini importance.
+
+    coef_ : ndarray of shape (1, n_features_out_)
+        Coefficient of the features in the decision function.
+
+    intercept_ : float
+        Independent term in the linear model. Set to 0 if
+        `fit_intercept = False` in `base_estimator`.
+
+    classes_ : ndarray of shape (n_classes, )
+        A list of class labels known to the classifier.
+
+    base_estimator_ : object
+        A fitted linear model instance.
+
+    forest_estimator_ : object
+        A fitted random forest instance.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import LinearRegression
+    >>> from lineartree import LinearForestClassifier
+    >>> from sklearn.datasets import make_classification
+    >>> X, y = make_classification(n_samples=100, n_classes=2, n_features=4,
+    ...                            n_informative=2, n_redundant=0,
+    ...                            random_state=0, shuffle=False)
+    >>> clf = LinearForestClassifier(base_estimator=LinearRegression())
+    >>> clf.fit(X, y)
+    >>> clf.predict([[0, 0, 0, 0]])
+    array([1])
+
+    References
+    ----------
+    Regression-Enhanced Random Forests.
+    Authors: Haozhe Zhang, Dan Nettleton, Zhengyuan Zhu.
+    (https://arxiv.org/abs/1904.10416)
+    """
+
+    def __init__(
+        self,
+        base_estimator,
+        *,
+        n_estimators=100,
+        max_depth=None,
+        min_samples_split=2,
+        min_samples_leaf=1,
+        min_weight_fraction_leaf=0.0,
+        max_features="auto",
+        max_leaf_nodes=None,
+        min_impurity_decrease=0.0,
+        bootstrap=True,
+        oob_score=False,
+        n_jobs=None,
+        random_state=None,
+        ccp_alpha=0.0,
+        max_samples=None
+    ):
+
+        self.base_estimator = base_estimator
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.bootstrap = bootstrap
+        self.oob_score = oob_score
+        self.n_jobs = n_jobs
+        self.random_state = random_state
+        self.ccp_alpha = ccp_alpha
+        self.max_samples = max_samples
+
+    def fit(self, X, y, sample_weight=None):
+        """Build a Linear Forest from the training set (X, y).
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples, ) or (n_samples, n_targets)
+            Target values.
+
+        sample_weight : array-like of shape (n_samples, ), default=None
+            Sample weights.
+
+        Returns
+        -------
+        self : object
+        """
+        # Convert data (X is required to be 2d and indexable)
+        X, y = self._validate_data(
+            X, y, accept_sparse=True, dtype=None, multi_output=False
+        )
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
+
+        self.classes_ = np.unique(y)
+        if len(self.classes_) > 2:
+            raise ValueError(
+                "LinearForestClassifier supports only binary classification "
+                "tasks. To solve a multi-label classification task, use "
+                "LinearForestClassifier with OneVsRestClassifier from sklearn."
+            )
+
+        self._fit(X, y, sample_weight)
+
+        return self
+
+    def decision_function(self, X):
+        """Predict confidence scores for samples.
+
+        The confidence score for a sample is proportional to the signed
+        distance of that sample to the hyperplane.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, )
+            Confidence scores.
+            Confidence score for self.classes_[1] where >0 means this
+            class would be predicted.
+        """
+        check_is_fitted(self, attributes="base_estimator_")
+        X = check_array(X, dtype=None, accept_sparse=False)
+        self._check_n_features(X, reset=False)
+
+        linear_pred = self.base_estimator_.predict(X)
+        forest_pred = self.forest_estimator_.predict(X)
+
+        return linear_pred + forest_pred
+
+    def predict(self, X):
+        """Predict class for X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, )
+            The predicted classes.
+        """
+        pred = self.decision_function(X)
+        pred_class = (self._sigmoid(pred) > 0.5).astype(int)
+        int_to_class = dict(enumerate(self.classes_))
+        pred_class = np.array([int_to_class[i] for i in pred_class])
+
+        return pred_class
+
+    def predict_proba(self, X):
+        """Predict class probabilities for X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        proba : ndarray of shape (n_samples, n_classes)
+            The class probabilities of the input samples.
+            The order of the classes corresponds to that in the
+            attribute :term:`classes_`.
+        """
+        check_is_fitted(self, attributes="base_estimator_")
+        X = check_array(X, dtype=None, accept_sparse=False)
+        self._check_n_features(X, reset=False)
+
+        # Combine the linear and forest contributions, consistently with
+        # decision_function and predict.
+        pred = self._sigmoid(
+            self.base_estimator_.predict(X) + self.forest_estimator_.predict(X)
+        )
+        proba = np.zeros((X.shape[0], 2))
+        proba[:, 0] = 1 - pred
+        proba[:, 1] = pred
+
+        return proba
+
+    def predict_log_proba(self, X):
+        """Predict class log-probabilities for X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, n_classes)
+            The class log-probabilities of the input samples. The order of the
+            classes corresponds to that in the attribute :term:`classes_`.
+        """
+        return np.log(self.predict_proba(X))
+
+
+class LinearForestRegressor(_LinearForest, RegressorMixin):
+    """A Linear Forest Regressor.
+
+    Linear forests generalize the well-known random forests by combining
+    linear models with random forests.
+    The key idea of linear forests is to use the strength of linear models
+    to improve the nonparametric learning ability of tree-based algorithms.
+    First, a linear model is fitted on the whole dataset; then a random
+    forest is trained on the same dataset, using the residuals of the
+    previous step as target. The final predictions are the sum of the raw
+    linear predictions and the residuals modeled by the random forest.
+
+    Parameters
+    ----------
+    base_estimator : object
+        The linear estimator fitted on the raw target.
+        The linear estimator must be a regressor from sklearn.linear_model.
+
+    n_estimators : int, default=100
+        The number of trees in the forest.
+
+    max_depth : int, default=None
+        The maximum depth of the tree. If None, then nodes are expanded until
+        all leaves are pure or until all leaves contain less than
+        min_samples_split samples.
+
+    min_samples_split : int or float, default=2
+        The minimum number of samples required to split an internal node:
+
+        - If int, then consider `min_samples_split` as the minimum number.
+        - If float, then `min_samples_split` is a fraction and
+          `ceil(min_samples_split * n_samples)` are the minimum
+          number of samples for each split.
+
+    min_samples_leaf : int or float, default=1
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at
+        least ``min_samples_leaf`` training samples in each of the left and
+        right branches. This may have the effect of smoothing the model,
+        especially in regression.
+
+        - If int, then consider `min_samples_leaf` as the minimum number.
+        - If float, then `min_samples_leaf` is a fraction and
+          `ceil(min_samples_leaf * n_samples)` are the minimum
+          number of samples for each node.
+
+    min_weight_fraction_leaf : float, default=0.0
+        The minimum weighted fraction of the sum total of weights (of all
+        the input samples) required to be at a leaf node. Samples have
+        equal weight when sample_weight is not provided.
+
+    max_features : {"auto", "sqrt", "log2"}, int or float, default="auto"
+        The number of features to consider when looking for the best split:
+
+        - If int, then consider `max_features` features at each split.
+        - If float, then `max_features` is a fraction and
+          `round(max_features * n_features)` features are considered at each
+          split.
+        - If "auto", then `max_features=n_features`.
+        - If "sqrt", then `max_features=sqrt(n_features)`.
+        - If "log2", then `max_features=log2(n_features)`.
+        - If None, then `max_features=n_features`.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires to
+        effectively inspect more than ``max_features`` features.
+
+    max_leaf_nodes : int, default=None
+        Grow trees with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+
+    min_impurity_decrease : float, default=0.0
+        A node will be split if this split induces a decrease of the impurity
+        greater than or equal to this value.
+
+    bootstrap : bool, default=True
+        Whether bootstrap samples are used when building trees. If False, the
+        whole dataset is used to build each tree.
+
+    oob_score : bool, default=False
+        Whether to use out-of-bag samples to estimate the generalization score.
+        Only available if bootstrap=True.
+
+    n_jobs : int, default=None
+        The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
+        :meth:`decision_path` and :meth:`apply` are all parallelized over the
+        trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
+        context. ``-1`` means using all processors.
+
+    random_state : int, RandomState instance or None, default=None
+        Controls both the randomness of the bootstrapping of the samples used
+        when building trees (if ``bootstrap=True``) and the sampling of the
+        features to consider when looking for the best split at each node
+        (if ``max_features < n_features``).
+
+    ccp_alpha : non-negative float, default=0.0
+        Complexity parameter used for Minimal Cost-Complexity Pruning. The
+        subtree with the largest cost complexity that is smaller than
+        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
+        :ref:`minimal_cost_complexity_pruning` for details.
+
+    max_samples : int or float, default=None
+        If bootstrap is True, the number of samples to draw from X
+        to train each base estimator.
+
+        - If None (default), then draw `X.shape[0]` samples.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples. Thus,
+          `max_samples` should be in the interval `(0, 1]`.
+
+    Attributes
+    ----------
+    n_features_in_ : int
+        The number of features when :meth:`fit` is performed.
+
+    feature_importances_ : ndarray of shape (n_features, )
+        The impurity-based feature importances.
+        The higher, the more important the feature.
+        The importance of a feature is computed as the (normalized)
+        total reduction of the criterion brought by that feature. It is also
+        known as the Gini importance.
+
+    coef_ : array of shape (n_features, ) or (n_targets, n_features)
+        Estimated coefficients for the linear regression problem.
+        If multiple targets are passed during the fit (y 2D), this is a
+        2D array of shape (n_targets, n_features), while if only one target
+        is passed, this is a 1D array of length n_features.
+
+    intercept_ : float or array of shape (n_targets, )
+        Independent term in the linear model. Set to 0 if
+        `fit_intercept = False` in `base_estimator`.
+
+    base_estimator_ : object
+        A fitted linear model instance.
+
+    forest_estimator_ : object
+        A fitted random forest instance.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import LinearRegression
+    >>> from lineartree import LinearForestRegressor
+    >>> from sklearn.datasets import make_regression
+    >>> X, y = make_regression(n_samples=100, n_features=4,
+    ...                        n_informative=2, n_targets=1,
+    ...                        random_state=0, shuffle=False)
+    >>> regr = LinearForestRegressor(base_estimator=LinearRegression())
+    >>> regr.fit(X, y)
+    >>> regr.predict([[0, 0, 0, 0]])
+    array([8.8817842e-16])
+
+    References
+    ----------
+    Regression-Enhanced Random Forests.
+    Authors: Haozhe Zhang, Dan Nettleton, Zhengyuan Zhu.
+    (https://arxiv.org/abs/1904.10416)
+    """
+
+    def __init__(
+        self,
+        base_estimator,
+        *,
+        n_estimators=100,
+        max_depth=None,
+        min_samples_split=2,
+        min_samples_leaf=1,
+        min_weight_fraction_leaf=0.0,
+        max_features="auto",
+        max_leaf_nodes=None,
+        min_impurity_decrease=0.0,
+        bootstrap=True,
+        oob_score=False,
+        n_jobs=None,
+        random_state=None,
+        ccp_alpha=0.0,
+        max_samples=None
+    ):
+
+        self.base_estimator = base_estimator
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.bootstrap = bootstrap
+        self.oob_score = oob_score
+        self.n_jobs = n_jobs
+        self.random_state = random_state
+        self.ccp_alpha = ccp_alpha
+        self.max_samples = max_samples
+
+    def fit(self, X, y, sample_weight=None):
+        """Build a Linear Forest from the training set (X, y).
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like of shape (n_samples, ) or (n_samples, n_targets)
+            Target values.
+
+        sample_weight : array-like of shape (n_samples, ), default=None
+            Sample weights.
+
+        Returns
+        -------
+        self : object
+        """
+        # Convert data (X is required to be 2d and indexable)
+        X, y = self._validate_data(
+            X, y, accept_sparse=True, dtype=None, multi_output=True
+        )
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
+
+        y_shape = np.shape(y)
+        n_targets = y_shape[1] if len(y_shape) > 1 else 1
+        if n_targets < 2:
+            y = y.ravel()
+        self._fit(X, y, sample_weight)
+
+        return self
+
+    def predict(self, X):
+        """Predict regression target for X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        pred : ndarray of shape (n_samples, ) or (n_samples, n_targets) for
+            multitarget regression.
+            The predicted values.
+        """
+        check_is_fitted(self, attributes="base_estimator_")
+        X = check_array(X, dtype=None, accept_sparse=False)
+        self._check_n_features(X, reset=False)
+
+        linear_pred = self.base_estimator_.predict(X)
+        forest_pred = self.forest_estimator_.predict(X)
+
+        return linear_pred + forest_pred
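
The estimators added above lend themselves to short smoke tests. The sketches below are not part of the diff; they assume the vendored module is importable as `lineartree`, as its docstring examples suggest. First, a sketch of why linear leaves pay off on piecewise-linear data: a `LinearTreeRegressor` of depth 2 can recover two slopes with a couple of splits, where a constant-leaf tree of the same depth can only staircase them. The data-generating function here is made up for illustration.

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from lineartree import LinearTreeRegressor  # assumed import path

rng = np.random.RandomState(0)
# One feature, two linear regimes meeting at x = 0 (hypothetical toy data).
X = np.sort(rng.uniform(-3, 3, size=(200, 1)), axis=0)
y = np.where(X[:, 0] < 0, 2 * X[:, 0], -X[:, 0]) + rng.normal(0, 0.1, 200)

ltr = LinearTreeRegressor(base_estimator=LinearRegression(), max_depth=2)
ltr.fit(X, y)

dtr = DecisionTreeRegressor(max_depth=2).fit(X, y)

# The linear tree fits a line per leaf, so a single split near 0 suffices;
# the constant-leaf tree approximates each slope with step functions.
print("linear tree R^2:", ltr.score(X, y))
print("decision tree R^2:", dtr.score(X, y))
```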
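For `LinearTreeClassifier`, the `predict_proba` fallback branch above returns a one-hot encoding whenever the base estimator lacks `predict_proba`. A quick way to see both behaviors under the same import assumption:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from lineartree import LinearTreeClassifier  # assumed import path

X, y = make_classification(n_samples=200, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0)

# RidgeClassifier has no predict_proba, so probabilities degrade to one-hot.
hard = LinearTreeClassifier(base_estimator=RidgeClassifier()).fit(X, y)

# LogisticRegression has predict_proba, which also unlocks "crossentropy".
soft = LinearTreeClassifier(base_estimator=LogisticRegression(),
                            criterion="crossentropy").fit(X, y)

print(hard.predict_proba(X[:3]))  # rows are exactly 0/1 (one-hot fallback)
print(soft.predict_proba(X[:3]))  # rows are real leaf-model probabilities
```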
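Since `LinearBoostRegressor.predict` calls `self.transform(X)`, each boosting stage materializes as one appended binary feature (membership in the worst leaf of that stage), and the documented `n_features_out_` equals `n_features_in_ + n_estimators`. A sketch, assuming `transform` is exposed on the fitted estimator as `predict` implies:

```python
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from lineartree import LinearBoostRegressor  # assumed import path

X, y = make_regression(n_samples=200, n_features=5, n_informative=3,
                       random_state=0)

booster = LinearBoostRegressor(base_estimator=LinearRegression(),
                               n_estimators=10, max_depth=3).fit(X, y)

print(booster.n_features_out_)     # 5 original + 10 generated = 15
print(booster.transform(X).shape)  # (200, 15), the final linear design matrix
print(booster.coef_.shape)         # one coefficient per output feature: (15,)
```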
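The `fit` validation in `LinearBoostClassifier` rejects the `"entropy"` loss when the base estimator has no `predict_proba`. A minimal check of both branches:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from lineartree import LinearBoostClassifier  # assumed import path

X, y = make_classification(n_samples=200, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0)

# LogisticRegression exposes predict_proba, so the entropy loss is allowed.
LinearBoostClassifier(base_estimator=LogisticRegression(),
                      loss="entropy").fit(X, y)

try:
    # RidgeClassifier has no predict_proba: fit raises before training.
    LinearBoostClassifier(base_estimator=RidgeClassifier(),
                          loss="entropy").fit(X, y)
except ValueError as err:
    print(err)  # the validation message from fit above
```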
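For `LinearForestClassifier`, `decision_function` is literally the sum of the linear logits and the forest-modeled residuals, and `predict` thresholds its sigmoid at 0.5. The sketch below checks both identities; `max_features=None` is passed because the class's `"auto"` default may be rejected by recent scikit-learn forests (an assumption about the installed version, not something the diff enforces):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LinearRegression
from lineartree import LinearForestClassifier  # assumed import path

X, y = make_classification(n_samples=200, n_classes=2, n_features=4,
                           n_informative=2, n_redundant=0, random_state=0)

clf = LinearForestClassifier(base_estimator=LinearRegression(),
                             max_features=None,  # sidestep "auto" default
                             random_state=0).fit(X, y)

# decision_function(X) == linear logits + forest residual corrections
scores = clf.decision_function(X)
manual = clf.base_estimator_.predict(X) + clf.forest_estimator_.predict(X)
assert np.allclose(scores, manual)

# predict(X) thresholds the sigmoid of the scores at 0.5
pred = clf.classes_[(1 / (1 + np.exp(-scores)) > 0.5).astype(int)]
assert (pred == clf.predict(X)).all()
```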
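Finally, `LinearForestRegressor.predict` above returns exactly `linear_pred + forest_pred`, with the forest fitted on the linear model's residuals. The decomposition is easy to confirm on the fitted sub-estimators:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from lineartree import LinearForestRegressor  # assumed import path

X, y = make_regression(n_samples=200, n_features=4, n_informative=2,
                       random_state=0)

reg = LinearForestRegressor(base_estimator=LinearRegression(),
                            max_features=None,  # sidestep "auto" default
                            random_state=0).fit(X, y)

linear_part = reg.base_estimator_.predict(X)   # fit on the raw target
forest_part = reg.forest_estimator_.predict(X) # fit on y - linear_part
assert np.allclose(reg.predict(X), linear_part + forest_part)
```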