author    Birte Kristina Friesel <birte.friesel@uos.de>  2024-03-06 16:27:19 +0100
committer Birte Kristina Friesel <birte.friesel@uos.de>  2024-03-06 16:27:19 +0100
commit    29d10d5dd6c08bcafc7c34c48b8db599fcbd7e49 (patch)
tree      8de6f67ffed23da7e99be12c87a2c28bdfedad0a /lib
parent    89568c6b4b9a35612c794431d551bc0cc638e46d (diff)
Add LightGBM support
Diffstat (limited to 'lib')
-rw-r--r--  lib/cli.py        |   2
-rw-r--r--  lib/functions.py  | 167
-rw-r--r--  lib/model.py      |   4
-rw-r--r--  lib/parameters.py |  18
4 files changed, 190 insertions(+), 1 deletion(-)
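All LightGBM hyperparameters are optional and read from DFATOOL_LGBM_* environment variables at fit time, falling back to the LGBMRegressor defaults (see the lib/functions.py hunk below). A minimal configuration sketch follows; treating DFATOOL_MODEL as the model-type selector is an assumption here, while the DFATOOL_LGBM_* names are taken verbatim from this commit:

    import os

    # Assumption: DFATOOL_MODEL is dfatool's model-type selector; the
    # lib/model.py hunk below dispatches model_type == "lgbm" to build_lgbm().
    os.environ["DFATOOL_MODEL"] = "lgbm"

    # Each DFATOOL_LGBM_* variable overrides one LGBMRegressor argument;
    # unset variables keep the defaults noted in lib/functions.py below.
    os.environ["DFATOOL_LGBM_BOOSTER"] = "dart"        # 'gbdt' (default), 'dart', or 'rf'
    os.environ["DFATOOL_LGBM_N_ESTIMATORS"] = "200"    # default: 100
    os.environ["DFATOOL_LGBM_MAX_DEPTH"] = "8"         # default: -1 (<= 0 means no limit)
    os.environ["DFATOOL_LGBM_LEARNING_RATE"] = "0.05"  # default: 0.1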
diff --git a/lib/cli.py b/lib/cli.py
--- a/lib/cli.py
+++ b/lib/cli.py
@@ -207,6 +207,8 @@ def print_model(prefix, info):
         print_splitinfo(info, prefix)
     elif type(info) is df.LMTFunction:
         print_lmtinfo(prefix, info)
+    elif type(info) is df.LightGBMFunction:
+        print_xgbinfo(prefix, info)
     elif type(info) is df.XGBoostFunction:
         print_xgbinfo(prefix, info)
     elif type(info) is df.SymbolicRegressionFunction:
diff --git a/lib/functions.py b/lib/functions.py
index f557cbb..417c8c8 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -991,6 +991,173 @@ class LMTFunction(SKLearnRegressionFunction):
         }


+class LightGBMFunction(SKLearnRegressionFunction):
+
+    def fit(self, param_values, data):
+
+        # boosting_type : str, optional (default='gbdt')
+        #     'gbdt', traditional Gradient Boosting Decision Tree.
+        #     'dart', Dropouts meet Multiple Additive Regression Trees.
+        #     'rf', Random Forest.
+        boosting_type = os.getenv("DFATOOL_LGBM_BOOSTER", "gbdt")
+
+        # n_estimators : int, optional (default=100)
+        #     Number of boosted trees to fit.
+        n_estimators = int(os.getenv("DFATOOL_LGBM_N_ESTIMATORS", "100"))
+
+        # max_depth : int, optional (default=-1)
+        #     Maximum tree depth for base learners, <=0 means no limit.
+        max_depth = int(os.getenv("DFATOOL_LGBM_MAX_DEPTH", "-1"))
+
+        # num_leaves : int, optional (default=31)
+        #     Maximum tree leaves for base learners.
+        num_leaves = int(os.getenv("DFATOOL_LGBM_NUM_LEAVES", "31"))
+
+        # subsample : float, optional (default=1.)
+        #     Subsample ratio of the training instance.
+        subsample = float(os.getenv("DFATOOL_LGBM_SUBSAMPLE", "1."))
+
+        # learning_rate : float, optional (default=0.1)
+        #     Boosting learning rate.
+        #     You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
+        #     in training using ``reset_parameter`` callback.
+        #     Note, that this will ignore the ``learning_rate`` argument in training.
+        learning_rate = float(os.getenv("DFATOOL_LGBM_LEARNING_RATE", "0.1"))
+
+        # min_split_gain : float, optional (default=0.)
+        #     Minimum loss reduction required to make a further partition on a leaf node of the tree.
+        min_split_gain = float(os.getenv("DFATOOL_LGBM_MIN_SPLIT_GAIN", "0."))
+
+        # min_child_samples : int, optional (default=20)
+        #     Minimum number of data needed in a child (leaf).
+        min_child_samples = int(os.getenv("DFATOOL_LGBM_MIN_CHILD_SAMPLES", "20"))
+
+        # reg_alpha : float, optional (default=0.)
+        #     L1 regularization term on weights.
+        reg_alpha = float(os.getenv("DFATOOL_LGBM_REG_ALPHA", "0."))
+
+        # reg_lambda : float, optional (default=0.)
+        #     L2 regularization term on weights.
+        reg_lambda = float(os.getenv("DFATOOL_LGBM_REG_LAMBDA", "0."))
+
+        fit_parameters, self.categorical_to_index, self.ignore_index = param_to_ndarray(
+            param_values,
+            with_nan=False,
+            categorical_to_scalar=self.categorical_to_scalar,
+        )
+        if fit_parameters.shape[1] == 0:
+            logger.warning(
+                f"Cannot run LightGBM due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape is {fit_parameters.shape}"
+            )
+            self.fit_success = False
+            return self
+
+        import dfatool.lightgbm as lightgbm
+
+        lgbr = lightgbm.LGBMRegressor(
+            boosting_type=boosting_type,
+            n_estimators=n_estimators,
+            max_depth=max_depth,
+            num_leaves=num_leaves,
+            subsample=subsample,
+            learning_rate=learning_rate,
+            min_split_gain=min_split_gain,
+            min_child_samples=min_child_samples,
+            reg_alpha=reg_alpha,
+            reg_lambda=reg_lambda,
+        )
+        lgbr.fit(fit_parameters, data)
+        self.fit_success = True
+        self.regressor = lgbr
+        self._build_feature_names()
+
+        return self
+
+    def to_json(self, internal=False, **kwargs):
+        forest = self.regressor.booster_.dump_model()["tree_info"]
+        if internal:
+            return forest
+        return list(
+            map(
+                lambda tree: self._model_to_json(tree["tree_structure"], **kwargs),
+                forest,
+            )
+        )
+
+    def _model_to_json(self, tree, **kwargs):
+        ret = dict()
+        if "left_child" in tree:
+            assert "right_child" in tree
+            assert tree["decision_type"] == "<="
+            return {
+                "type": "scalarSplit",
+                "paramName": self.feature_names[tree["split_feature"]],
+                "threshold": tree["threshold"],
+                "value": None,
+                "left": self._model_to_json(tree["left_child"], **kwargs),
+                "right": self._model_to_json(tree["right_child"], **kwargs),
+            }
+        else:
+            return {
+                "type": "static",
+                "value": tree["leaf_value"],
+            }
+
+    def get_number_of_nodes(self):
+        return sum(
+            map(
+                lambda t: self._get_number_of_nodes(t["tree_structure"]),
+                self.to_json(internal=True),
+            )
+        )
+
+    def _get_number_of_nodes(self, data):
+        ret = 1
+        if "left_child" in data:
+            ret += self._get_number_of_nodes(data["left_child"])
+        if "right_child" in data:
+            ret += self._get_number_of_nodes(data["right_child"])
+        return ret
+
+    def get_number_of_leaves(self):
+        return sum(map(lambda t: t["num_leaves"], self.to_json(internal=True)))
+
+    def get_max_depth(self):
+        return max(
+            map(
+                lambda t: self._get_max_depth(t["tree_structure"]),
+                self.to_json(internal=True),
+            )
+        )
+
+    def _get_max_depth(self, data):
+        ret = [0]
+        if "left_child" in data:
+            ret.append(self._get_max_depth(data["left_child"]))
+        if "right_child" in data:
+            ret.append(self._get_max_depth(data["right_child"]))
+        return 1 + max(ret)
+
+    def get_complexity_score(self):
+        return self.get_number_of_nodes()
+
+    def hyper_to_dref(self):
+        return {
+            "lgbm/boosting type": self.regressor.boosting_type,
+            "lgbm/n estimators": self.regressor.n_estimators,
+            "lgbm/max depth": self.regressor.max_depth == -1
+            and "infty"
+            or self.regressor.max_depth,
+            "lgbm/max leaves": self.regressor.num_leaves,
+            "lgbm/subsample": self.regressor.subsample,
+            "lgbm/learning rate": self.regressor.learning_rate,
+            "lgbm/min split gain": self.regressor.min_split_gain,
+            "lgbm/min child samples": self.regressor.min_child_samples,
+            "lgbm/alpha": self.regressor.reg_alpha,
+            "lgbm/lambda": self.regressor.reg_lambda,
+        }
+
+
 class XGBoostFunction(SKLearnRegressionFunction):

     def fit(self, param_values, data):
diff --git a/lib/model.py b/lib/model.py
index ffe1fb7..5770218 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -313,10 +313,12 @@ class AnalyticModel:
                         self.attr_by_name[name][attr].build_lmt()
                     elif model_type == "symreg":
                         self.attr_by_name[name][attr].build_symreg()
+                    elif model_type == "lgbm":
+                        self.attr_by_name[name][attr].build_lgbm()
                     elif model_type == "xgb":
                         self.attr_by_name[name][attr].build_xgb()
                     else:
-                        logger.error("build_fitted: unknown model type: {model_type}")
+                        logger.error(f"build_fitted: unknown model type: {model_type}")
         elif self.force_tree:
             for name in self.names:
                 for attr in self.by_name[name]["attributes"]:
diff --git a/lib/parameters.py b/lib/parameters.py
index 0b0da81..8c7c9cb 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -990,6 +990,24 @@ class ModelAttribute:
             )
             return False

+    def build_lgbm(self):
+        mf = df.LightGBMFunction(
+            np.mean(self.data),
+            n_samples=len(self.data),
+            param_names=self.param_names,
+            arg_count=self.arg_count,
+        ).fit(self.param_values, self.data)
+
+        if mf.fit_success:
+            self.model_function = mf
+            return True
+        else:
+            logger.warning(f"LightGBM generation for {self.name} {self.attr} failed")
+            self.model_function = df.StaticFunction(
+                np.mean(self.data), n_samples=len(self.data)
+            )
+            return False
+
     def build_xgb(self):
         mf = df.XGBoostFunction(
             np.mean(self.data),
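For reference, a standalone usage sketch of the new class, mirroring what ModelAttribute.build_lgbm() does above. It assumes that dfatool's lib/ directory is importable as the dfatool package (as the commit's own "import dfatool.lightgbm" suggests) and that lightgbm is installed; the parameter names and measurements are made-up toy data:

    import os
    import numpy as np
    import dfatool.functions as df

    # Toy data: lower the leaf-size threshold (default 20) so six samples suffice.
    os.environ["DFATOOL_LGBM_MIN_CHILD_SAMPLES"] = "1"

    # One parameter tuple per observation, one measured value each.
    param_values = [[1, 8], [2, 8], [4, 8], [1, 16], [2, 16], [4, 16]]
    data = np.array([10.0, 19.5, 41.0, 12.0, 22.5, 45.0])

    # Same constructor arguments as in ModelAttribute.build_lgbm() above.
    mf = df.LightGBMFunction(
        np.mean(data),
        n_samples=len(data),
        param_names=["n", "payload"],
        arg_count=0,
    ).fit(param_values, data)

    if mf.fit_success:
        # Complexity metrics and JSON export provided by the new class.
        print(mf.get_number_of_nodes(), mf.get_number_of_leaves(), mf.get_max_depth())
        print(mf.to_json())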