-rw-r--r--  README.md         |  13
-rw-r--r--  lib/cli.py        |   2
-rw-r--r--  lib/functions.py  | 167
-rw-r--r--  lib/model.py      |   4
-rw-r--r--  lib/parameters.py |  18
5 files changed, 202 insertions, 2 deletions
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ dfatool supports six types of performance models:
 * CART: Regression Trees
 * DECART: Regression Trees with exclusively binary features/parameters
+* LightGBM: Regression Forests
 * XGB: Regression Forests
 * LMT: Linear Model Trees
 * RMT: [Regression Model Trees](https://ess.cs.uos.de/static/papers/Friesel-2022-CPSIoTBench.pdf) with [non-binary nodes](https://ess.cs.uos.de/static/papers/Friesel-2022-CAIN.pdf)
 
@@ -111,10 +112,20 @@ The following variables may be set to alter the behaviour of dfatool components.
 | `DFATOOL_KCONF_WITH_CHOICE_NODES` | 0, **1** | Treat kconfig choices (e.g. "choice Model → MobileNet / ResNet / Inception") as enum parameters. If enabled, the corresponding boolean kconfig variables (e.g. "Model\_MobileNet") are not converted to parameters. If disabled, all (and only) boolean kconfig variables are treated as parameters. Mostly relevant for analyze-kconfig, eval-kconfig |
 | `DFATOOL_COMPENSATE_DRIFT` | **0**, 1 | Perform drift compensation for loaders without sync input (e.g. EnergyTrace or Keysight) |
 | `DFATOOL_DRIFT_COMPENSATION_PENALTY` | 0 .. 100 (default: majority vote over several penalties) | Specify penalty for ruptures.py PELT changepoint petection |
-| `DFATOOL_MODEL` | cart, decart, fol, lmt, **rmt**, symreg, xgb | Modeling method. See below for method-specific configuration options. |
+| `DFATOOL_MODEL` | cart, decart, fol, lgbm, lmt, **rmt**, symreg, xgb | Modeling method. See below for method-specific configuration options. |
 | `DFATOOL_RMT_SUBMODEL` | cart, fol, static, symreg, **uls** | Modeling method for RMT leaf functions. |
 | `DFATOOL_RMT_ENABLED` | 0, **1** | Use decision trees in get\_fitted |
 | `DFATOOL_CART_MAX_DEPTH` | **0** .. *n* | maximum depth for sklearn CART. Default (0): unlimited. |
+| `DFATOOL_LGBM_BOOSTER` | **gbdt**, dart, rf | Boosting type. |
+| `DFATOOL_LGBM_N_ESTIMATORS` | .., **100**, .. | Number of estimators. |
+| `DFATOOL_LGBM_MAX_DEPTH` | **-1**, .., *n* | Maximum tree depth, unlimited if ≤ 0. |
+| `DFATOOL_LGBM_NUM_LEAVES` | .., **31**, .. | Maximum number of leaves per tree. |
+| `DFATOOL_LGBM_SUBSAMPLE` | 0.0 .. **1.0** | Subsampling ratio. |
+| `DFATOOL_LGBM_LEARNING_RATE` | 0 .. **0.1** .. 1 | Learning rate. |
+| `DFATOOL_LGBM_MIN_SPLIT_GAIN` | **0.0** .. 1 | Minimum loss reduction required for a split. |
+| `DFATOOL_LGBM_MIN_CHILD_SAMPLES` | .., **20**, .. | Minimum samples that each leaf of a split candidate must contain. |
+| `DFATOOL_LGBM_REG_ALPHA` | **0.0** .. *n* | L1 regularization term on weights. |
+| `DFATOOL_LGBM_REG_LAMBDA` | **0.0** .. *n* | L2 regularization term on weights. |
 | `DFATOOL_LMT_MAX_DEPTH` | **5** .. 20 | Maximum depth for LMT. |
 | `DFATOOL_LMT_MIN_SAMPLES_SPLIT` | 0.0 .. 1.0, **6** .. *n* | Minimum samples required to still perform an LMT split. A value below 1.0 sets the specified ratio of the total number of training samples as minimum. |
 | `DFATOOL_LMT_MIN_SAMPLES_LEAF` | 0.0 .. **0.1** .. 1.0, 3 .. *n* | Minimum samples that each leaf of a split candidate must contain. A value below 1.0 specifies a ratio of the total number of training samples. A value above 1 specifies an absolute number of samples. |
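The new `DFATOOL_LGBM_*` settings are plain environment variables that are read when the model is fitted, so they can be overridden per invocation. A minimal sketch of setting a few of them from Python before dfatool builds its models; the surrounding driver code is illustrative and not part of this patch:

```python
import os

# Select the LightGBM modeling method and tune a few of the new hyper-parameters.
# Variable names and defaults are taken from the README table above.
os.environ["DFATOOL_MODEL"] = "lgbm"
os.environ["DFATOOL_LGBM_BOOSTER"] = "gbdt"        # default; "dart" and "rf" are also valid
os.environ["DFATOOL_LGBM_N_ESTIMATORS"] = "200"    # default: 100
os.environ["DFATOOL_LGBM_MAX_DEPTH"] = "8"         # default: -1 (unlimited if <= 0)
os.environ["DFATOOL_LGBM_LEARNING_RATE"] = "0.05"  # default: 0.1
```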
diff --git a/lib/cli.py b/lib/cli.py
--- a/lib/cli.py
+++ b/lib/cli.py
@@ -207,6 +207,8 @@ def print_model(prefix, info):
         print_splitinfo(info, prefix)
     elif type(info) is df.LMTFunction:
         print_lmtinfo(prefix, info)
+    elif type(info) is df.LightGBMFunction:
+        print_xgbinfo(prefix, info)
     elif type(info) is df.XGBoostFunction:
         print_xgbinfo(prefix, info)
     elif type(info) is df.SymbolicRegressionFunction:
diff --git a/lib/functions.py b/lib/functions.py
index f557cbb..417c8c8 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -991,6 +991,173 @@ class LMTFunction(SKLearnRegressionFunction):
         }
 
 
+class LightGBMFunction(SKLearnRegressionFunction):
+
+    def fit(self, param_values, data):
+
+        # boosting_type : str, optional (default='gbdt')
+        # 'gbdt', traditional Gradient Boosting Decision Tree.
+        # 'dart', Dropouts meet Multiple Additive Regression Trees.
+        # 'rf', Random Forest.
+        boosting_type = os.getenv("DFATOOL_LGBM_BOOSTER", "gbdt")
+
+        # n_estimators : int, optional (default=100)
+        # Number of boosted trees to fit.
+        n_estimators = int(os.getenv("DFATOOL_LGBM_N_ESTIMATORS", "100"))
+
+        # max_depth : int, optional (default=-1)
+        # Maximum tree depth for base learners, <=0 means no limit.
+        max_depth = int(os.getenv("DFATOOL_LGBM_MAX_DEPTH", "-1"))
+
+        # num_leaves : int, optional (default=31)
+        # Maximum tree leaves for base learners.
+        num_leaves = int(os.getenv("DFATOOL_LGBM_NUM_LEAVES", "31"))
+
+        # subsample : float, optional (default=1.)
+        # Subsample ratio of the training instance.
+        subsample = float(os.getenv("DFATOOL_LGBM_SUBSAMPLE", "1."))
+
+        # learning_rate : float, optional (default=0.1)
+        # Boosting learning rate.
+        # You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
+        # in training using ``reset_parameter`` callback.
+        # Note, that this will ignore the ``learning_rate`` argument in training.
+        learning_rate = float(os.getenv("DFATOOL_LGBM_LEARNING_RATE", "0.1"))
+
+        # min_split_gain : float, optional (default=0.)
+        # Minimum loss reduction required to make a further partition on a leaf node of the tree.
+        min_split_gain = float(os.getenv("DFATOOL_LGBM_MIN_SPLIT_GAIN", "0."))
+
+        # min_child_samples : int, optional (default=20)
+        # Minimum number of data needed in a child (leaf).
+        min_child_samples = int(os.getenv("DFATOOL_LGBM_MIN_CHILD_SAMPLES", "20"))
+
+        # reg_alpha : float, optional (default=0.)
+        # L1 regularization term on weights.
+        reg_alpha = float(os.getenv("DFATOOL_LGBM_REG_ALPHA", "0."))
+
+        # reg_lambda : float, optional (default=0.)
+        # L2 regularization term on weights.
+        reg_lambda = float(os.getenv("DFATOOL_LGBM_REG_LAMBDA", "0."))
+
+        fit_parameters, self.categorical_to_index, self.ignore_index = param_to_ndarray(
+            param_values,
+            with_nan=False,
+            categorical_to_scalar=self.categorical_to_scalar,
+        )
+        if fit_parameters.shape[1] == 0:
+            logger.warning(
+                f"Cannot run LightGBM due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape is {fit_parameters.shape}"
+            )
+            self.fit_success = False
+            return self
+
+        import dfatool.lightgbm as lightgbm
+
+        lgbr = lightgbm.LGBMRegressor(
+            boosting_type=boosting_type,
+            n_estimators=n_estimators,
+            max_depth=max_depth,
+            num_leaves=num_leaves,
+            subsample=subsample,
+            learning_rate=learning_rate,
+            min_split_gain=min_split_gain,
+            min_child_samples=min_child_samples,
+            reg_alpha=reg_alpha,
+            reg_lambda=reg_lambda,
+        )
+        lgbr.fit(fit_parameters, data)
+        self.fit_success = True
+        self.regressor = lgbr
+        self._build_feature_names()
+
+        return self
+
+    def to_json(self, internal=False, **kwargs):
+        forest = self.regressor.booster_.dump_model()["tree_info"]
+        if internal:
+            return forest
+        return list(
+            map(
+                lambda tree: self._model_to_json(tree["tree_structure"], **kwargs),
+                forest,
+            )
+        )
+
+    def _model_to_json(self, tree, **kwargs):
+        ret = dict()
+        if "left_child" in tree:
+            assert "right_child" in tree
+            assert tree["decision_type"] == "<="
+            return {
+                "type": "scalarSplit",
+                "paramName": self.feature_names[tree["split_feature"]],
+                "threshold": tree["threshold"],
+                "value": None,
+                "left": self._model_to_json(tree["left_child"], **kwargs),
+                "right": self._model_to_json(tree["right_child"], **kwargs),
+            }
+        else:
+            return {
+                "type": "static",
+                "value": tree["leaf_value"],
+            }
+
+    def get_number_of_nodes(self):
+        return sum(
+            map(
+                lambda t: self._get_number_of_nodes(t["tree_structure"]),
+                self.to_json(internal=True),
+            )
+        )
+
+    def _get_number_of_nodes(self, data):
+        ret = 1
+        if "left_child" in data:
+            ret += self._get_number_of_nodes(data["left_child"])
+        if "right_child" in data:
+            ret += self._get_number_of_nodes(data["right_child"])
+        return ret
+
+    def get_number_of_leaves(self):
+        return sum(map(lambda t: t["num_leaves"], self.to_json(internal=True)))
+
+    def get_max_depth(self):
+        return max(
+            map(
+                lambda t: self._get_max_depth(t["tree_structure"]),
+                self.to_json(internal=True),
+            )
+        )
+
+    def _get_max_depth(self, data):
+        ret = [0]
+        if "left_child" in data:
+            ret.append(self._get_max_depth(data["left_child"]))
+        if "right_child" in data:
+            ret.append(self._get_max_depth(data["right_child"]))
+        return 1 + max(ret)
+
+    def get_complexity_score(self):
+        return self.get_number_of_nodes()
+
+    def hyper_to_dref(self):
+        return {
+            "lgbm/boosting type": self.regressor.boosting_type,
+            "lgbm/n estimators": self.regressor.n_estimators,
+            "lgbm/max depth": self.regressor.max_depth == -1
+            and "infty"
+            or self.regressor.max_depth,
+            "lgbm/max leaves": self.regressor.num_leaves,
+            "lgbm/subsample": self.regressor.subsample,
+            "lgbm/learning rate": self.regressor.learning_rate,
+            "lgbm/min split gain": self.regressor.min_split_gain,
+            "lgbm/min child samples": self.regressor.min_child_samples,
+            "lgbm/alpha": self.regressor.reg_alpha,
+            "lgbm/lambda": self.regressor.reg_lambda,
+        }
+
+
 class XGBoostFunction(SKLearnRegressionFunction):
 
     def fit(self, param_values, data):
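For reference, `LightGBMFunction.to_json()` above exports every boosted tree as nested dicts of `scalarSplit` and `static` nodes. A hedged sketch of how one exported tree could be evaluated for a single parameter assignment; the `eval_tree` helper and the example tree are illustrative, not part of dfatool:

```python
def eval_tree(node, params):
    """Walk one exported tree; params maps parameter names to numeric values."""
    if node["type"] == "static":
        return node["value"]
    # scalarSplit nodes rely on the "<=" decision type asserted in _model_to_json()
    if params[node["paramName"]] <= node["threshold"]:
        return eval_tree(node["left"], params)
    return eval_tree(node["right"], params)


# Hypothetical tree in the exported format. A full prediction would sum the
# contributions of all trees returned by to_json().
tree = {
    "type": "scalarSplit",
    "paramName": "data_length",
    "threshold": 128,
    "value": None,
    "left": {"type": "static", "value": 42.0},
    "right": {"type": "static", "value": 360.5},
}
print(eval_tree(tree, {"data_length": 64}))  # -> 42.0
```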
diff --git a/lib/model.py b/lib/model.py
index ffe1fb7..5770218 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -313,10 +313,12 @@ class AnalyticModel:
                         self.attr_by_name[name][attr].build_lmt()
                     elif model_type == "symreg":
                         self.attr_by_name[name][attr].build_symreg()
+                    elif model_type == "lgbm":
+                        self.attr_by_name[name][attr].build_lgbm()
                     elif model_type == "xgb":
                         self.attr_by_name[name][attr].build_xgb()
                     else:
-                        logger.error("build_fitted: unknown model type: {model_type}")
+                        logger.error(f"build_fitted: unknown model type: {model_type}")
         elif self.force_tree:
             for name in self.names:
                 for attr in self.by_name[name]["attributes"]:
diff --git a/lib/parameters.py b/lib/parameters.py
index 0b0da81..8c7c9cb 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -990,6 +990,24 @@ class ModelAttribute:
             )
             return False
 
+    def build_lgbm(self):
+        mf = df.LightGBMFunction(
+            np.mean(self.data),
+            n_samples=len(self.data),
+            param_names=self.param_names,
+            arg_count=self.arg_count,
+        ).fit(self.param_values, self.data)
+
+        if mf.fit_success:
+            self.model_function = mf
+            return True
+        else:
+            logger.warning(f"LightGBM generation for {self.name} {self.attr} failed")
+            self.model_function = df.StaticFunction(
+                np.mean(self.data), n_samples=len(self.data)
+            )
+            return False
+
     def build_xgb(self):
         mf = df.XGBoostFunction(
             np.mean(self.data),
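Taken together, `DFATOOL_MODEL=lgbm` makes `AnalyticModel` call the new `ModelAttribute.build_lgbm()`, which fits a `LightGBMFunction` and falls back to a `StaticFunction` when fitting is not possible. A hedged sketch of driving the new class directly with made-up training data; the constructor arguments mirror `build_lgbm()` above, while the import path and the data are assumptions:

```python
import numpy as np
import dfatool.functions as df  # assumed import path for the module shown in this diff

# Made-up observations: one parameter ("n") and one measured attribute per sample.
param_values = [[1], [2], [4], [8], [16], [32]]
data = np.array([10.2, 19.8, 41.0, 80.5, 161.0, 320.9])

mf = df.LightGBMFunction(
    np.mean(data),
    n_samples=len(data),
    param_names=["n"],
    arg_count=0,
).fit(param_values, data)

if mf.fit_success:
    # Complexity metrics provided by the new class.
    print(mf.get_number_of_nodes(), mf.get_number_of_leaves(), mf.get_max_depth())
```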