author     Birte Kristina Friesel <birte.friesel@uos.de>  2024-03-08 09:43:56 +0100
committer  Birte Kristina Friesel <birte.friesel@uos.de>  2024-03-08 09:43:56 +0100
commit     b09b01fe74eeb446fb14f4449f927cf6130cf1db (patch)
tree       b163b36fe37ea0d83178fe3fdaf8b06bd6c6ad6e
parent     22e116e51462aa98321c88f375dabf973b0ab2c2 (diff)
SKLearnRegressionFunction: export ndarray and pre-proc paramcount to dref
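
In short: SKLearnRegressionFunction now records how many parameters the raw ndarray contains (paramcount_ndarray) and, when mutual-information preprocessing drops some of them, how many remain (paramcount_preprocessed). A new base-class hyper_to_dref() exports both counts, and the subclasses (CARTFunction, LMTFunction, LightGBMFunction, XGBoostFunction) now merge their own hyper-parameters into the inherited dict instead of returning a fresh one. A minimal runnable sketch of the resulting behaviour follows; "Base" and "Cart" are stand-ins for the real classes, and all numeric values are illustrative, not taken from dfatool:

    # Sketch of the inheritance pattern introduced by this commit.
    class Base:
        def __init__(self):
            self.paramcount_ndarray = None
            self.paramcount_preprocessed = None

        def hyper_to_dref(self):
            ret = {"paramcount/ndarray": self.paramcount_ndarray}
            if self.paramcount_preprocessed is not None:
                ret["paramcount/preprocessed"] = self.paramcount_preprocessed
            return ret

    class Cart(Base):
        def hyper_to_dref(self):
            hyper = super().hyper_to_dref()
            hyper.update({"cart/max depth": 8})  # illustrative hyper-parameter
            return hyper

    c = Cart()
    c.paramcount_ndarray = 7       # set in _preprocess_parameters
    c.paramcount_preprocessed = 3  # set in _preprocess_parameters_mi
    print(c.hyper_to_dref())
    # {'paramcount/ndarray': 7, 'paramcount/preprocessed': 3, 'cart/max depth': 8}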
-rw-r--r--  lib/functions.py  118
1 file changed, 73 insertions(+), 45 deletions(-)
diff --git a/lib/functions.py b/lib/functions.py
index 47f2983..be4f56b 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -614,6 +614,8 @@ class SKLearnRegressionFunction(ModelFunction):
             int(os.getenv("DFATOOL_PARAM_CATEGORICAL_TO_SCALAR", "0"))
         )
         self.fit_success = None
+        self.paramcount_ndarray = None
+        self.paramcount_preprocessed = None
 
     def _check_fit_param(self, fit_parameters, name, step):
         if fit_parameters.shape[1] == 0:
@@ -623,6 +625,7 @@ class SKLearnRegressionFunction(ModelFunction):
         return True
 
     def _preprocess_parameters(self, fit_parameters, data):
+        self.paramcount_ndarray = len(fit_parameters[0])
         if dfatool_preproc_relevance_method == "mi":
             return self._preprocess_parameters_mi(fit_parameters, data)
         return fit_parameters
@@ -650,11 +653,20 @@ class SKLearnRegressionFunction(ModelFunction):
         ret = list()
         for param_tuple in fit_parameters:
             ret.append(param_tuple[tt])
+        self.paramcount_preprocessed = len(ret[0])
         logger.debug(
-            f"information gain: in {len(fit_parameters[0])} parameters -> out {len(ret[0])} parameters"
+            f"information gain: in {self.paramcount_ndarray} parameters -> out {self.paramcount_preprocessed} parameters"
         )
         return np.asarray(ret)
 
+    def hyper_to_dref(self):
+        ret = {
+            "paramcount/ndarray": self.paramcount_ndarray,
+        }
+        if self.paramcount_preprocessed is not None:
+            ret["paramcount/preprocessed"] = self.paramcount_preprocessed
+        return ret
+
     def _build_feature_names(self):
         # SKLearnRegressionFunction descendants use self.param_names \ self.ignore_index as features.
         # Thus, model feature indexes ≠ self.param_names indexes.
@@ -833,15 +845,19 @@ class CARTFunction(SKLearnRegressionFunction):
         return ret
 
     def hyper_to_dref(self):
-        return {
-            "cart/max depth": self.regressor.max_depth or "infty",
-            "cart/min samples split": self.regressor.min_samples_split,
-            "cart/min samples leaf": self.regressor.min_samples_leaf,
-            "cart/min impurity decrease": self.regressor.min_impurity_decrease,
-            "cart/max leaf nodes": self.regressor.max_leaf_nodes or "infty",
-            "cart/criterion": self.regressor.criterion,
-            "cart/splitter": self.regressor.splitter,
-        }
+        hyper = super().hyper_to_dref()
+        hyper.update(
+            {
+                "cart/max depth": self.regressor.max_depth or "infty",
+                "cart/min samples split": self.regressor.min_samples_split,
+                "cart/min samples leaf": self.regressor.min_samples_leaf,
+                "cart/min impurity decrease": self.regressor.min_impurity_decrease,
+                "cart/max leaf nodes": self.regressor.max_leaf_nodes or "infty",
+                "cart/criterion": self.regressor.criterion,
+                "cart/splitter": self.regressor.splitter,
+            }
+        )
+        return hyper
 
     # recursive function for all nodes:
     def recurse_(self, tree, node_id, depth=0):
@@ -1009,13 +1025,17 @@ class LMTFunction(SKLearnRegressionFunction):
         return ret
 
     def hyper_to_dref(self):
-        return {
-            "lmt/max depth": self.regressor.max_depth,
-            "lmt/max bins": self.regressor.max_bins,
-            "lmt/min samples split": self.regressor.min_samples_split,
-            "lmt/min samples leaf": self.regressor.min_samples_leaf,
-            "lmt/criterion": self.regressor.criterion,
-        }
+        hyper = super().hyper_to_dref()
+        hyper.update(
+            {
+                "lmt/max depth": self.regressor.max_depth,
+                "lmt/max bins": self.regressor.max_bins,
+                "lmt/min samples split": self.regressor.min_samples_split,
+                "lmt/min samples leaf": self.regressor.min_samples_leaf,
+                "lmt/criterion": self.regressor.criterion,
+            }
+        )
+        return hyper
 
     def recurse_(self, node_hash, node_index):
         node = node_hash[node_index]
@@ -1198,20 +1218,24 @@ class LightGBMFunction(SKLearnRegressionFunction):
         return self.get_number_of_nodes()
 
     def hyper_to_dref(self):
-        return {
-            "lgbm/boosting type": self.regressor.boosting_type,
-            "lgbm/n estimators": self.regressor.n_estimators,
-            "lgbm/max depth": self.regressor.max_depth == -1
-            and "infty"
-            or self.regressor.max_depth,
-            "lgbm/max leaves": self.regressor.num_leaves,
-            "lgbm/subsample": self.regressor.subsample,
-            "lgbm/learning rate": self.regressor.learning_rate,
-            "lgbm/min split gain": self.regressor.min_split_gain,
-            "lgbm/min child samples": self.regressor.min_child_samples,
-            "lgbm/alpha": self.regressor.reg_alpha,
-            "lgbm/lambda": self.regressor.reg_lambda,
-        }
+        hyper = super().hyper_to_dref()
+        hyper.update(
+            {
+                "lgbm/boosting type": self.regressor.boosting_type,
+                "lgbm/n estimators": self.regressor.n_estimators,
+                "lgbm/max depth": self.regressor.max_depth == -1
+                and "infty"
+                or self.regressor.max_depth,
+                "lgbm/max leaves": self.regressor.num_leaves,
+                "lgbm/subsample": self.regressor.subsample,
+                "lgbm/learning rate": self.regressor.learning_rate,
+                "lgbm/min split gain": self.regressor.min_split_gain,
+                "lgbm/min child samples": self.regressor.min_child_samples,
+                "lgbm/alpha": self.regressor.reg_alpha,
+                "lgbm/lambda": self.regressor.reg_lambda,
+            }
+        )
+        return hyper
 
 
 class XGBoostFunction(SKLearnRegressionFunction):
@@ -1384,20 +1408,24 @@ class XGBoostFunction(SKLearnRegressionFunction):
         return self.get_number_of_nodes()
 
     def hyper_to_dref(self):
-        return {
-            "xgb/n estimators": self.regressor.n_estimators,
-            "xgb/max depth": self.regressor.max_depth == 0
-            and "infty"
-            or self.regressor.max_depth,
-            "xgb/max leaves": self.regressor.max_leaves == 0
-            and "infty"
-            or self.regressor.max_leaves,
-            "xgb/subsample": self.regressor.subsample,
-            "xgb/eta": self.regressor.learning_rate,
-            "xgb/gamma": self.regressor.gamma,
-            "xgb/alpha": self.regressor.reg_alpha,
-            "xgb/lambda": self.regressor.reg_lambda,
-        }
+        hyper = super().hyper_to_dref()
+        hyper.update(
+            {
+                "xgb/n estimators": self.regressor.n_estimators,
+                "xgb/max depth": self.regressor.max_depth == 0
+                and "infty"
+                or self.regressor.max_depth,
+                "xgb/max leaves": self.regressor.max_leaves == 0
+                and "infty"
+                or self.regressor.max_leaves,
+                "xgb/subsample": self.regressor.subsample,
+                "xgb/eta": self.regressor.learning_rate,
+                "xgb/gamma": self.regressor.gamma,
+                "xgb/alpha": self.regressor.reg_alpha,
+                "xgb/lambda": self.regressor.reg_lambda,
+            }
+        )
+        return hyper
 
 
 class SymbolicRegressionFunction(SKLearnRegressionFunction):
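
For context, the exported key/value pairs are typically serialized for the LaTeX dataref package. A hedged sketch of such a consumer, assuming dataref's \drefset{key}{value} macro; the serialization loop below is not part of this commit:

    # Hypothetical consumer: write hyper_to_dref() output as dataref macros.
    hyper = {"paramcount/ndarray": 7, "cart/max depth": 8}  # example dict
    with open("hyper.tex", "w") as f:
        for key, value in hyper.items():
            f.write(f"\\drefset{{{key}}}{{{value}}}\n")
    # hyper.tex then contains lines such as: \drefset{paramcount/ndarray}{7}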