author    Birte Kristina Friesel <birte.friesel@uos.de>    2024-03-07 15:07:53 +0100
committer Birte Kristina Friesel <birte.friesel@uos.de>    2024-03-07 15:07:53 +0100
commit    52f9a8ee5808db71412ddc3429d2f63f947b1d1c (patch)
tree      c20ea2b7ed877364ce08dc4234eea4ed5444dd84
parent    d26f7a51e17911cc5a11749df27d69cb095ced4c (diff)
feature pre-processing via information gain
-rw-r--r--  README.md         |   2
-rw-r--r--  lib/functions.py  | 117
2 files changed, 112 insertions, 7 deletions
diff --git a/README.md b/README.md
index 0fbbd16..b60d265 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,8 @@ The following variables may be set to alter the behaviour of dfatool components.
| `DFATOOL_DRIFT_COMPENSATION_PENALTY` | 0 .. 100 (default: majority vote over several penalties) | Specify penalty for ruptures.py PELT changepoint detection |
| `DFATOOL_MODEL` | cart, decart, fol, lgbm, lmt, **rmt**, symreg, uls, xgb | Modeling method. See below for method-specific configuration options. |
| `DFATOOL_RMT_SUBMODEL` | cart, fol, static, symreg, **uls** | Modeling method for RMT leaf functions. |
+| `DFATOOL_PREPROCESSING_RELEVANCE_METHOD` | **none**, mi | Ignore parameters deemed irrelevant by the specified heuristic before passing them on to `DFATOOL_MODEL`. |
+| `DFATOOL_PREPROCESSING_RELEVANCE_THRESHOLD` | .. **0.1** .. | Threshold for the relevance heuristic; parameters whose score falls below it are ignored. |
| `DFATOOL_CART_MAX_DEPTH` | **0** .. *n* | maximum depth for sklearn CART. Default (0): unlimited. |
| `DFATOOL_LGBM_BOOSTER` | **gbdt**, dart, rf | Boosting type. |
| `DFATOOL_LGBM_N_ESTIMATORS` | .., **100**, .. | Number of estimators. |
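The two new variables are read via `os.getenv` when `lib/functions.py` is imported (see the hunk below). A minimal sketch of enabling the mutual-information filter from a Python driver script, assuming the package is importable as `dfatool` (as the commit's own imports suggest); the variable names come from this commit, everything else is illustrative:

```python
import os

# Illustrative only: both variables must be set before dfatool.functions
# is imported, because the module reads them at import time.
os.environ["DFATOOL_PREPROCESSING_RELEVANCE_METHOD"] = "mi"
os.environ["DFATOOL_PREPROCESSING_RELEVANCE_THRESHOLD"] = "0.2"

import dfatool.functions  # noqa: E402  (import after configuring the environment)
```

With `DFATOOL_PREPROCESSING_RELEVANCE_METHOD` left unset (the **none** default), the pre-processing step is skipped entirely.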
diff --git a/lib/functions.py b/lib/functions.py
index efed2f6..88ecb76 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -15,6 +15,16 @@ from .utils import is_numeric, param_to_ndarray
logger = logging.getLogger(__name__)
+dfatool_preproc_relevance_method = os.getenv(
+ "DFATOOL_PREPROCESSING_RELEVANCE_METHOD", None
+)
+dfatool_preproc_relevance_threshold = float(
+ os.getenv("DFATOOL_PREPROCESSING_RELEVANCE_THRESHOLD", "0.1")
+)
+
+if dfatool_preproc_relevance_method == "mi":
+ import sklearn.feature_selection
+
def powerset(iterable):
"""
@@ -605,6 +615,39 @@ class SKLearnRegressionFunction(ModelFunction):
)
self.fit_success = None
+ def _preprocess_parameters(self, fit_parameters, data):
+ if dfatool_preproc_relevance_method == "mi":
+ return self._preprocess_parameters_mi(fit_parameters, data)
+ return fit_parameters
+
+ def _preprocess_parameters_mi(self, fit_parameters, data):
+ fit_param_to_param = dict()
+ j = 0
+ for i in range(len(self.param_names_and_args)):
+ if not self.ignore_index[i]:
+ fit_param_to_param[j] = i
+ j += 1
+ try:
+ mutual_information = sklearn.feature_selection.mutual_info_regression(
+ fit_parameters, data
+ )
+ except ValueError as e:
+ logger.error(f"mutual_info_regression failed: {e}")
+ return fit_parameters
+
+ tt = list()
+ for i, information_gain in enumerate(mutual_information):
+ tt.append(information_gain >= dfatool_preproc_relevance_threshold)
+ self.ignore_index[fit_param_to_param[i]] = not tt[i]
+
+ ret = list()
+ for param_tuple in fit_parameters:
+ ret.append(param_tuple[tt])
+ logger.debug(
+ f"information gain: in {len(fit_parameters[0])} parameters -> out {len(ret[0])} parameters"
+ )
+ return np.asarray(ret)
+
def _build_feature_names(self):
# SKLearnRegressionFunction descendants use self.param_names \ self.ignore_index as features.
# Thus, model feature indexes ≠ self.param_names indexes.
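The boolean-mask logic in `_preprocess_parameters_mi` amounts to the following standalone sketch (synthetic data, not part of the commit): estimate per-parameter mutual information with the observations, keep only the columns that reach the threshold, and drop the rest.

```python
import numpy as np
from sklearn.feature_selection import mutual_info_regression

# Synthetic example: three parameters, only the first influences the data.
rng = np.random.default_rng(0)
fit_parameters = rng.uniform(0, 10, size=(200, 3))
data = 3 * fit_parameters[:, 0] + rng.normal(0, 0.1, size=200)

mi = mutual_info_regression(fit_parameters, data)
keep = mi >= 0.1  # DFATOOL_PREPROCESSING_RELEVANCE_THRESHOLD default
filtered = fit_parameters[:, keep]

print(mi.round(2), filtered.shape)  # roughly [high, ~0, ~0] (200, 1)
```

In the commit itself the keep/drop decision is additionally written back to `self.ignore_index`, so that feature names and parameter indexes remain aligned for the downstream model.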
@@ -744,7 +787,16 @@ class CARTFunction(SKLearnRegressionFunction):
if fit_parameters.shape[1] == 0:
logger.warning(
- f"Cannot generate CART due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape is {fit_parameters.shape}"
+ f"Cannot generate CART due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape after param_to_ndarray is {fit_parameters.shape}"
+ )
+ self.fit_success = False
+ return self
+
+ fit_parameters = self._preprocess_parameters(fit_parameters, data)
+
+ if fit_parameters.shape[1] == 0:
+ logger.warning(
+ f"Cannot generate CART due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape after pre-processing is {fit_parameters.shape}"
)
self.fit_success = False
return self
@@ -917,6 +969,15 @@ class LMTFunction(SKLearnRegressionFunction):
self.fit_success = False
return self
+ fit_parameters = self._preprocess_parameters(fit_parameters, data)
+
+ if fit_parameters.shape[1] == 0:
+ logger.warning(
+ f"Cannot generate LMT due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape after pre-processing is {fit_parameters.shape}"
+ )
+ self.fit_success = False
+ return self
+
logger.debug("Fitting LMT ...")
try:
lmt.fit(fit_parameters, data)
@@ -1052,6 +1113,15 @@ class LightGBMFunction(SKLearnRegressionFunction):
self.fit_success = False
return self
+ fit_parameters = self._preprocess_parameters(fit_parameters, data)
+
+ if fit_parameters.shape[1] == 0:
+ logger.warning(
+ f"Cannot generate LightGBM due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape after pre-processing is {fit_parameters.shape}"
+ )
+ self.fit_success = False
+ return self
+
import dfatool.lightgbm as lightgbm
lightgbm.register_logger(logger)
@@ -1232,6 +1302,15 @@ class XGBoostFunction(SKLearnRegressionFunction):
self.fit_success = False
return self
+ fit_parameters = self._preprocess_parameters(fit_parameters, data)
+
+ if fit_parameters.shape[1] == 0:
+ logger.warning(
+ f"Cannot run XGBoost due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape after pre-processing is {fit_parameters.shape}"
+ )
+ self.fit_success = False
+ return self
+
import xgboost
xgb = xgboost.XGBRegressor(
@@ -1423,6 +1502,15 @@ class SymbolicRegressionFunction(SKLearnRegressionFunction):
self.fit_success = False
return self
+ fit_parameters = self._preprocess_parameters(fit_parameters, data)
+
+ if fit_parameters.shape[1] == 0:
+ logger.warning(
+ f"Cannot use Symbolic Regression due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape after pre-processing is {fit_parameters.shape}"
+ )
+ self.fit_success = False
+ return self
+
from dfatool.gplearn.genetic import SymbolicRegressor
self._build_feature_names()
@@ -1475,14 +1563,29 @@ class FOLFunction(ModelFunction):
int(os.getenv("DFATOOL_PARAM_CATEGORICAL_TO_SCALAR", "0"))
)
second_order = int(os.getenv("DFATOOL_FOL_SECOND_ORDER", "0"))
- fit_parameters, categorical_to_index, ignore_index = param_to_ndarray(
+ fit_parameters, self.categorical_to_index, self.ignore_index = param_to_ndarray(
param_values,
with_nan=False,
categorical_to_scalar=self.categorical_to_scalar,
ignore_indexes=ignore_param_indexes,
)
- self.categorical_to_index = categorical_to_index
- self.ignore_index = ignore_index
+
+ if fit_parameters.shape[1] == 0:
+ logger.debug(
+ f"Cannot run FOL due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape is {fit_parameters.shape}"
+ )
+ self.fit_success = False
+ return self
+
+ fit_parameters = self._preprocess_parameters(fit_parameters, data)
+
+ if fit_parameters.shape[1] == 0:
+ logger.warning(
+ f"Cannot run FOL due to lack of parameters: parameter shape is {np.array(param_values).shape}, fit_parameter shape after pre-processing is {fit_parameters.shape}"
+ )
+ self.fit_success = False
+ return self
+
fit_parameters = fit_parameters.swapaxes(0, 1)
if second_order:
@@ -1499,7 +1602,7 @@ class FOLFunction(ModelFunction):
funbuf = "regression_arg(0)"
num_vars = 1
for j, param_name in enumerate(self.parameter_names):
- if ignore_index[j]:
+ if self.ignore_index[j]:
continue
else:
if second_order == 2:
@@ -1508,7 +1611,7 @@ class FOLFunction(ModelFunction):
)
num_vars += 1
for k in range(j + 1, len(self.parameter_names)):
- if ignore_index[j]:
+ if self.ignore_index[j]:
continue
funbuf += f" + regression_arg({num_vars}) * parameter({param_name}) * parameter({self.parameter_names[k]})"
num_vars += 1
@@ -1520,7 +1623,7 @@ class FOLFunction(ModelFunction):
funbuf = "regression_arg(0)"
i = 1
for j, param_name in enumerate(self.parameter_names):
- if ignore_index[j]:
+ if self.ignore_index[j]:
continue
else:
funbuf += f" + regression_arg({i}) * parameter({param_name})"