author     Daniel Friesel <daniel.friesel@uos.de>  2021-12-23 15:33:45 +0100
committer  Daniel Friesel <daniel.friesel@uos.de>  2021-12-23 15:33:45 +0100
commit     3473ff227b9c085b5333147f0c2f6cb1e431c875 (patch)
tree       ab782568c0adea3e4b00be786299525f8b3e7692
parent     a9d538afbd9d766a35093e851fbe5c12112fb2eb (diff)
model: add sklearn CART support (CART with scalar features)
-rw-r--r--  lib/functions.py   30
-rw-r--r--  lib/model.py       16
-rw-r--r--  lib/parameters.py  48
3 files changed, 91 insertions, 3 deletions
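
The new mode is opt-in via environment variables, which are read in lib/model.py and lib/parameters.py below. A minimal usage sketch, assuming the variables are set before model construction; only the variable names and defaults are taken from this commit, the surrounding workflow is an assumption:

import os

# Assumption: set these before instantiating / fitting the model.
os.environ["DFATOOL_DTREE_SKLEARN_CART"] = "1"  # default "0": use dfatool's own decision trees
os.environ["DFATOOL_CART_MAX_DEPTH"] = "8"      # default "0": no depth limit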
diff --git a/lib/functions.py b/lib/functions.py
index b1477da..320e8ed 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -409,6 +409,36 @@ class SubstateFunction(ModelFunction):
return "SubstateFunction"
+class SKLearnRegressionFunction(ModelFunction):
+ def __init__(self, value, regressor, ignore_index):
+ super().__init__(value)
+ self.regressor = regressor
+ self.ignore_index = ignore_index
+
+ def is_predictable(self, param_list=None):
+ """
+ Return whether the model function can be evaluated on the given parameter values.
+
+        For an SKLearnRegressionFunction, this is always the case (i.e., this function always returns true).
+ """
+ return True
+
+ def eval(self, param_list=None):
+ """
+ Evaluate model function with specified param/arg values.
+
+        If param_list is None, this is the static (mean) value observed during training.
+        Otherwise, ignored parameters are dropped and the regressor's prediction is returned.
+        """
+ if param_list is None:
+ return self.value
+ actual_param_list = list()
+ for i, param in enumerate(param_list):
+ if not self.ignore_index[i]:
+ actual_param_list.append(param)
+ return self.regressor.predict([actual_param_list])
+
+
class AnalyticFunction(ModelFunction):
"""
A multi-dimensional model function, generated from a string, which can be optimized using regression.
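
Note on eval above: parameters whose ignore_index entry is true (non-numeric or None-bearing ones, see param_to_ndarray in lib/parameters.py below) are dropped before querying the regressor. A minimal sketch with a stand-in regressor and made-up training data:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Stand-in regressor trained on the two numeric parameters only.
regressor = DecisionTreeRegressor()
regressor.fit(np.array([[1, 10], [2, 20]]), np.array([5.0, 9.0]))

# The third parameter is non-numeric, so it is flagged as ignored.
ignore_index = {0: False, 1: False, 2: True}
param_list = [2, 20, "enum-value"]
actual_param_list = [p for i, p in enumerate(param_list) if not ignore_index[i]]
print(regressor.predict([actual_param_list]))  # -> array([9.])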
diff --git a/lib/model.py b/lib/model.py
index 9133196..4f5f60f 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -157,6 +157,9 @@ class AnalyticModel:
with_nonbinary_nodes = bool(
int(os.getenv("DFATOOL_DTREE_NONBINARY_NODES", "1"))
)
+ with_sklearn_cart = bool(
+ int(os.getenv("DFATOOL_DTREE_SKLEARN_CART", "0"))
+ )
loss_ignore_scalar = bool(
int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
)
@@ -169,6 +172,7 @@ class AnalyticModel:
threshold=threshold,
with_function_leaves=with_function_leaves,
with_nonbinary_nodes=with_nonbinary_nodes,
+ with_sklearn_cart=with_sklearn_cart,
loss_ignore_scalar=loss_ignore_scalar,
)
self.fit_done = True
@@ -317,6 +321,12 @@ class AnalyticModel:
with_nonbinary_nodes = bool(
int(os.getenv("DFATOOL_DTREE_NONBINARY_NODES", "1"))
)
+ with_sklearn_cart = bool(
+ int(os.getenv("DFATOOL_DTREE_SKLEARN_CART", "0"))
+ )
+ loss_ignore_scalar = bool(
+ int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
+ )
threshold = self.attr_by_name[name][attr].stats.std_param_lut
if (
self.dtree_max_std
@@ -324,9 +334,6 @@ class AnalyticModel:
and attr in self.dtree_max_std[name]
):
threshold = self.dtree_max_std[name][attr]
- loss_ignore_scalar = bool(
- int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
- )
logger.debug(
f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
)
@@ -336,6 +343,7 @@ class AnalyticModel:
threshold=threshold,
with_function_leaves=with_function_leaves,
with_nonbinary_nodes=with_nonbinary_nodes,
+ with_sklearn_cart=with_sklearn_cart,
loss_ignore_scalar=loss_ignore_scalar,
)
else:
@@ -414,6 +422,7 @@ class AnalyticModel:
threshold=100,
with_function_leaves=False,
with_nonbinary_nodes=True,
+ with_sklearn_cart=False,
loss_ignore_scalar=False,
):
@@ -435,6 +444,7 @@ class AnalyticModel:
self.by_name[name][attribute],
with_function_leaves=with_function_leaves,
with_nonbinary_nodes=with_nonbinary_nodes,
+ with_sklearn_cart=with_sklearn_cart,
loss_ignore_scalar=loss_ignore_scalar,
threshold=threshold,
)
diff --git a/lib/parameters.py b/lib/parameters.py
index e199153..9cfe145 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -38,6 +38,37 @@ def distinct_param_values(param_tuples):
return distinct_values
+def param_to_ndarray(param_tuples, with_nan=True):
+ has_nan = dict()
+ has_non_numeric = dict()
+
+ for param_tuple in param_tuples:
+ for i, param in enumerate(param_tuple):
+ if not is_numeric(param):
+ if param is None:
+ has_nan[i] = True
+ else:
+ has_non_numeric[i] = True
+
+ ignore_index = dict()
+ for i in range(len(param_tuples[0])):
+ if has_non_numeric.get(i, False):
+ ignore_index[i] = True
+ elif not with_nan and has_nan.get(i, False):
+ ignore_index[i] = True
+ else:
+ ignore_index[i] = False
+
+ ret_tuples = list()
+ for param_tuple in param_tuples:
+ ret_tuple = list()
+ for i, param in enumerate(param_tuple):
+ if not ignore_index[i]:
+ ret_tuple.append(param)
+ ret_tuples.append(ret_tuple)
+ return np.asarray(ret_tuples), ignore_index
+
+
def _depends_on_param(corr_param, std_param, std_lut):
# if self.use_corrcoef:
if False:
@@ -843,6 +874,7 @@ class ModelAttribute:
data,
with_function_leaves=False,
with_nonbinary_nodes=True,
+ with_sklearn_cart=False,
loss_ignore_scalar=False,
threshold=100,
):
@@ -853,12 +885,28 @@ class ModelAttribute:
:param data: Measurements. [data 1, data 2, data 3, ...]
:param with_function_leaves: Use fitted function sets to generate function leaves for scalar parameters
:param with_nonbinary_nodes: Allow non-binary nodes for enum and scalar parameters (i.e., nodes with more than two children)
+        :param with_sklearn_cart: Use the `sklearn.tree.DecisionTreeRegressor` CART implementation for tree generation. It does not support categorical (enum)
+            or sparse parameters; both are ignored during fitting. All other tree-generation options are ignored as well.
:param loss_ignore_scalar: Ignore scalar parameters when computing the loss for split candidates. Only sensible if with_function_leaves is enabled.
:param threshold: Return a StaticFunction leaf node if std(data) < threshold. Default 100.
:returns: SplitFunction or StaticFunction
"""
+ if with_sklearn_cart:
+ from sklearn.tree import DecisionTreeRegressor
+
+ max_depth = int(os.getenv("DFATOOL_CART_MAX_DEPTH", "0"))
+ if max_depth == 0:
+ max_depth = None
+ cart = DecisionTreeRegressor(max_depth=max_depth)
+ fit_parameters, ignore_index = param_to_ndarray(parameters, with_nan=False)
+ cart.fit(fit_parameters, data)
+ self.model_function = df.SKLearnRegressionFunction(
+ np.mean(data), cart, ignore_index
+ )
+ return
+
if loss_ignore_scalar and not with_function_leaves:
logger.warning(
"build_dtree called with loss_ignore_scalar=True, with_function_leaves=False. This does not make sense."