author | Daniel Friesel <daniel.friesel@uos.de> | 2021-12-23 15:33:45 +0100
committer | Daniel Friesel <daniel.friesel@uos.de> | 2021-12-23 15:33:45 +0100
commit | 3473ff227b9c085b5333147f0c2f6cb1e431c875 (patch)
tree | ab782568c0adea3e4b00be786299525f8b3e7692
parent | a9d538afbd9d766a35093e851fbe5c12112fb2eb (diff)
model: add sklearn CART support (CART with scalar features)
-rw-r--r-- | lib/functions.py | 30
-rw-r--r-- | lib/model.py | 16
-rw-r--r-- | lib/parameters.py | 48
3 files changed, 91 insertions, 3 deletions
```diff
diff --git a/lib/functions.py b/lib/functions.py
index b1477da..320e8ed 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -409,6 +409,36 @@ class SubstateFunction(ModelFunction):
         return "SubstateFunction"
 
 
+class SKLearnRegressionFunction(ModelFunction):
+    def __init__(self, value, regressor, ignore_index):
+        super().__init__(value)
+        self.regressor = regressor
+        self.ignore_index = ignore_index
+
+    def is_predictable(self, param_list=None):
+        """
+        Return whether the model function can be evaluated on the given parameter values.
+
+        For an SKLearnRegressionFunction, this is always the case (i.e., this function always returns true).
+        """
+        return True
+
+    def eval(self, param_list=None):
+        """
+        Evaluate model function with specified param/arg values.
+
+        If param_list is None, this is the static value; otherwise, it is the prediction of the underlying regressor.
+
+        """
+        if param_list is None:
+            return self.value
+        actual_param_list = list()
+        for i, param in enumerate(param_list):
+            if not self.ignore_index[i]:
+                actual_param_list.append(param)
+        return self.regressor.predict([actual_param_list])
+
+
 class AnalyticFunction(ModelFunction):
     """
     A multi-dimensional model function, generated from a string, which can be optimized using regression.
diff --git a/lib/model.py b/lib/model.py
index 9133196..4f5f60f 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -157,6 +157,9 @@ class AnalyticModel:
             with_nonbinary_nodes = bool(
                 int(os.getenv("DFATOOL_DTREE_NONBINARY_NODES", "1"))
             )
+            with_sklearn_cart = bool(
+                int(os.getenv("DFATOOL_DTREE_SKLEARN_CART", "0"))
+            )
             loss_ignore_scalar = bool(
                 int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
             )
@@ -169,6 +172,7 @@ class AnalyticModel:
                 threshold=threshold,
                 with_function_leaves=with_function_leaves,
                 with_nonbinary_nodes=with_nonbinary_nodes,
+                with_sklearn_cart=with_sklearn_cart,
                 loss_ignore_scalar=loss_ignore_scalar,
             )
             self.fit_done = True
@@ -317,6 +321,12 @@ class AnalyticModel:
                     with_nonbinary_nodes = bool(
                         int(os.getenv("DFATOOL_DTREE_NONBINARY_NODES", "1"))
                     )
+                    with_sklearn_cart = bool(
+                        int(os.getenv("DFATOOL_DTREE_SKLEARN_CART", "0"))
+                    )
+                    loss_ignore_scalar = bool(
+                        int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
+                    )
                     threshold = self.attr_by_name[name][attr].stats.std_param_lut
                     if (
                         self.dtree_max_std
@@ -324,9 +334,6 @@ class AnalyticModel:
                         and attr in self.dtree_max_std[name]
                     ):
                         threshold = self.dtree_max_std[name][attr]
-                    loss_ignore_scalar = bool(
-                        int(os.getenv("DFATOOL_DTREE_LOSS_IGNORE_SCALAR", "0"))
-                    )
                     logger.debug(
                         f"build_dtree({name}, {attr}, threshold={threshold}, with_function_leaves={with_function_leaves}, with_nonbinary_nodes={with_nonbinary_nodes}, loss_ignore_scalar={loss_ignore_scalar})"
                     )
@@ -336,6 +343,7 @@ class AnalyticModel:
                         threshold=threshold,
                         with_function_leaves=with_function_leaves,
                         with_nonbinary_nodes=with_nonbinary_nodes,
+                        with_sklearn_cart=with_sklearn_cart,
                         loss_ignore_scalar=loss_ignore_scalar,
                     )
                 else:
@@ -414,6 +422,7 @@ class AnalyticModel:
         threshold=100,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        with_sklearn_cart=False,
         loss_ignore_scalar=False,
     ):
 
@@ -435,6 +444,7 @@ class AnalyticModel:
             self.by_name[name][attribute],
             with_function_leaves=with_function_leaves,
             with_nonbinary_nodes=with_nonbinary_nodes,
+            with_sklearn_cart=with_sklearn_cart,
            loss_ignore_scalar=loss_ignore_scalar,
             threshold=threshold,
         )
diff --git a/lib/parameters.py b/lib/parameters.py
index e199153..9cfe145 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -38,6 +38,37 @@ def distinct_param_values(param_tuples):
     return distinct_values
 
 
+def param_to_ndarray(param_tuples, with_nan=True):
+    has_nan = dict()
+    has_non_numeric = dict()
+
+    for param_tuple in param_tuples:
+        for i, param in enumerate(param_tuple):
+            if not is_numeric(param):
+                if param is None:
+                    has_nan[i] = True
+                else:
+                    has_non_numeric[i] = True
+
+    ignore_index = dict()
+    for i in range(len(param_tuples[0])):
+        if has_non_numeric.get(i, False):
+            ignore_index[i] = True
+        elif not with_nan and has_nan.get(i, False):
+            ignore_index[i] = True
+        else:
+            ignore_index[i] = False
+
+    ret_tuples = list()
+    for param_tuple in param_tuples:
+        ret_tuple = list()
+        for i, param in enumerate(param_tuple):
+            if not ignore_index[i]:
+                ret_tuple.append(param)
+        ret_tuples.append(ret_tuple)
+    return np.asarray(ret_tuples), ignore_index
+
+
 def _depends_on_param(corr_param, std_param, std_lut):
     # if self.use_corrcoef:
     if False:
@@ -843,6 +874,7 @@ class ModelAttribute:
         data,
         with_function_leaves=False,
         with_nonbinary_nodes=True,
+        with_sklearn_cart=False,
         loss_ignore_scalar=False,
         threshold=100,
     ):
@@ -853,12 +885,28 @@ class ModelAttribute:
         :param data: Measurements. [data 1, data 2, data 3, ...]
         :param with_function_leaves: Use fitted function sets to generate function leaves for scalar parameters
         :param with_nonbinary_nodes: Allow non-binary nodes for enum and scalar parameters (i.e., nodes with more than two children)
+        :param with_sklearn_cart: Use the `sklearn.tree.DecisionTreeRegressor` CART implementation for tree generation. It does not support categorical (enum)
+            or sparse parameters; both are ignored during fitting. All other options are ignored as well.
         :param loss_ignore_scalar: Ignore scalar parameters when computing the loss for split candidates. Only sensible if with_function_leaves is enabled.
         :param threshold: Return a StaticFunction leaf node if std(data) < threshold. Default 100.
 
         :returns: SplitFunction or StaticFunction
         """
 
+        if with_sklearn_cart:
+            from sklearn.tree import DecisionTreeRegressor
+
+            max_depth = int(os.getenv("DFATOOL_CART_MAX_DEPTH", "0"))
+            if max_depth == 0:
+                max_depth = None
+            cart = DecisionTreeRegressor(max_depth=max_depth)
+            fit_parameters, ignore_index = param_to_ndarray(parameters, with_nan=False)
+            cart.fit(fit_parameters, data)
+            self.model_function = df.SKLearnRegressionFunction(
+                np.mean(data), cart, ignore_index
+            )
+            return
+
         if loss_ignore_scalar and not with_function_leaves:
             logger.warning(
                 "build_dtree called with loss_ignore_scalar=True, with_function_leaves=False. This does not make sense."
             )
```
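The new backend is opt-in: `build_dtree` only takes the CART path when `DFATOOL_DTREE_SKLEARN_CART` is set to a non-zero value, and `DFATOOL_CART_MAX_DEPTH` optionally bounds the tree depth. A minimal sketch of the toggles, based only on the `os.getenv` defaults visible in the diff (exporting them in the shell before invoking dfatool works just as well):

```python
import os

# Opt in to the sklearn CART backend (default "0", i.e. disabled).
os.environ["DFATOOL_DTREE_SKLEARN_CART"] = "1"

# Optional depth limit for the fitted tree. The default "0" is mapped to
# max_depth=None inside build_dtree, i.e. the tree may grow without bound.
os.environ["DFATOOL_CART_MAX_DEPTH"] = "4"
```

Both variables are read at model-generation time, so they must be set before the model is fitted.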
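For illustration, the following self-contained sketch mimics what the `with_sklearn_cart` branch and `SKLearnRegressionFunction.eval` do, using only numpy and sklearn. The parameter tuples and measurements are made up, and `ignore_index` is written out by hand rather than computed by `param_to_ndarray`:

```python
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Hypothetical measurements: (numeric, enum, numeric) parameter tuples and
# one observed value per tuple.
param_tuples = [
    (1, "on", 10),
    (2, "on", 20),
    (1, "off", 10),
    (2, "off", 20),
]
data = [100.0, 210.0, 95.0, 205.0]

# param_to_ndarray(param_tuples, with_nan=False) would flag column 1 as
# non-numeric and drop it, returning a map like this alongside the
# filtered ndarray.
ignore_index = {0: False, 1: True, 2: False}
fit_parameters = np.asarray(
    [[p for i, p in enumerate(t) if not ignore_index[i]] for t in param_tuples]
)

# Mirrors the with_sklearn_cart branch in ModelAttribute.build_dtree
# (DFATOOL_CART_MAX_DEPTH unset, hence max_depth=None).
cart = DecisionTreeRegressor(max_depth=None)
cart.fit(fit_parameters, data)

# SKLearnRegressionFunction.eval applies the same column filter to incoming
# parameter tuples before calling predict. With the enum column dropped,
# (2, "on", 20) and (2, "off", 20) collapse onto the same point (2, 20),
# so the leaf predicts their mean, 207.5.
query = (2, "on", 20)
filtered = [p for i, p in enumerate(query) if not ignore_index[i]]
print(cart.predict([filtered]))  # -> [207.5]
```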