diff options
author | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-02-12 10:51:42 +0100 |
---|---|---|
committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-02-12 10:51:42 +0100 |
commit | 486690f31dfe8da33fbd0711137844424d0321eb (patch) | |
tree | 5d392998eec345c0583b2fc796a995d655b5cb44 /lib | |
parent | 152e8c6c99d1791d0dcd78c25d2a20a43f55247d (diff) |
unfuck param_names / feature_names handling
Diffstat (limited to 'lib')
-rw-r--r-- | lib/cli.py | 50 | ||||
-rw-r--r-- | lib/functions.py | 118 | ||||
-rw-r--r-- | lib/model.py | 1 | ||||
-rw-r--r-- | lib/parameters.py | 112 | ||||
-rw-r--r-- | lib/utils.py | 4 |
5 files changed, 129 insertions, 156 deletions
@@ -124,17 +124,17 @@ def print_staticinfo(prefix, info): print(f"{prefix}: {info.value}") -def print_cartinfo(prefix, info, feature_names): - _print_cartinfo(prefix, info.to_json(feature_names=feature_names), feature_names) +def print_cartinfo(prefix, info): + _print_cartinfo(prefix, info.to_json()) -def print_xgbinfo(prefix, info, feature_names): - for i, tree in enumerate(info.to_json(feature_names=feature_names)): - _print_cartinfo(prefix + f"tree{i:03d} :", tree, feature_names) +def print_xgbinfo(prefix, info): + for i, tree in enumerate(info.to_json()): + _print_cartinfo(prefix + f"tree{i:03d} :", tree) -def print_lmtinfo(prefix, info, feature_names): - _print_lmtinfo(prefix, info.to_json(feature_names=feature_names)) +def print_lmtinfo(prefix, info): + _print_lmtinfo(prefix, info.to_json()) def _print_lmtinfo(prefix, model): @@ -157,41 +157,27 @@ def _print_lmtinfo(prefix, model): print(f"{prefix}: {model_function}") -def _print_cartinfo(prefix, model, feature_names): +def _print_cartinfo(prefix, model): if model["type"] == "static": print(f"""{prefix}: {model["value"]}""") else: _print_cartinfo( f"""{prefix} {model["paramName"]}≤{model["threshold"]} """, model["left"], - feature_names, ) _print_cartinfo( f"""{prefix} {model["paramName"]}>{model["threshold"]} """, model["right"], - feature_names, ) -def print_splitinfo(param_names, info, prefix=""): +def print_splitinfo(info, prefix=""): if type(info) is df.SplitFunction: for k, v in info.child.items(): - if info.param_index < len(param_names): - param_name = param_names[info.param_index] - else: - param_name = f"arg{info.param_index - len(param_names)}" - print_splitinfo(param_names, v, f"{prefix} {param_name}={k}") + print_splitinfo(v, f"{prefix} {info.param_name}={k}") elif type(info) is df.ScalarSplitFunction: - if info.param_index < len(param_names): - param_name = param_names[info.param_index] - else: - param_name = f"arg{info.param_index - len(param_names)}" - print_splitinfo( - param_names, info.child_le, f"{prefix} {param_name}≤{info.threshold}" - ) - print_splitinfo( - param_names, info.child_gt, f"{prefix} {param_name}>{info.threshold}" - ) + print_splitinfo(info.child_le, f"{prefix} {info.param_name}≤{info.threshold}") + print_splitinfo(info.child_gt, f"{prefix} {info.param_name}>{info.threshold}") elif type(info) is df.AnalyticFunction: print_analyticinfo(prefix, info) elif type(info) is df.StaticFunction: @@ -200,7 +186,7 @@ def print_splitinfo(param_names, info, prefix=""): print(f"{prefix}: UNKNOWN {type(info)}") -def print_model(prefix, info, feature_names): +def print_model(prefix, info): if type(info) is df.StaticFunction: print_staticinfo(prefix, info) elif type(info) is df.AnalyticFunction: @@ -208,15 +194,15 @@ def print_model(prefix, info, feature_names): elif type(info) is df.FOLFunction: print_analyticinfo(prefix, info) elif type(info) is df.CARTFunction: - print_cartinfo(prefix, info, feature_names) + print_cartinfo(prefix, info) elif type(info) is df.SplitFunction: - print_splitinfo(feature_names, info, prefix) + print_splitinfo(info, prefix) elif type(info) is df.ScalarSplitFunction: - print_splitinfo(feature_names, info, prefix) + print_splitinfo(info, prefix) elif type(info) is df.LMTFunction: - print_lmtinfo(prefix, info, feature_names) + print_lmtinfo(prefix, info) elif type(info) is df.XGBoostFunction: - print_xgbinfo(prefix, info, feature_names) + print_xgbinfo(prefix, info) else: print(f"{prefix}: {type(info)} UNIMPLEMENTED") diff --git a/lib/functions.py b/lib/functions.py index 6b4406c..81a94b4 100644 --- a/lib/functions.py +++ b/lib/functions.py @@ -317,8 +317,9 @@ class StaticFunction(ModelFunction): class SplitFunction(ModelFunction): - def __init__(self, value, param_index, child, **kwargs): + def __init__(self, value, param_index, param_name, child, **kwargs): super().__init__(value, **kwargs) + self.param_name = param_name self.param_index = param_index self.child = child self.use_weighted_avg = bool(int(os.getenv("DFATOOL_RMT_WEIGHTED_AVG", "0"))) @@ -361,15 +362,12 @@ class SplitFunction(ModelFunction): def to_json(self, **kwargs): ret = super().to_json(**kwargs) - with_param_name = kwargs.get("with_param_name", False) - param_names = kwargs.get("param_names", list()) update = { "type": "split", "paramIndex": self.param_index, + "paramName": self.param_name, "child": dict([[k, v.to_json(**kwargs)] for k, v in self.child.items()]), } - if with_param_name and param_names: - update["paramName"] = param_names[self.param_index] ret.update(update) return ret @@ -419,7 +417,7 @@ class SplitFunction(ModelFunction): @classmethod def from_json(cls, data): assert data["type"] == "split" - self = cls(data["value"], data["paramIndex"], dict()) + self = cls(data["value"], data["paramIndex"], data["paramName"], dict()) for k, v in data["child"].items(): self.child[k] = ModelFunction.from_json(v) @@ -431,9 +429,12 @@ class SplitFunction(ModelFunction): class ScalarSplitFunction(ModelFunction): - def __init__(self, value, param_index, threshold, child_le, child_gt, **kwargs): + def __init__( + self, value, param_index, param_name, threshold, child_le, child_gt, **kwargs + ): super().__init__(value, **kwargs) self.param_index = param_index + self.param_name = param_name self.threshold = threshold self.child_le = child_le self.child_gt = child_gt @@ -455,20 +456,16 @@ class ScalarSplitFunction(ModelFunction): self.child_le.webconf_function_map() + self.child_gt.webconf_function_map() ) - def to_json(self, feature_names=None, **kwargs): + def to_json(self, **kwargs): ret = super().to_json(**kwargs) - with_param_name = kwargs.get("with_param_name", False) - param_names = kwargs.get("param_names", list()) update = { "type": "scalarSplit", "paramIndex": self.param_index, - "paramName": feature_names[self.param_index], + "paramName": self.param_name, "threshold": self.threshold, - "left": self.child_le.to_json(), - "right": self.child_gt.to_json(), + "left": self.child_le.to_json(**kwargs), + "right": self.child_gt.to_json(**kwargs), } - if with_param_name and param_names: - update["paramName"] = param_names[self.param_index] ret.update(update) return ret @@ -519,7 +516,12 @@ class ScalarSplitFunction(ModelFunction): left = ModelFunction.from_json(data["left"]) right = ModelFunction.from_json(data["right"]) self = cls( - data.get("value", 0), data["paramIndex"], data["threshold"], left, right + data.get("value", 0), + data["paramIndex"], + data["paramName"], + data["threshold"], + left, + right, ) return self @@ -590,7 +592,8 @@ class SKLearnRegressionFunction(ModelFunction): def __init__(self, value, regressor, categorial_to_index, ignore_index, **kwargs): # Needed for JSON export - self.param_names = kwargs.pop("param_names", None) + self.param_names = kwargs.pop("param_names") + self.arg_count = kwargs.pop("arg_count") super().__init__(value, **kwargs) @@ -598,6 +601,31 @@ class SKLearnRegressionFunction(ModelFunction): self.categorial_to_index = categorial_to_index self.ignore_index = ignore_index + # SKLearnRegressionFunction descendants use self.param_names \ self.ignore_index as features. + # Thus, model feature indexes ≠ self.param_names indexes. + # self.feature_names accounts for this and allows mapping feature indexes back to parameter names / parameter indexes. + self.feature_names = list( + map( + lambda i: self.param_names[i], + filter( + lambda i: not self.ignore_index[i], + range(len(self.param_names)), + ), + ) + ) + self.feature_names += list( + map( + lambda i: f"arg{i-len(self.param_names)}", + filter( + lambda i: not self.ignore_index[i], + range( + len(self.param_names), + len(self.param_names) + self.arg_count, + ), + ), + ) + ) + def is_predictable(self, param_list=None): """ Return whether the model function can be evaluated on the given parameter values. @@ -657,6 +685,28 @@ class SKLearnRegressionFunction(ModelFunction): predictions = self.regressor.predict(np.array(actual_params)) return predictions + def to_json(self, **kwargs): + ret = super().to_json(**kwargs) + + # Note: categorial_to_index uses param_names, not feature_names + param_names = self.param_names + list( + map( + lambda i: f"arg{i-len(self.param_names)}", + range( + len(self.param_names), + len(self.param_names) + self.arg_count, + ), + ) + ) + ret["paramValueToIndex"] = dict( + map( + lambda kv: (param_names[kv[0]], kv[1]), + self.categorial_to_index.items(), + ) + ) + + return ret + class CARTFunction(SKLearnRegressionFunction): def get_number_of_nodes(self): @@ -671,11 +721,10 @@ class CARTFunction(SKLearnRegressionFunction): def get_complexity_score(self): return self.get_number_of_nodes() - def to_json(self, feature_names=None, **kwargs): + def to_json(self, **kwargs): import sklearn.tree self.leaf_id = sklearn.tree._tree.TREE_LEAF - self.feature_names = feature_names ret = super().to_json(**kwargs) ret.update(self.recurse_(self.regressor.tree_, 0)) @@ -763,8 +812,7 @@ class LMTFunction(SKLearnRegressionFunction): def get_max_depth(self): return max(map(len, self.regressor._leaves.keys())) + 1 - def to_json(self, feature_names=None, **kwargs): - self.feature_names = feature_names + def to_json(self, **kwargs): ret = super().to_json(**kwargs) ret.update(self.recurse_(self.regressor.summary(), 0)) return ret @@ -804,7 +852,7 @@ class LMTFunction(SKLearnRegressionFunction): class XGBoostFunction(SKLearnRegressionFunction): - def to_json(self, feature_names=None, **kwargs): + def to_json(self, **kwargs): import json tempfile = f"/tmp/xgb{os.getpid()}.json" @@ -816,31 +864,23 @@ class XGBoostFunction(SKLearnRegressionFunction): data = json.load(f) os.remove(tempfile) - if feature_names: - return list( - map( - lambda tree: self.tree_to_webconf_json( - tree, feature_names, **kwargs - ), - data, - ) + return list( + map( + lambda tree: self.tree_to_webconf_json(tree, **kwargs), + data, ) - return data + ) - def tree_to_webconf_json(self, tree, feature_names, **kwargs): + def tree_to_webconf_json(self, tree, **kwargs): ret = dict() if "children" in tree: return { "type": "scalarSplit", - "paramName": feature_names[int(tree["split"][1:])], + "paramName": self.feature_names[int(tree["split"][1:])], "threshold": tree["split_condition"], "value": None, - "left": self.tree_to_webconf_json( - tree["children"][0], feature_names, **kwargs - ), - "right": self.tree_to_webconf_json( - tree["children"][1], feature_names, **kwargs - ), + "left": self.tree_to_webconf_json(tree["children"][0], **kwargs), + "right": self.tree_to_webconf_json(tree["children"][1], **kwargs), } else: return { diff --git a/lib/model.py b/lib/model.py index 7b840ec..1c0ee01 100644 --- a/lib/model.py +++ b/lib/model.py @@ -463,6 +463,7 @@ class AnalyticModel: self.by_name[name][attribute], self.by_name[name]["param"], self.parameters, + self._num_args.get(name, 0), param_type=ParamType(self.by_name[name]["param"]), ) diff --git a/lib/parameters.py b/lib/parameters.py index ae4fffb..2e3878f 100644 --- a/lib/parameters.py +++ b/lib/parameters.py @@ -609,65 +609,11 @@ class ModelAttribute: return f"ModelAttribute<{self.name}, {self.attr}, mean={mean}>" def to_json(self, **kwargs): - if type(self.model_function) in ( - df.CARTFunction, - df.LMTFunction, - df.XGBoostFunction, - ): - import sklearn.tree - - feature_names = list( - map( - lambda i: self.param_names[i], - filter( - lambda i: not self.model_function.ignore_index[i], - range(len(self.param_names)), - ), - ) - ) - feature_names += list( - map( - lambda i: f"arg{i-len(self.param_names)}", - filter( - lambda i: not self.model_function.ignore_index[i], - range( - len(self.param_names), - len(self.param_names) + self.arg_count, - ), - ), - ) - ) - kwargs["feature_names"] = feature_names - ret = { + return { "paramNames": self.param_names, "argCount": self.arg_count, "modelFunction": self.model_function.to_json(**kwargs), } - if type(self.model_function) in ( - df.CARTFunction, - df.FOLFunction, - df.XGBoostFunction, - ): - feature_names = self.param_names - feature_names += list( - map( - lambda i: f"arg{i-len(self.param_names)}", - filter( - lambda i: not self.model_function.ignore_index[i], - range( - len(self.param_names), - len(self.param_names) + self.arg_count, - ), - ), - ) - ) - ret["paramValueToIndex"] = dict( - map( - lambda kv: (feature_names[kv[0]], kv[1]), - self.model_function.categorial_to_index.items(), - ) - ) - return ret def to_dref(self, unit=None): ret = {"mean": (self.mean, unit), "median": (self.median, unit)} @@ -710,47 +656,27 @@ class ModelAttribute: self.model_function.to_dot(pydot, graph, self.param_names) return graph - feature_names = list( - map( - lambda i: self.param_names[i], - filter( - lambda i: not self.model_function.ignore_index[i], - range(len(self.param_names)), - ), - ) - ) - feature_names += list( - map( - lambda i: f"arg{i-len(self.param_names)}", - filter( - lambda i: not self.model_function.ignore_index[i], - range( - len(self.param_names), - len(self.param_names) + self.arg_count, - ), - ), - ) - ) - if type(self.model_function) == df.CARTFunction: import sklearn.tree return sklearn.tree.export_graphviz( self.model_function.regressor, out_file=None, - feature_names=feature_names, + feature_names=self.model_function.feature_names, ) if type(self.model_function) == df.XGBoostFunction: import xgboost - self.model_function.regressor.get_booster().feature_names = feature_names + self.model_function.regressor.get_booster().feature_names = ( + self.model_function.feature_names + ) return [ xgboost.to_graphviz(self.model_function.regressor, num_trees=i) for i in range(self.model_function.regressor.n_estimators) ] if type(self.model_function) == df.LMTFunction: return self.model_function.regressor.model_to_dot( - feature_names=feature_names + feature_names=self.model_function.feature_names ) return None @@ -921,7 +847,13 @@ class ModelAttribute: for param_index, _ in enumerate(self.param_names): if len(self.stats.distinct_values_by_param_index[param_index]) < 2: ignore_param_indexes.append(param_index) - x = df.FOLFunction(self.median, self.param_names, n_samples=self.data.shape[0]) + x = df.FOLFunction( + self.median, + self.param_names, + n_samples=self.data.shape[0], + param_names=self.param_names, + arg_count=self.arg_count, + ) x.fit(self.param_values, self.data, ignore_param_indexes=ignore_param_indexes) if x.fit_success: self.model_function = x @@ -1063,6 +995,7 @@ class ModelAttribute: ignore_index, n_samples=len(data), param_names=self.param_names, + arg_count=self.arg_count, ) logger.debug("Fitted sklearn CART") return @@ -1150,7 +1083,13 @@ class ModelAttribute: return xgb.fit(fit_parameters, np.reshape(data, (-1, 1))) self.model_function = df.XGBoostFunction( - np.mean(data), xgb, category_to_index, ignore_index, n_samples=len(data) + np.mean(data), + xgb, + category_to_index, + ignore_index, + n_samples=len(data), + param_names=self.param_names, + arg_count=self.arg_count, ) output_filename = os.getenv("DFATOOL_XGB_DUMP_MODEL", None) if output_filename: @@ -1247,6 +1186,7 @@ class ModelAttribute: ignore_index, n_samples=len(data), param_names=self.param_names, + arg_count=self.arg_count, ) return @@ -1510,4 +1450,10 @@ class ModelAttribute: assert len(child.values()) >= 2 - return df.SplitFunction(np.mean(data), symbol_index, child, n_samples=len(data)) + return df.SplitFunction( + np.mean(data), + symbol_index, + self.log_param_names[symbol_index], + child, + n_samples=len(data), + ) diff --git a/lib/utils.py b/lib/utils.py index 989c830..d6cdfc5 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -315,8 +315,8 @@ def param_to_ndarray( for i, paramset in distinct_values.items(): distinct_values[i] = sorted(paramset) category_to_scalar[i] = dict() - for j, param in enumerate(distinct_values[i]): - category_to_scalar[i][param] = j + for j, param_value in enumerate(distinct_values[i]): + category_to_scalar[i][param_value] = j ignore_index = dict() for i in range(len(param_tuples[0])): |