summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: Birte Kristina Friesel <birte.friesel@uos.de> 2024-02-12 10:51:42 +0100
committer: Birte Kristina Friesel <birte.friesel@uos.de> 2024-02-12 10:51:42 +0100
commit: 486690f31dfe8da33fbd0711137844424d0321eb (patch)
tree: 5d392998eec345c0583b2fc796a995d655b5cb44 /lib
parent: 152e8c6c99d1791d0dcd78c25d2a20a43f55247d (diff)
unfuck param_names / feature_names handling
Diffstat (limited to 'lib')
-rw-r--r-- lib/cli.py 50
-rw-r--r-- lib/functions.py 118
-rw-r--r-- lib/model.py 1
-rw-r--r-- lib/parameters.py 112
-rw-r--r-- lib/utils.py 4
5 files changed, 129 insertions, 156 deletions
diff --git a/lib/cli.py b/lib/cli.py
index 1b6cb06..3da6fce 100644
--- a/lib/cli.py
+++ b/lib/cli.py
@@ -124,17 +124,17 @@ def print_staticinfo(prefix, info):
print(f"{prefix}: {info.value}")
-def print_cartinfo(prefix, info, feature_names):
- _print_cartinfo(prefix, info.to_json(feature_names=feature_names), feature_names)
+def print_cartinfo(prefix, info):
+ _print_cartinfo(prefix, info.to_json())
-def print_xgbinfo(prefix, info, feature_names):
- for i, tree in enumerate(info.to_json(feature_names=feature_names)):
- _print_cartinfo(prefix + f"tree{i:03d} :", tree, feature_names)
+def print_xgbinfo(prefix, info):
+ for i, tree in enumerate(info.to_json()):
+ _print_cartinfo(prefix + f"tree{i:03d} :", tree)
-def print_lmtinfo(prefix, info, feature_names):
- _print_lmtinfo(prefix, info.to_json(feature_names=feature_names))
+def print_lmtinfo(prefix, info):
+ _print_lmtinfo(prefix, info.to_json())
def _print_lmtinfo(prefix, model):
@@ -157,41 +157,27 @@ def _print_lmtinfo(prefix, model):
print(f"{prefix}: {model_function}")
-def _print_cartinfo(prefix, model, feature_names):
+def _print_cartinfo(prefix, model):
if model["type"] == "static":
print(f"""{prefix}: {model["value"]}""")
else:
_print_cartinfo(
f"""{prefix} {model["paramName"]}≤{model["threshold"]} """,
model["left"],
- feature_names,
)
_print_cartinfo(
f"""{prefix} {model["paramName"]}>{model["threshold"]} """,
model["right"],
- feature_names,
)
-def print_splitinfo(param_names, info, prefix=""):
+def print_splitinfo(info, prefix=""):
if type(info) is df.SplitFunction:
for k, v in info.child.items():
- if info.param_index < len(param_names):
- param_name = param_names[info.param_index]
- else:
- param_name = f"arg{info.param_index - len(param_names)}"
- print_splitinfo(param_names, v, f"{prefix} {param_name}={k}")
+ print_splitinfo(v, f"{prefix} {info.param_name}={k}")
elif type(info) is df.ScalarSplitFunction:
- if info.param_index < len(param_names):
- param_name = param_names[info.param_index]
- else:
- param_name = f"arg{info.param_index - len(param_names)}"
- print_splitinfo(
- param_names, info.child_le, f"{prefix} {param_name}≤{info.threshold}"
- )
- print_splitinfo(
- param_names, info.child_gt, f"{prefix} {param_name}>{info.threshold}"
- )
+ print_splitinfo(info.child_le, f"{prefix} {info.param_name}≤{info.threshold}")
+ print_splitinfo(info.child_gt, f"{prefix} {info.param_name}>{info.threshold}")
elif type(info) is df.AnalyticFunction:
print_analyticinfo(prefix, info)
elif type(info) is df.StaticFunction:
@@ -200,7 +186,7 @@ def print_splitinfo(param_names, info, prefix=""):
print(f"{prefix}: UNKNOWN {type(info)}")
-def print_model(prefix, info, feature_names):
+def print_model(prefix, info):
if type(info) is df.StaticFunction:
print_staticinfo(prefix, info)
elif type(info) is df.AnalyticFunction:
@@ -208,15 +194,15 @@ def print_model(prefix, info, feature_names):
elif type(info) is df.FOLFunction:
print_analyticinfo(prefix, info)
elif type(info) is df.CARTFunction:
- print_cartinfo(prefix, info, feature_names)
+ print_cartinfo(prefix, info)
elif type(info) is df.SplitFunction:
- print_splitinfo(feature_names, info, prefix)
+ print_splitinfo(info, prefix)
elif type(info) is df.ScalarSplitFunction:
- print_splitinfo(feature_names, info, prefix)
+ print_splitinfo(info, prefix)
elif type(info) is df.LMTFunction:
- print_lmtinfo(prefix, info, feature_names)
+ print_lmtinfo(prefix, info)
elif type(info) is df.XGBoostFunction:
- print_xgbinfo(prefix, info, feature_names)
+ print_xgbinfo(prefix, info)
else:
print(f"{prefix}: {type(info)} UNIMPLEMENTED")
diff --git a/lib/functions.py b/lib/functions.py
index 6b4406c..81a94b4 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -317,8 +317,9 @@ class StaticFunction(ModelFunction):
class SplitFunction(ModelFunction):
- def __init__(self, value, param_index, child, **kwargs):
+ def __init__(self, value, param_index, param_name, child, **kwargs):
super().__init__(value, **kwargs)
+ self.param_name = param_name
self.param_index = param_index
self.child = child
self.use_weighted_avg = bool(int(os.getenv("DFATOOL_RMT_WEIGHTED_AVG", "0")))
@@ -361,15 +362,12 @@ class SplitFunction(ModelFunction):
def to_json(self, **kwargs):
ret = super().to_json(**kwargs)
- with_param_name = kwargs.get("with_param_name", False)
- param_names = kwargs.get("param_names", list())
update = {
"type": "split",
"paramIndex": self.param_index,
+ "paramName": self.param_name,
"child": dict([[k, v.to_json(**kwargs)] for k, v in self.child.items()]),
}
- if with_param_name and param_names:
- update["paramName"] = param_names[self.param_index]
ret.update(update)
return ret
@@ -419,7 +417,7 @@ class SplitFunction(ModelFunction):
@classmethod
def from_json(cls, data):
assert data["type"] == "split"
- self = cls(data["value"], data["paramIndex"], dict())
+ self = cls(data["value"], data["paramIndex"], data["paramName"], dict())
for k, v in data["child"].items():
self.child[k] = ModelFunction.from_json(v)
@@ -431,9 +429,12 @@ class SplitFunction(ModelFunction):
class ScalarSplitFunction(ModelFunction):
- def __init__(self, value, param_index, threshold, child_le, child_gt, **kwargs):
+ def __init__(
+ self, value, param_index, param_name, threshold, child_le, child_gt, **kwargs
+ ):
super().__init__(value, **kwargs)
self.param_index = param_index
+ self.param_name = param_name
self.threshold = threshold
self.child_le = child_le
self.child_gt = child_gt
@@ -455,20 +456,16 @@ class ScalarSplitFunction(ModelFunction):
self.child_le.webconf_function_map() + self.child_gt.webconf_function_map()
)
- def to_json(self, feature_names=None, **kwargs):
+ def to_json(self, **kwargs):
ret = super().to_json(**kwargs)
- with_param_name = kwargs.get("with_param_name", False)
- param_names = kwargs.get("param_names", list())
update = {
"type": "scalarSplit",
"paramIndex": self.param_index,
- "paramName": feature_names[self.param_index],
+ "paramName": self.param_name,
"threshold": self.threshold,
- "left": self.child_le.to_json(),
- "right": self.child_gt.to_json(),
+ "left": self.child_le.to_json(**kwargs),
+ "right": self.child_gt.to_json(**kwargs),
}
- if with_param_name and param_names:
- update["paramName"] = param_names[self.param_index]
ret.update(update)
return ret
@@ -519,7 +516,12 @@ class ScalarSplitFunction(ModelFunction):
left = ModelFunction.from_json(data["left"])
right = ModelFunction.from_json(data["right"])
self = cls(
- data.get("value", 0), data["paramIndex"], data["threshold"], left, right
+ data.get("value", 0),
+ data["paramIndex"],
+ data["paramName"],
+ data["threshold"],
+ left,
+ right,
)
return self
@@ -590,7 +592,8 @@ class SKLearnRegressionFunction(ModelFunction):
def __init__(self, value, regressor, categorial_to_index, ignore_index, **kwargs):
# Needed for JSON export
- self.param_names = kwargs.pop("param_names", None)
+ self.param_names = kwargs.pop("param_names")
+ self.arg_count = kwargs.pop("arg_count")
super().__init__(value, **kwargs)
@@ -598,6 +601,31 @@ class SKLearnRegressionFunction(ModelFunction):
self.categorial_to_index = categorial_to_index
self.ignore_index = ignore_index
+ # SKLearnRegressionFunction descendants use self.param_names \ self.ignore_index as features.
+ # Thus, model feature indexes ≠ self.param_names indexes.
+ # self.feature_names accounts for this and allows mapping feature indexes back to parameter names / parameter indexes.
+ self.feature_names = list(
+ map(
+ lambda i: self.param_names[i],
+ filter(
+ lambda i: not self.ignore_index[i],
+ range(len(self.param_names)),
+ ),
+ )
+ )
+ self.feature_names += list(
+ map(
+ lambda i: f"arg{i-len(self.param_names)}",
+ filter(
+ lambda i: not self.ignore_index[i],
+ range(
+ len(self.param_names),
+ len(self.param_names) + self.arg_count,
+ ),
+ ),
+ )
+ )
+
def is_predictable(self, param_list=None):
"""
Return whether the model function can be evaluated on the given parameter values.
@@ -657,6 +685,28 @@ class SKLearnRegressionFunction(ModelFunction):
predictions = self.regressor.predict(np.array(actual_params))
return predictions
+ def to_json(self, **kwargs):
+ ret = super().to_json(**kwargs)
+
+ # Note: categorial_to_index uses param_names, not feature_names
+ param_names = self.param_names + list(
+ map(
+ lambda i: f"arg{i-len(self.param_names)}",
+ range(
+ len(self.param_names),
+ len(self.param_names) + self.arg_count,
+ ),
+ )
+ )
+ ret["paramValueToIndex"] = dict(
+ map(
+ lambda kv: (param_names[kv[0]], kv[1]),
+ self.categorial_to_index.items(),
+ )
+ )
+
+ return ret
+
class CARTFunction(SKLearnRegressionFunction):
def get_number_of_nodes(self):
@@ -671,11 +721,10 @@ class CARTFunction(SKLearnRegressionFunction):
def get_complexity_score(self):
return self.get_number_of_nodes()
- def to_json(self, feature_names=None, **kwargs):
+ def to_json(self, **kwargs):
import sklearn.tree
self.leaf_id = sklearn.tree._tree.TREE_LEAF
- self.feature_names = feature_names
ret = super().to_json(**kwargs)
ret.update(self.recurse_(self.regressor.tree_, 0))
@@ -763,8 +812,7 @@ class LMTFunction(SKLearnRegressionFunction):
def get_max_depth(self):
return max(map(len, self.regressor._leaves.keys())) + 1
- def to_json(self, feature_names=None, **kwargs):
- self.feature_names = feature_names
+ def to_json(self, **kwargs):
ret = super().to_json(**kwargs)
ret.update(self.recurse_(self.regressor.summary(), 0))
return ret
@@ -804,7 +852,7 @@ class LMTFunction(SKLearnRegressionFunction):
class XGBoostFunction(SKLearnRegressionFunction):
- def to_json(self, feature_names=None, **kwargs):
+ def to_json(self, **kwargs):
import json
tempfile = f"/tmp/xgb{os.getpid()}.json"
@@ -816,31 +864,23 @@ class XGBoostFunction(SKLearnRegressionFunction):
data = json.load(f)
os.remove(tempfile)
- if feature_names:
- return list(
- map(
- lambda tree: self.tree_to_webconf_json(
- tree, feature_names, **kwargs
- ),
- data,
- )
+ return list(
+ map(
+ lambda tree: self.tree_to_webconf_json(tree, **kwargs),
+ data,
)
- return data
+ )
- def tree_to_webconf_json(self, tree, feature_names, **kwargs):
+ def tree_to_webconf_json(self, tree, **kwargs):
ret = dict()
if "children" in tree:
return {
"type": "scalarSplit",
- "paramName": feature_names[int(tree["split"][1:])],
+ "paramName": self.feature_names[int(tree["split"][1:])],
"threshold": tree["split_condition"],
"value": None,
- "left": self.tree_to_webconf_json(
- tree["children"][0], feature_names, **kwargs
- ),
- "right": self.tree_to_webconf_json(
- tree["children"][1], feature_names, **kwargs
- ),
+ "left": self.tree_to_webconf_json(tree["children"][0], **kwargs),
+ "right": self.tree_to_webconf_json(tree["children"][1], **kwargs),
}
else:
return {
diff --git a/lib/model.py b/lib/model.py
index 7b840ec..1c0ee01 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -463,6 +463,7 @@ class AnalyticModel:
self.by_name[name][attribute],
self.by_name[name]["param"],
self.parameters,
+ self._num_args.get(name, 0),
param_type=ParamType(self.by_name[name]["param"]),
)
diff --git a/lib/parameters.py b/lib/parameters.py
index ae4fffb..2e3878f 100644
--- a/lib/parameters.py
+++ b/lib/parameters.py
@@ -609,65 +609,11 @@ class ModelAttribute:
return f"ModelAttribute<{self.name}, {self.attr}, mean={mean}>"
def to_json(self, **kwargs):
- if type(self.model_function) in (
- df.CARTFunction,
- df.LMTFunction,
- df.XGBoostFunction,
- ):
- import sklearn.tree
-
- feature_names = list(
- map(
- lambda i: self.param_names[i],
- filter(
- lambda i: not self.model_function.ignore_index[i],
- range(len(self.param_names)),
- ),
- )
- )
- feature_names += list(
- map(
- lambda i: f"arg{i-len(self.param_names)}",
- filter(
- lambda i: not self.model_function.ignore_index[i],
- range(
- len(self.param_names),
- len(self.param_names) + self.arg_count,
- ),
- ),
- )
- )
- kwargs["feature_names"] = feature_names
- ret = {
+ return {
"paramNames": self.param_names,
"argCount": self.arg_count,
"modelFunction": self.model_function.to_json(**kwargs),
}
- if type(self.model_function) in (
- df.CARTFunction,
- df.FOLFunction,
- df.XGBoostFunction,
- ):
- feature_names = self.param_names
- feature_names += list(
- map(
- lambda i: f"arg{i-len(self.param_names)}",
- filter(
- lambda i: not self.model_function.ignore_index[i],
- range(
- len(self.param_names),
- len(self.param_names) + self.arg_count,
- ),
- ),
- )
- )
- ret["paramValueToIndex"] = dict(
- map(
- lambda kv: (feature_names[kv[0]], kv[1]),
- self.model_function.categorial_to_index.items(),
- )
- )
- return ret
def to_dref(self, unit=None):
ret = {"mean": (self.mean, unit), "median": (self.median, unit)}
@@ -710,47 +656,27 @@ class ModelAttribute:
self.model_function.to_dot(pydot, graph, self.param_names)
return graph
- feature_names = list(
- map(
- lambda i: self.param_names[i],
- filter(
- lambda i: not self.model_function.ignore_index[i],
- range(len(self.param_names)),
- ),
- )
- )
- feature_names += list(
- map(
- lambda i: f"arg{i-len(self.param_names)}",
- filter(
- lambda i: not self.model_function.ignore_index[i],
- range(
- len(self.param_names),
- len(self.param_names) + self.arg_count,
- ),
- ),
- )
- )
-
if type(self.model_function) == df.CARTFunction:
import sklearn.tree
return sklearn.tree.export_graphviz(
self.model_function.regressor,
out_file=None,
- feature_names=feature_names,
+ feature_names=self.model_function.feature_names,
)
if type(self.model_function) == df.XGBoostFunction:
import xgboost
- self.model_function.regressor.get_booster().feature_names = feature_names
+ self.model_function.regressor.get_booster().feature_names = (
+ self.model_function.feature_names
+ )
return [
xgboost.to_graphviz(self.model_function.regressor, num_trees=i)
for i in range(self.model_function.regressor.n_estimators)
]
if type(self.model_function) == df.LMTFunction:
return self.model_function.regressor.model_to_dot(
- feature_names=feature_names
+ feature_names=self.model_function.feature_names
)
return None
@@ -921,7 +847,13 @@ class ModelAttribute:
for param_index, _ in enumerate(self.param_names):
if len(self.stats.distinct_values_by_param_index[param_index]) < 2:
ignore_param_indexes.append(param_index)
- x = df.FOLFunction(self.median, self.param_names, n_samples=self.data.shape[0])
+ x = df.FOLFunction(
+ self.median,
+ self.param_names,
+ n_samples=self.data.shape[0],
+ param_names=self.param_names,
+ arg_count=self.arg_count,
+ )
x.fit(self.param_values, self.data, ignore_param_indexes=ignore_param_indexes)
if x.fit_success:
self.model_function = x
@@ -1063,6 +995,7 @@ class ModelAttribute:
ignore_index,
n_samples=len(data),
param_names=self.param_names,
+ arg_count=self.arg_count,
)
logger.debug("Fitted sklearn CART")
return
@@ -1150,7 +1083,13 @@ class ModelAttribute:
return
xgb.fit(fit_parameters, np.reshape(data, (-1, 1)))
self.model_function = df.XGBoostFunction(
- np.mean(data), xgb, category_to_index, ignore_index, n_samples=len(data)
+ np.mean(data),
+ xgb,
+ category_to_index,
+ ignore_index,
+ n_samples=len(data),
+ param_names=self.param_names,
+ arg_count=self.arg_count,
)
output_filename = os.getenv("DFATOOL_XGB_DUMP_MODEL", None)
if output_filename:
@@ -1247,6 +1186,7 @@ class ModelAttribute:
ignore_index,
n_samples=len(data),
param_names=self.param_names,
+ arg_count=self.arg_count,
)
return
@@ -1510,4 +1450,10 @@ class ModelAttribute:
assert len(child.values()) >= 2
- return df.SplitFunction(np.mean(data), symbol_index, child, n_samples=len(data))
+ return df.SplitFunction(
+ np.mean(data),
+ symbol_index,
+ self.log_param_names[symbol_index],
+ child,
+ n_samples=len(data),
+ )
diff --git a/lib/utils.py b/lib/utils.py
index 989c830..d6cdfc5 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -315,8 +315,8 @@ def param_to_ndarray(
for i, paramset in distinct_values.items():
distinct_values[i] = sorted(paramset)
category_to_scalar[i] = dict()
- for j, param in enumerate(distinct_values[i]):
- category_to_scalar[i][param] = j
+ for j, param_value in enumerate(distinct_values[i]):
+ category_to_scalar[i][param_value] = j
ignore_index = dict()
for i in range(len(param_tuples[0])):