summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xbin/observations-enum-to-boolean.py26
-rw-r--r--lib/utils.py44
2 files changed, 42 insertions, 28 deletions
diff --git a/bin/observations-enum-to-boolean.py b/bin/observations-enum-to-boolean.py
index 20cff32..eae9bd1 100755
--- a/bin/observations-enum-to-boolean.py
+++ b/bin/observations-enum-to-boolean.py
@@ -13,31 +13,7 @@ def main():
with lzma.open(infile, "rt") as f:
observations = json.load(f)
- distinct_param_values = dict()
- replace_map = dict()
-
- for observation in observations:
- for k, v in observation["param"].items():
- if not k in distinct_param_values:
- distinct_param_values[k] = set()
- if v is not None:
- distinct_param_values[k].add(v)
-
- for param_name, distinct_values in distinct_param_values.items():
- if len(distinct_values) > 2 and not all(
- map(lambda x: x is None or dfatool.utils.is_numeric(x), distinct_values)
- ):
- replace_map[param_name] = distinct_values
-
- for observation in observations:
- binary_keys = set()
- for k, v in replace_map.items():
- enum_value = observation["param"].pop(k)
- for binary_key in v:
- observation["param"][binary_key] = int(enum_value == binary_key)
- if binary_key in binary_keys:
- print(f"Error: key '{binary_key}' is not unique")
- binary_keys.add(binary_key)
+ dfatool.utils.observations_enum_to_bool(observations)
with lzma.open(outfile, "wt") as f:
json.dump(observations, f)
diff --git a/lib/utils.py b/lib/utils.py
index 7372995..7d5b5b9 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -211,6 +211,44 @@ def param_dict_to_list(param_dict, parameter_names, default=None):
return ret
+def observations_enum_to_bool(observations: list, kconfig=False):
+ """
+ Convert enum / categorial observations to boolean-only ones.
+ 'observations' is altered in-place.
+ """
+ distinct_param_values = dict()
+ replace_map = dict()
+
+ for observation in observations:
+ for k, v in observation["param"].items():
+ if not k in distinct_param_values:
+ distinct_param_values[k] = set()
+ if v is not None:
+ distinct_param_values[k].add(v)
+
+ for param_name, distinct_values in distinct_param_values.items():
+ if len(distinct_values) > 2 and not all(
+ map(lambda x: x is None or is_numeric(x), distinct_values)
+ ):
+ replace_map[param_name] = distinct_values
+
+ for observation in observations:
+ binary_keys = set()
+ for k, v in replace_map.items():
+ enum_value = observation["param"].pop(k)
+ for binary_key in v:
+ if kconfig:
+ if enum_value == binary_key:
+ observation["param"][binary_key] = "y"
+ else:
+ observation["param"][binary_key] = "n"
+ else:
+ observation["param"][binary_key] = int(enum_value == binary_key)
+ if binary_key in binary_keys:
+ print(f"Error: key '{binary_key}' is not unique")
+ binary_keys.add(binary_key)
+
+
def observations_to_by_name(observations: list):
"""
Convert observation list to by_name dictionary for AnalyticModel analysis
@@ -462,9 +500,9 @@ def regression_measures(predicted: np.ndarray, actual: np.ndarray):
return {}
measures = {
"mae": np.mean(np.abs(deviations), dtype=np.float64),
- "msd": np.mean(deviations ** 2, dtype=np.float64),
- "rmsd": np.sqrt(np.mean(deviations ** 2), dtype=np.float64),
- "ssr": np.sum(deviations ** 2, dtype=np.float64),
+ "msd": np.mean(deviations**2, dtype=np.float64),
+ "rmsd": np.sqrt(np.mean(deviations**2), dtype=np.float64),
+ "ssr": np.sum(deviations**2, dtype=np.float64),
"rsq": r2_score(actual, predicted),
"count": len(actual),
}