def mutual_information(self):
    """
    Return the estimated mutual information between parameters and data.

    The parameter tuples in ``self.param_values`` are converted to a numeric
    array via ``param_to_ndarray`` (without NaN entries, categorical values
    mapped to scalars) and passed to scikit-learn's
    ``mutual_info_regression`` together with ``self.data`` as the target.

    The result is stored in ``self.mutual_information_cache``; subsequent
    calls return the cached object without recomputing it.

    :returns: whatever ``mutual_info_regression`` returns for the converted
        parameters (per scikit-learn: one estimate per feature column).

    NOTE(review): ``param_to_ndarray`` also returns an ignore index. Presumably
    parameters flagged there have no column in the converted array, so the
    indices of the result need not line up with ``self.param_names`` --
    confirm against the caller before mapping entries back to names.
    """
    if self.mutual_information_cache is not None:
        return self.mutual_information_cache

    # Function-scope import: scikit-learn is only loaded when a result
    # actually has to be computed, not on cache hits or module import.
    from sklearn.feature_selection import mutual_info_regression

    # Only the converted parameter array is needed here; the category
    # mapping and the ignore index are intentionally discarded.
    fit_parameters, _, _ = param_to_ndarray(
        self.param_values, with_nan=False, categorical_to_scalar=True
    )

    self.mutual_information_cache = mutual_info_regression(
        fit_parameters, self.data
    )

    return self.mutual_information_cache