diff options
author | Daniel Friesel <daniel.friesel@uos.de> | 2020-07-09 14:55:08 +0200 |
---|---|---|
committer | Daniel Friesel <daniel.friesel@uos.de> | 2020-07-09 14:55:08 +0200 |
commit | 016210b6c64881a683b45a91af213a1d1a1128ab (patch) | |
tree | 48f700bf5c1039ca79cd7f3e69c1f48fdd4595e3 | |
parent | 2f5fa87125ccf44f1c6e208ed736070274ea6e2e (diff) |
working candidate detection for decision trees
-rwxr-xr-x | bin/analyze-archive.py | 4 | ||||
-rw-r--r-- | lib/model.py | 112 | ||||
-rw-r--r-- | lib/parameters.py | 4 | ||||
-rw-r--r-- | lib/utils.py | 1 |
4 files changed, 118 insertions, 3 deletions
diff --git a/bin/analyze-archive.py b/bin/analyze-archive.py index c531bb6..4dd2f90 100755 --- a/bin/analyze-archive.py +++ b/bin/analyze-archive.py @@ -112,7 +112,7 @@ import sys from dfatool import plotter from dfatool.loader import RawData, pta_trace_to_aggregate from dfatool.functions import gplearn_to_function -from dfatool.model import PTAModel +from dfatool.model import PTAModel, DecisionTreeModel from dfatool.validation import CrossValidator from dfatool.utils import filter_aggregate_by_param from dfatool.automata import PTA @@ -454,6 +454,8 @@ if __name__ == "__main__": pta=pta, ) + dtreemodel = DecisionTreeModel(by_name, parameters,) + if xv_method: xv = CrossValidator(PTAModel, by_name, parameters, arg_count) diff --git a/lib/model.py b/lib/model.py index 082fe8a..68286c9 100644 --- a/lib/model.py +++ b/lib/model.py @@ -10,6 +10,7 @@ from .functions import AnalyticFunction from .parameters import ParamStats from .utils import is_numeric, soft_cast_int, param_slice_eq, remove_index_from_tuple from .utils import by_name_to_by_param, match_parameter_values +from .utils import filter_aggregate_by_param logger = logging.getLogger(__name__) arg_support_enabled = True @@ -374,7 +375,7 @@ def _num_args_from_by_name(by_name): class AnalyticModel: - u""" + """ Parameter-aware analytic energy/data size/... model. Supports both static and parameter-based model attributes, and automatic detection of parameter-dependence. 
@@ -661,8 +662,115 @@ class AnalyticModel: pass +def grep_aggregate_by_state_and_param(aggregate, name, param_index, param_value): + new_aggregate = dict() + new_aggregate[name] = { + "attributes": aggregate[name]["attributes"], + } + + param_index = soft_cast_int(param_index) + indices_to_copy = list( + map(lambda x: x[param_index] == param_value, aggregate[name]["param"]) + ) + + if len(indices_to_copy) == 0: + raise RuntimeError("empty result") + + new_aggregate[name]["param"] = list( + map( + lambda iv: iv[1], + filter( + lambda iv: indices_to_copy[iv[0]], enumerate(aggregate[name]["param"]), + ), + ) + ) + + for attribute in aggregate[name]["attributes"]: + new_aggregate[name][attribute] = aggregate[name][attribute][indices_to_copy] + + return new_aggregate + + +class DecisionTreeModel: + def __init__(self, by_name, parameters): + self.by_name = by_name + self.by_param = by_name_to_by_param(by_name) + self.parameters = sorted(parameters) + + # Dtree-Konzept: Für jeden Param: Split auf ==wert1 / == wert2 (-> zwei Partitionen der Daten) + # Falls Menge beeinflussender Parameter in den beiden Partitionen unterschiedlich oder + # Art der Abhängigkeit unterschiedlich: Hiernach muss der DTree unterscheiden. + # Auswahl der ersten Ebene anhand TODO (Parameter, bei dem die beiden Partitionen den niedrigsten absolute static model error haben? + # Oder bei dem es die größte Anzahl unterschiedlicher Parameter / Abhängigkeitstypen gibt? 
So eine Gütefunktion wäre dafür nice) + + stats = ParamStats(self.by_name, self.by_param, self.parameters, dict(), False) + pre_candidates = list() + for state in self.states(): + for i, param in enumerate(self.parameters): + if len(stats.distinct_values[state][param]) == 2: + logger.debug(f"{state} has binary param {param}") + pre_candidates.append( + (state, i, param, stats.distinct_values[state][param]) + ) + + candidates = list() + for state, param_index, param_name, param_values in pre_candidates: + by_name_sub1 = grep_aggregate_by_state_and_param( + by_name, state, param_index, param_values[0] + ) + by_param_sub1 = by_name_to_by_param(by_name_sub1) + by_name_sub2 = grep_aggregate_by_state_and_param( + by_name, state, param_index, param_values[1] + ) + by_param_sub2 = by_name_to_by_param(by_name_sub2) + stats_sub1 = ParamStats( + by_name_sub1, by_param_sub1, self.parameters, dict(), False + ) + stats_sub2 = ParamStats( + by_name_sub2, by_param_sub2, self.parameters, dict(), False + ) + for attribute in by_name[state]["attributes"]: + param1 = stats_sub1.stats[state][attribute]["depends_on_params"] + param2 = stats_sub2.stats[state][attribute]["depends_on_params"] + if param1 != param2: + logger.debug( + f"{state} {attribute} by {param_name} is dtree candidate" + ) + logger.debug( + " {} == {} -> depends on {}".format( + param_name, param_values[0], param1 + ) + ) + logger.debug( + " {} == {} -> depends on {}".format( + param_name, param_values[1], param2 + ) + ) + candidates.append( + (state, attribute, param_index, param_name, param_values) + ) + + candidates_by_state_attribute = dict() + for state, attribute, param_index, param_name, param_values in candidates: + if (state, attribute) not in candidates_by_state_attribute: + candidates_by_state_attribute[(state, attribute)] = list() + candidates_by_state_attribute[(state, attribute)].append( + (param_index, param_name, param_values) + ) + + print(candidates_by_state_attribute) + + def states(self): + 
"""Return sorted list of state names.""" + return sorted( + list( + filter(lambda k: self.by_name[k]["isa"] == "state", self.by_name.keys()) + ) + ) + + class PTAModel: - u""" + """ Parameter-aware PTA-based energy model. Supports both static and parameter-based model attributes, and automatic detection of parameter-dependence. diff --git a/lib/parameters.py b/lib/parameters.py index 5c6b978..3a393f6 100644 --- a/lib/parameters.py +++ b/lib/parameters.py @@ -272,6 +272,7 @@ def _compute_param_statistics( "corr_by_arg": [], "depends_on_param": {}, "depends_on_arg": [], + "depends_on_params": list(), } np.seterr("raise") @@ -297,6 +298,9 @@ def _compute_param_statistics( ret["std_param_lut"], ) + if ret["depends_on_param"][param]: + ret["depends_on_params"].append(param) + if state_or_trans in arg_count: for arg_index in range(arg_count[state_or_trans]): std_matrix, mean_std, lut_matrix = _std_by_param( diff --git a/lib/utils.py b/lib/utils.py index d28ecda..990ad01 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -204,6 +204,7 @@ def filter_aggregate_by_param(aggregate, parameters, parameter_filter): indices_to_keep = list( map(lambda x: x[param_index] == param_value, aggregate[name]["param"]) ) + # ["param"] is not a numpy array, so we can't use ["param"][indices_to_keep] and rely on this map-filter-expression instead aggregate[name]["param"] = list( map( lambda iv: iv[1], |