diff options
author | Daniel Friesel <daniel.friesel@uos.de> | 2020-07-01 10:25:47 +0200 |
---|---|---|
committer | Daniel Friesel <daniel.friesel@uos.de> | 2020-07-01 10:25:47 +0200 |
commit | 2833115dff3da0e9b9a84fc5642b3a43034b27af (patch) | |
tree | d255c3f925d46383956f278bab50128663334c8e /lib/dfatool.py | |
parent | 08b1449e27da52e186f951914290b56b18bc64b2 (diff) |
Restore k-fold cross validation support
Diffstat (limited to 'lib/dfatool.py')
-rw-r--r-- | lib/dfatool.py | 99 |
1 files changed, 91 insertions, 8 deletions
diff --git a/lib/dfatool.py b/lib/dfatool.py index 63639d3..0da8cc9 100644 --- a/lib/dfatool.py +++ b/lib/dfatool.py @@ -194,10 +194,19 @@ class KeysightCSV: return timestamps, currents -def _xv_partitions_kfold(length, num_slices): +def _xv_partitions_kfold(length, k=10): + """ + Return k pairs of training and validation sets for k-fold cross-validation on `length` items. + + In k-fold cross-validation, every k-th item is used for validation and the remainder is used for training. + As there are k ways to do this (items 0, k, 2k, ... vs. items 1, k+1, 2k+1, ... etc), this function returns k pairs of training and validation set. + + Note that this function operates on indices, not data. + """ pairs = [] + num_slices = k indexes = np.arange(length) - for i in range(0, num_slices): + for i in range(num_slices): training = np.delete(indexes, slice(i, None, num_slices)) validation = indexes[i::num_slices] pairs.append((training, validation)) @@ -205,6 +214,15 @@ def _xv_partitions_kfold(length, num_slices): def _xv_partition_montecarlo(length): + """ + Return training and validation set for Monte Carlo cross-validation on `length` items. + + This function operates on indices, not data. It randomly partitions range(length) into a list of training indices and a list of validation indices. + + The training set contains 2/3 of all indices; the validation set consits of the remaining 1/3. + + Example: 9 items -> training = [7, 3, 8, 0, 4, 2], validation = [ 1, 6, 5] + """ shuffled = np.random.permutation(np.arange(length)) border = int(length * float(2) / 3) training = shuffled[:border] @@ -233,7 +251,7 @@ class CrossValidator: model_class -- model class/type used for model synthesis, e.g. PTAModel or AnalyticModel. model_class must have a constructor accepting (by_name, parameters, arg_count, verbose = False) - and provide an assess method. + and provide an `assess` method. by_name -- measurements aggregated by state/transition/function/... name. Layout: by_name[name][attribute] = list of data. Additionally, by_name[name]['attributes'] must be set to the list of attributes, @@ -245,6 +263,53 @@ class CrossValidator: self.parameters = sorted(parameters) self.arg_count = arg_count + def kfold(self, model_getter, k=10): + """ + Perform k-fold cross-validation and return average model quality. + + The by_name data is divided into 1-1/k training and 1/k validation in a deterministic manner. + After creating a model for the training set, the + model type returned by model_getter is evaluated on the validation set. + This is repeated k times; the average of all measures is returned to the user. + + arguments: + model_getter -- function with signature (model_object) -> model, + e.g. lambda m: m.get_fitted()[0] to evaluate the parameter-aware + model with automatic parameter detection. + k -- step size for k-fold cross-validation. The validation set contains 100/k % of data. + + return value: + dict of model quality measures. + { + 'by_name' : { + for each name: { + for each attribute: { + 'mae' : mean of all mean absolute errors + 'mae_list' : list of the individual MAE values encountered during cross-validation + 'smape' : mean of all symmetric mean absolute percentage errors + 'smape_list' : list of the individual SMAPE values encountered during cross-validation + } + } + } + } + """ + + # training / validation subsets for each state and transition + subsets_by_name = dict() + training_and_validation_sets = list() + + for name in self.names: + sample_count = len(self.by_name[name]["param"]) + subsets_by_name[name] = list() + subsets_by_name[name] = _xv_partitions_kfold(sample_count, k) + + for i in range(k): + training_and_validation_sets.append(dict()) + for name in self.names: + training_and_validation_sets[i][name] = subsets_by_name[name][i] + + return self._generic_xv(model_getter, training_and_validation_sets) + def montecarlo(self, model_getter, count=200): """ Perform Monte Carlo cross-validation and return average model quality. @@ -276,6 +341,25 @@ class CrossValidator: } } """ + + # training / validation subsets for each state and transition + subsets_by_name = dict() + training_and_validation_sets = list() + + for name in self.names: + sample_count = len(self.by_name[name]["param"]) + subsets_by_name[name] = list() + for _ in range(count): + subsets_by_name[name].append(_xv_partition_montecarlo(sample_count)) + + for i in range(count): + training_and_validation_sets.append(dict()) + for name in self.names: + training_and_validation_sets[i][name] = subsets_by_name[name][i] + + return self._generic_xv(model_getter, training_and_validation_sets) + + def _generic_xv(self, model_getter, training_and_validation_sets): ret = {"by_name": dict()} for name in self.names: @@ -286,8 +370,8 @@ class CrossValidator: "smape_list": list(), } - for _ in range(count): - res = self._single_montecarlo(model_getter) + for training_and_validation_by_name in training_and_validation_sets: + res = self._single_xv(model_getter, training_and_validation_by_name) for name in self.names: for attribute in self.by_name[name]["attributes"]: ret["by_name"][name][attribute]["mae_list"].append( @@ -308,7 +392,7 @@ class CrossValidator: return ret - def _single_montecarlo(self, model_getter): + def _single_xv(self, model_getter, tv_set_dict): training = dict() validation = dict() for name in self.names: @@ -319,8 +403,7 @@ class CrossValidator: training[name]["isa"] = self.by_name[name]["isa"] validation[name]["isa"] = self.by_name[name]["isa"] - data_count = len(self.by_name[name]["param"]) - training_subset, validation_subset = _xv_partition_montecarlo(data_count) + training_subset, validation_subset = tv_set_dict[name] for attribute in self.by_name[name]["attributes"]: self.by_name[name][attribute] = np.array(self.by_name[name][attribute]) |