summaryrefslogtreecommitdiff
path: root/lib/validation.py
diff options
context:
space:
mode:
Diffstat (limited to 'lib/validation.py')
-rw-r--r--lib/validation.py238
1 files changed, 238 insertions, 0 deletions
diff --git a/lib/validation.py b/lib/validation.py
new file mode 100644
index 0000000..ee147fe
--- /dev/null
+++ b/lib/validation.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+
+import logging
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+def _xv_partitions_kfold(length, k=10):
+ """
+ Return k pairs of training and validation sets for k-fold cross-validation on `length` items.
+
+ In k-fold cross-validation, every k-th item is used for validation and the remainder is used for training.
+ As there are k ways to do this (items 0, k, 2k, ... vs. items 1, k+1, 2k+1, ... etc), this function returns k pairs of training and validation set.
+
+ Note that this function operates on indices, not data.
+ """
+ pairs = []
+ num_slices = k
+ indexes = np.arange(length)
+ for i in range(num_slices):
+ training = np.delete(indexes, slice(i, None, num_slices))
+ validation = indexes[i::num_slices]
+ pairs.append((training, validation))
+ return pairs
+
+
+def _xv_partition_montecarlo(length):
+ """
+ Return training and validation set for Monte Carlo cross-validation on `length` items.
+
+ This function operates on indices, not data. It randomly partitions range(length) into a list of training indices and a list of validation indices.
+
+ The training set contains 2/3 of all indices; the validation set consits of the remaining 1/3.
+
+ Example: 9 items -> training = [7, 3, 8, 0, 4, 2], validation = [ 1, 6, 5]
+ """
+ shuffled = np.random.permutation(np.arange(length))
+ border = int(length * float(2) / 3)
+ training = shuffled[:border]
+ validation = shuffled[border:]
+ return (training, validation)
+
+
+class CrossValidator:
+ """
+ Cross-Validation helper for model generation.
+
+ Given a set of measurements and a model class, it will partition the
+ data into training and validation sets, train the model on the training
+ set, and assess its quality on the validation set. This is repeated
+ several times depending on cross-validation algorithm and configuration.
+ Reports the mean model error over all cross-validation runs.
+ """
+
+ def __init__(self, model_class, by_name, parameters, arg_count):
+ """
+ Create a new CrossValidator object.
+
+ Does not perform cross-validation yet.
+
+ arguments:
+ model_class -- model class/type used for model synthesis,
+ e.g. PTAModel or AnalyticModel. model_class must have a
+ constructor accepting (by_name, parameters, arg_count)
+ and provide an `assess` method.
+ by_name -- measurements aggregated by state/transition/function/... name.
+ Layout: by_name[name][attribute] = list of data. Additionally,
+ by_name[name]['attributes'] must be set to the list of attributes,
+ e.g. ['power'] or ['duration', 'energy'].
+ """
+ self.model_class = model_class
+ self.by_name = by_name
+ self.names = sorted(by_name.keys())
+ self.parameters = sorted(parameters)
+ self.arg_count = arg_count
+
+ def kfold(self, model_getter, k=10):
+ """
+ Perform k-fold cross-validation and return average model quality.
+
+ The by_name data is divided into 1-1/k training and 1/k validation in a deterministic manner.
+ After creating a model for the training set, the
+ model type returned by model_getter is evaluated on the validation set.
+ This is repeated k times; the average of all measures is returned to the user.
+
+ arguments:
+ model_getter -- function with signature (model_object) -> model,
+ e.g. lambda m: m.get_fitted()[0] to evaluate the parameter-aware
+ model with automatic parameter detection.
+ k -- step size for k-fold cross-validation. The validation set contains 100/k % of data.
+
+ return value:
+ dict of model quality measures.
+ {
+ 'by_name' : {
+ for each name: {
+ for each attribute: {
+ 'mae' : mean of all mean absolute errors
+ 'mae_list' : list of the individual MAE values encountered during cross-validation
+ 'smape' : mean of all symmetric mean absolute percentage errors
+ 'smape_list' : list of the individual SMAPE values encountered during cross-validation
+ }
+ }
+ }
+ }
+ """
+
+ # training / validation subsets for each state and transition
+ subsets_by_name = dict()
+ training_and_validation_sets = list()
+
+ for name in self.names:
+ sample_count = len(self.by_name[name]["param"])
+ subsets_by_name[name] = list()
+ subsets_by_name[name] = _xv_partitions_kfold(sample_count, k)
+
+ for i in range(k):
+ training_and_validation_sets.append(dict())
+ for name in self.names:
+ training_and_validation_sets[i][name] = subsets_by_name[name][i]
+
+ return self._generic_xv(model_getter, training_and_validation_sets)
+
+ def montecarlo(self, model_getter, count=200):
+ """
+ Perform Monte Carlo cross-validation and return average model quality.
+
+ The by_name data is randomly divided into 2/3 training and 1/3
+ validation. After creating a model for the training set, the
+ model type returned by model_getter is evaluated on the validation set.
+ This is repeated count times (defaulting to 200); the average of all
+ measures is returned to the user.
+
+ arguments:
+ model_getter -- function with signature (model_object) -> model,
+ e.g. lambda m: m.get_fitted()[0] to evaluate the parameter-aware
+ model with automatic parameter detection.
+ count -- number of validation runs to perform, defaults to 200
+
+ return value:
+ dict of model quality measures.
+ {
+ 'by_name' : {
+ for each name: {
+ for each attribute: {
+ 'mae' : mean of all mean absolute errors
+ 'mae_list' : list of the individual MAE values encountered during cross-validation
+ 'smape' : mean of all symmetric mean absolute percentage errors
+ 'smape_list' : list of the individual SMAPE values encountered during cross-validation
+ }
+ }
+ }
+ }
+ """
+
+ # training / validation subsets for each state and transition
+ subsets_by_name = dict()
+ training_and_validation_sets = list()
+
+ for name in self.names:
+ sample_count = len(self.by_name[name]["param"])
+ subsets_by_name[name] = list()
+ for _ in range(count):
+ subsets_by_name[name].append(_xv_partition_montecarlo(sample_count))
+
+ for i in range(count):
+ training_and_validation_sets.append(dict())
+ for name in self.names:
+ training_and_validation_sets[i][name] = subsets_by_name[name][i]
+
+ return self._generic_xv(model_getter, training_and_validation_sets)
+
+ def _generic_xv(self, model_getter, training_and_validation_sets):
+ ret = {"by_name": dict()}
+
+ for name in self.names:
+ ret["by_name"][name] = dict()
+ for attribute in self.by_name[name]["attributes"]:
+ ret["by_name"][name][attribute] = {
+ "mae_list": list(),
+ "rmsd_list": list(),
+ "smape_list": list(),
+ }
+
+ for training_and_validation_by_name in training_and_validation_sets:
+ res = self._single_xv(model_getter, training_and_validation_by_name)
+ for name in self.names:
+ for attribute in self.by_name[name]["attributes"]:
+ for measure in ("mae", "rmsd", "smape"):
+ ret["by_name"][name][attribute][f"{measure}_list"].append(
+ res["by_name"][name][attribute][measure]
+ )
+
+ for name in self.names:
+ for attribute in self.by_name[name]["attributes"]:
+ for measure in ("mae", "rmsd", "smape"):
+ ret["by_name"][name][attribute][measure] = np.mean(
+ ret["by_name"][name][attribute][f"{measure}_list"]
+ )
+
+ return ret
+
+ def _single_xv(self, model_getter, tv_set_dict):
+ training = dict()
+ validation = dict()
+ for name in self.names:
+ training[name] = {"attributes": self.by_name[name]["attributes"]}
+ validation[name] = {"attributes": self.by_name[name]["attributes"]}
+
+ if "isa" in self.by_name[name]:
+ training[name]["isa"] = self.by_name[name]["isa"]
+ validation[name]["isa"] = self.by_name[name]["isa"]
+
+ training_subset, validation_subset = tv_set_dict[name]
+
+ for attribute in self.by_name[name]["attributes"]:
+ self.by_name[name][attribute] = np.array(self.by_name[name][attribute])
+ training[name][attribute] = self.by_name[name][attribute][
+ training_subset
+ ]
+ validation[name][attribute] = self.by_name[name][attribute][
+ validation_subset
+ ]
+
+ # We can't use slice syntax for 'param', which may contain strings and other odd values
+ training[name]["param"] = list()
+ validation[name]["param"] = list()
+ for idx in training_subset:
+ training[name]["param"].append(self.by_name[name]["param"][idx])
+ for idx in validation_subset:
+ validation[name]["param"].append(self.by_name[name]["param"][idx])
+
+ training_data = self.model_class(training, self.parameters, self.arg_count)
+ training_model = model_getter(training_data)
+ validation_data = self.model_class(validation, self.parameters, self.arg_count)
+
+ return validation_data.assess(training_model)