kconfig: model generation and validation with limited sample size

eval-kconfig estimates the generalization error in this case
author: Daniel Friesel <daniel.friesel@uos.de> 2020-09-24 09:43:09 +0200
committer: Daniel Friesel <daniel.friesel@uos.de> 2020-09-24 09:43:09 +0200
commit: ce11d3f77c7fb718124d54f6456b7a0d8f2ceaf0 (patch)
tree: 52668a7b4025c43a40fc3a3d3e65831a8f890f0e /bin/eval-kconfig.py
parent: 1ed7a66d836977ae9689f59b6dc5fca0f4637587 (diff)
1 files changed, 46 insertions, 2 deletions
diff --git a/bin/eval-kconfig.py b/bin/eval-kconfig.py
index 7f48b52..7bc0c41 100755
--- a/bin/eval-kconfig.py
+++ b/bin/eval-kconfig.py
@@ -37,6 +37,19 @@ def main():
     parser.add_argument(
         "--with-choice-node", action="store_true", help="Use non-binary Choice Nodes"
     )
+    parser.add_argument(
+        "--max-loss",
+        type=float,
+        help="Maximum acceptable model loss for DecisionTree Leaves",
+        default=10,
+    )
+    # If the population is exhaustive, the generalization error can now be computed.
+    parser.add_argument(
+        "--sample-size",
+        type=int,
+        help="Perform model generation and validation with N random samples from the population",
+        metavar="N",
+    )
     parser.add_argument("kconfig_path", type=str, help="Path to Kconfig file")
     parser.add_argument(
         "experiment_root", type=str, help="Experiment results directory"
@@ -54,11 +67,26 @@ def main():
 
     k = 10
 
-    partition_pairs = validation._xv_partitions_kfold(len(data.data), k)
+    if args.sample_size:
+        shuffled_data_indices = np.random.permutation(np.arange(len(data.data)))
+        sample_indices = shuffled_data_indices[: args.sample_size]
+        nonsample_indices = shuffled_data_indices[args.sample_size :]
+        partition_pairs = validation._xv_partitions_kfold(args.sample_size, k)
+        partition_pairs = list(
+            map(
+                lambda tv: (shuffled_data_indices[tv[0]], shuffled_data_indices[tv[1]]),
+                partition_pairs,
+            )
+        )
+    else:
+        partition_pairs = validation._xv_partitions_kfold(len(data.data), k)
+
     measures = list()
     for training_set, validation_set in partition_pairs:
         model = KConfigModel.from_benchmark(data, args.attribute, indices=training_set)
         model.with_choice_node = args.with_choice_node
+        if args.max_loss:
+            model.max_loss = args.max_loss
         model.build_tree()
         measures.append(model.assess_benchmark(data, indices=validation_set))
 
@@ -71,11 +99,27 @@ def main():
 
     print("10-fold Cross Validation:")
     print(f"MAE: {aggregate['mae']:.0f} B")
-    print(f"SMAPE: {aggregate['smape']:.0f} %")
+    print(f"SMAPE: {aggregate['smape']:.1f} %")
     print(f"Unpredictable Configurations: {aggregate['unpredictable_count']}")
 
     print(aggregate)
 
+    if args.sample_size:
+        print("Estimated Generalization Error")
+        model = KConfigModel.from_benchmark(
+            data, args.attribute, indices=sample_indices
+        )
+        model.with_choice_node = args.with_choice_node
+        if args.max_loss:
+            model.max_loss = args.max_loss
+        model.build_tree()
+        generalization_measure = model.assess_benchmark(data, indices=nonsample_indices)
+        print(f"MAE: {generalization_measure['mae']:.0f} B")
+        print(f"SMAPE: {generalization_measure['smape']:.1f} %")
+        print(
+            f"Unpredictable Configurations: {generalization_measure['unpredictable_count']}"
+        )
+
     """
     if args.model:
         with open(args.model, "r") as f:
author	Daniel Friesel <daniel.friesel@uos.de>	2020-09-24 09:43:09 +0200
committer	Daniel Friesel <daniel.friesel@uos.de>	2020-09-24 09:43:09 +0200
commit	ce11d3f77c7fb718124d54f6456b7a0d8f2ceaf0 (patch)
tree	52668a7b4025c43a40fc3a3d3e65831a8f890f0e /bin/eval-kconfig.py
parent	1ed7a66d836977ae9689f59b6dc5fca0f4637587 (diff)