From ce11d3f77c7fb718124d54f6456b7a0d8f2ceaf0 Mon Sep 17 00:00:00 2001
From: Daniel Friesel <daniel.friesel@uos.de>
Date: Thu, 24 Sep 2020 09:43:09 +0200
Subject: kconfig: model generation and validation with limited sample size

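With --sample-size N, models are built from N randomly selected samples.
In this case, eval-kconfig additionally estimates the generalization error
by assessing the model on the remaining (non-sampled) configurations.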
---
 bin/analyze-kconfig.py | 24 +++++++++++++++++++++++-
 bin/eval-kconfig.py    | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/bin/analyze-kconfig.py b/bin/analyze-kconfig.py
index f7ae448..ff220b0 100755
--- a/bin/analyze-kconfig.py
+++ b/bin/analyze-kconfig.py
@@ -13,6 +13,8 @@ import kconfiglib
 import logging
 import os
 
+import numpy as np
+
 from dfatool.loader import KConfigAttributes
 from dfatool.model import KConfigModel
 
@@ -48,6 +50,15 @@ def main():
         type=lambda level: getattr(logging, level.upper()),
         help="Set log level",
     )
+    parser.add_argument(
+        "--info", action="store_true", help="Show Kconfig and benchmark information"
+    )
+    parser.add_argument(
+        "--sample-size",
+        type=int,
+        help="Restrict model generation to N random samples",
+        metavar="N",
+    )
     parser.add_argument("kconfig_path", type=str, help="Path to Kconfig file")
     parser.add_argument(
         "model",
@@ -64,7 +75,15 @@ def main():
 
     if os.path.isdir(args.model):
         data = KConfigAttributes(args.kconfig_path, args.model)
-        model = KConfigModel.from_benchmark(data, args.attribute)
+
+        if args.sample_size:
+            shuffled_data_indices = np.random.permutation(np.arange(len(data.data)))
+            sample_indices = shuffled_data_indices[: args.sample_size]
+            model = KConfigModel.from_benchmark(
+                data, args.attribute, indices=sample_indices
+            )
+        else:
+            model = KConfigModel.from_benchmark(data, args.attribute)
         if args.max_loss:
             model.max_loss = args.max_loss
         model.build_tree()
@@ -73,6 +92,9 @@ def main():
         with open(args.model, "r") as f:
             model = KConfigModel.from_json(json.load(f))
 
+    if args.info:
+        print("TODO")
+
     if args.export_tree:
         with open(args.export_tree, "w") as f:
             json.dump(model.to_json(), f)
diff --git a/bin/eval-kconfig.py b/bin/eval-kconfig.py
index 7f48b52..7bc0c41 100755
--- a/bin/eval-kconfig.py
+++ b/bin/eval-kconfig.py
@@ -37,6 +37,19 @@ def main():
     parser.add_argument(
         "--with-choice-node", action="store_true", help="Use non-binary Choice Nodes"
     )
+    parser.add_argument(
+        "--max-loss",
+        type=float,
+        help="Maximum acceptable model loss for DecisionTree Leaves",
+        default=10,
+    )
+    # If the population is exhaustive, the generalization error can now be computed
+    parser.add_argument(
+        "--sample-size",
+        type=int,
+        help="Perform model generation and validation with N random samples from the population",
+        metavar="N",
+    )
     parser.add_argument("kconfig_path", type=str, help="Path to Kconfig file")
     parser.add_argument(
         "experiment_root", type=str, help="Experiment results directory"
@@ -54,11 +67,26 @@ def main():
 
     k = 10
 
-    partition_pairs = validation._xv_partitions_kfold(len(data.data), k)
+    if args.sample_size:
+        shuffled_data_indices = np.random.permutation(np.arange(len(data.data)))
+        sample_indices = shuffled_data_indices[: args.sample_size]
+        nonsample_indices = shuffled_data_indices[args.sample_size :]
+        partition_pairs = validation._xv_partitions_kfold(args.sample_size, k)
+        partition_pairs = list(
+            map(
+                lambda tv: (shuffled_data_indices[tv[0]], shuffled_data_indices[tv[1]]),
+                partition_pairs,
+            )
+        )
+    else:
+        partition_pairs = validation._xv_partitions_kfold(len(data.data), k)
+
     measures = list()
     for training_set, validation_set in partition_pairs:
         model = KConfigModel.from_benchmark(data, args.attribute, indices=training_set)
         model.with_choice_node = args.with_choice_node
+        if args.max_loss:
+            model.max_loss = args.max_loss
         model.build_tree()
         measures.append(model.assess_benchmark(data, indices=validation_set))
 
@@ -71,11 +99,27 @@ def main():
 
     print("10-fold Cross Validation:")
     print(f"MAE: {aggregate['mae']:.0f} B")
-    print(f"SMAPE: {aggregate['smape']:.0f} %")
+    print(f"SMAPE: {aggregate['smape']:.1f} %")
     print(f"Unpredictable Configurations: {aggregate['unpredictable_count']}")
 
     print(aggregate)
 
+    if args.sample_size:
+        print("Estimated Generalization Error")
+        model = KConfigModel.from_benchmark(
+            data, args.attribute, indices=sample_indices
+        )
+        model.with_choice_node = args.with_choice_node
+        if args.max_loss:
+            model.max_loss = args.max_loss
+        model.build_tree()
+        generalization_measure = model.assess_benchmark(data, indices=nonsample_indices)
+        print(f"MAE: {generalization_measure['mae']:.0f} B")
+        print(f"SMAPE: {generalization_measure['smape']:.1f} %")
+        print(
+            f"Unpredictable Configurations: {generalization_measure['unpredictable_count']}"
+        )
+
     """
     if args.model:
         with open(args.model, "r") as f:
-- 
cgit v1.2.3