From ee389fc21e87a373d2d7d3ed1c4047165344bad8 Mon Sep 17 00:00:00 2001
From: Daniel Friesel <daniel.friesel@uos.de>
Date: Wed, 16 Sep 2020 12:55:48 +0200
Subject: switch to sum of squared residuals as loss function

Using the sum of squared residuals as the loss function is in line with
DECART (Guo et al., 2017).
---
 lib/model.py | 80 +++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 47 insertions(+), 33 deletions(-)

(limited to 'lib/model.py')

diff --git a/lib/model.py b/lib/model.py
index e0ce056..f330327 100644
--- a/lib/model.py
+++ b/lib/model.py
@@ -1170,27 +1170,27 @@ class KConfigModel:
 
         :param value: A model value, e.g. expected ROM or RAM usage in Byte
         :type value: float
-        :param stddev: Standard deviation of benchmark data used to generate this leaf node
-        :type stddev: float
+        :param loss: Loss (sum of squared residuals) for the benchmark data used to generate this leaf node vs. `value`. Lower is better.
+        :type loss: float
         """
 
-        def __init__(self, value, stddev):
+        def __init__(self, value, loss):
             self.value = value
-            self.stddev = stddev
+            self.loss = loss
 
         @classmethod
         def from_json(cls, json_node):
-            node = cls(json_node["value"], json_node["stddev"])
+            node = cls(json_node["value"], json_node["loss"])
             return node
 
         def model(self, kconf):
             return self.value
 
         def __repr__(self):
-            return f"<Leaf({self.value}, {self.stddev})>"
+            return f"<Leaf({self.value}, {self.loss})>"
 
         def to_json(self):
-            return {"value": self.value, "stddev": self.stddev}
+            return {"value": self.value, "loss": self.loss}
 
     class BoolNode(Node):
         """
@@ -1280,7 +1280,10 @@ class KConfigModel:
             kconf_choice = next(
                 filter(lambda choice: choice.name == self.symbol, kconf.choices)
             )
-            return self.choice[kconf_choice.selection.name].model(kconf)
+            selection = kconf_choice.selection.name
+            if selection in self.choice:
+                return self.choice[selection].model(kconf)
+            return None
 
         def __repr__(self):
             choice_names = sorted(self.choice.keys())
@@ -1303,7 +1306,7 @@ class KConfigModel:
         self.choices = kconfig_benchmark.choice_names
         self.symbol = kconfig_benchmark.symbol
         self.choice = kconfig_benchmark.choice
-        self.max_stddev = 10
+        self.max_loss = 10
         if callable(attribute):
             self.attribute = "custom"
             self.attr_function = lambda x: attribute(x[1])
@@ -1334,6 +1337,13 @@ class KConfigModel:
             return cls.BoolNode.from_json(cls, json_node)
         return cls.Leaf.from_json(json_node)
 
+    def loss(self, values, model_value=None):
+        if type(values) is list:
+            values = np.array(values)
+        if model_value is None:
+            model_value = np.mean(values)
+        return np.sum((model_value - values) ** 2, dtype=np.float64)
+
     def build_tree(self):
         # without ChoiceNode:
         # self.model = self._build_tree(self.symbols, list(), self.data, 0)
@@ -1366,45 +1376,49 @@ class KConfigModel:
 
         rom_sizes = list(map(self.attr_function, this_data))
 
-        if np.std(rom_sizes) < self.max_stddev or len(this_symbols) == 0:
-            return self.Leaf(np.mean(rom_sizes), np.std(rom_sizes))
+        if self.loss(rom_sizes) < self.max_loss or len(this_symbols) == 0:
+            return self.Leaf(np.mean(rom_sizes), self.loss(rom_sizes))
 
-        sym_stds = list()
+        sym_losses = list()
         for symbol_name in this_symbols:
             enabled = list(filter(lambda vrr: vrr[0][symbol_name] == True, this_data))
             disabled = list(filter(lambda vrr: vrr[0][symbol_name] == False, this_data))
 
-            enabled_std_rom = np.std(list(map(self.attr_function, enabled)))
-            disabled_std_rom = np.std(list(map(self.attr_function, disabled)))
-            children = [enabled_std_rom, disabled_std_rom]
+            if len(enabled) == 0 or len(disabled) == 0:
+                sym_losses.append(np.inf)
+                continue
 
-            if np.any(np.isnan(children)):
-                sym_stds.append(np.inf)
-            else:
-                sym_stds.append(np.mean(children))
+            enabled_attr = list(map(self.attr_function, enabled))
+            disabled_attr = list(map(self.attr_function, disabled))
 
-        choice_stds = list()
+            children = [self.loss(enabled_attr), self.loss(disabled_attr)]
+
+            sym_losses.append(np.sum(children))
+
+        choice_losses = list()
         for choice in this_choices:
-            choice_foo = list()
-            choice_std = list()
+            choice_loss = list()
             num_configs = 0
             for symbol in self.choice[choice].syms:
                 sym_enabled = list(
                     filter(lambda vrr: vrr[0][symbol.name] == True, this_data)
                 )
-                num_configs += len(sym_enabled)
-                choice_foo.append(sym_enabled)
-                choice_std.append(np.std(list(map(self.attr_function, sym_enabled))))
+                enabled_attr = list(map(self.attr_function, sym_enabled))
+                if len(enabled_attr) == 0:
+                    continue
+
+                num_configs += len(enabled_attr)
+                choice_loss.append(self.loss(enabled_attr))
 
             # only split on a choice if it is present in _all_ configurations
-            if np.any(np.isnan(choice_std)) or num_configs != len(this_data):
-                choice_stds.append(np.inf)
+            if num_configs != len(this_data):
+                choice_losses.append(np.inf)
             else:
-                choice_stds.append(np.mean(choice_std))
+                choice_losses.append(np.sum(choice_loss))
 
-        min_index = np.argmin(sym_stds + choice_stds)
+        min_index = np.argmin(sym_losses + choice_losses)
 
-        if min_index < len(sym_stds):
+        if min_index < len(sym_losses):
             symbol_index = min_index
             symbol = this_symbols[symbol_index]
 
@@ -1416,7 +1430,7 @@ class KConfigModel:
             disabled = list(filter(lambda vrr: vrr[0][symbol] == False, this_data))
 
             logger.debug(
-                f"Level {level} split on {symbol} (mean std={sym_stds[symbol_index]}) has {len(enabled)} children when enabled and {len(disabled)} children when disabled"
+                f"Level {level} split on {symbol} (loss={sym_losses[symbol_index]}) has {len(enabled)} children when enabled and {len(disabled)} children when disabled"
             )
             if len(enabled):
                 node.set_child_y(
@@ -1427,7 +1441,7 @@ class KConfigModel:
                     self._build_tree(new_symbols, this_choices, disabled, level + 1)
                 )
         else:
-            choice_index = min_index - len(sym_stds)
+            choice_index = min_index - len(sym_losses)
             choice = this_choices[choice_index]
             node = self.ChoiceNode(choice)
 
@@ -1436,7 +1450,7 @@ class KConfigModel:
             for sym in self.choice[choice].syms:
                 enabled = list(filter(lambda vrr: vrr[0][sym.name] == True, this_data))
                 logger.debug(
-                    f"Level {level} split on {choice} (mean std={choice_stds[choice_index]}) has {len(enabled)} children for {sym.name}"
+                    f"Level {level} split on {choice} (loss={choice_losses[choice_index]}) has {len(enabled)} children for {sym.name}"
                 )
                 if len(enabled):
                     node.set_child(
-- 
cgit v1.2.3