2 files changed, 56 insertions, 2 deletions
diff --git a/bin/analyze-archive.py b/bin/analyze-archive.py
index da60212..c369cf8 100755
--- a/bin/analyze-archive.py
+++ b/bin/analyze-archive.py
@@ -46,7 +46,7 @@ from dfatool.loader import RawData, pta_trace_to_aggregate
 from dfatool.functions import gplearn_to_function
 from dfatool.model import PTAModel
 from dfatool.validation import CrossValidator
-from dfatool.utils import filter_aggregate_by_param
+from dfatool.utils import filter_aggregate_by_param, detect_outliers_in_aggregate
 from dfatool.automata import PTA
 
 
@@ -312,6 +312,17 @@ if __name__ == "__main__":
         help="Plot power trace for state or transition NAME. X axis is wrong for non-MIMOSA measurements",
     )
     parser.add_argument(
+        "--remove-outliers",
+        action="store_true",
+        help="Remove outliers exceeding the configured z score (default: 10)",
+    )
+    parser.add_argument(
+        "--z-score",
+        type=int,
+        default=10,
+        help="Configure z score for outlier detection (and optional removal)",
+    )
+    parser.add_argument(
         "--show-models",
         choices=["static", "paramdetection", "param", "all", "tex", "html"],
         help="static: show static model values as well as parameter detection heuristic.\n"
@@ -522,6 +533,9 @@ if __name__ == "__main__":
     )
 
     filter_aggregate_by_param(by_name, parameters, args.filter_param)
+    detect_outliers_in_aggregate(
+        by_name, z_limit=args.z_score, remove_outliers=args.remove_outliers
+    )
 
     model = PTAModel(
         by_name,
diff --git a/lib/utils.py b/lib/utils.py
index c8f31c2..560ab79 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -37,7 +37,7 @@ def human_readable(value, unit):
     for prefix, factor in (
         ("p", 1e-12),
         ("n", 1e-9),
-        (u"µ", 1e-6),
+        ("µ", 1e-6),
         ("m", 1e-3),
         ("", 1),
         ("k", 1e3),
@@ -268,6 +268,46 @@ def filter_aggregate_by_param(aggregate, parameters, parameter_filter):
             aggregate.pop(name)
 
 
+def detect_outliers_in_aggregate(aggregate, z_limit=10, remove_outliers=False):
+    """
+    Report or remove measurements whose z score exceeds `z_limit`.
+
+    For each `aggregate[name]`, every attribute listed in its "attributes"
+    entry is checked. Without `remove_outliers`, the outlier count is only
+    logged. With it, each index flagged for at least one attribute is dropped
+    from all attributes and from "param". `aggregate` is modified in place.
+    """
+    for name, entry in aggregate.items():
+        remove_mask = None
+        affected = list()
+        for attribute in entry["attributes"]:
+            data = np.asarray(entry[attribute])
+            if data.size == 0:
+                continue
+            # |x - mean| > z * std is |z score| > z without dividing by std,
+            # so constant data (std == 0) yields no outliers and no warning.
+            deviation = np.abs(data - np.mean(data))
+            outliers = deviation > z_limit * np.std(data)
+            if not np.any(outliers):
+                continue
+            if not remove_outliers:
+                count = np.count_nonzero(outliers)
+                logger.info(f"{name} {attribute} has {count} outliers")
+                continue
+            affected.append(attribute)
+            # Assumption: all attributes of `name` have the same length.
+            remove_mask = outliers if remove_mask is None else remove_mask | outliers
+        if remove_mask is None:
+            continue
+        indices = np.flatnonzero(remove_mask).tolist()
+        logger.info(
+            f"Removing outliers {indices} from {name}. Affected attributes: {affected}"
+        )
+        for attribute in entry["attributes"]:
+            entry[attribute] = np.asarray(entry[attribute])[~remove_mask]
+        entry["param"] = [p for p, drop in zip(entry["param"], remove_mask) if not drop]
+
+
 class OptionalTimingAnalysis:
     def __init__(self, enabled=True):
         self.enabled = enabled