From 96eb50d12ede52aebb4ef4c116c72cc9280111d8 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 13:20:07 +0200 Subject: analyse-archive: fixed typo; Also added symlink for windows --- bin/analyze-archive.py | 4 +++- bin/dfatool | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/analyze-archive.py b/bin/analyze-archive.py index cfb832f..4531d86 100755 --- a/bin/analyze-archive.py +++ b/bin/analyze-archive.py @@ -357,7 +357,9 @@ if __name__ == "__main__": if raw_data.version <= 1: data_source = "MIMOSA" elif raw_data.version == 2: - data_sourec = "MSP430 EnergyTrace" + data_source = "MSP430 EnergyTrace" + else: + data_source = "UNKNOWN" print(f" Data source ID: {raw_data.version} ({data_source})") preprocessed_data = raw_data.get_preprocessed_data() diff --git a/bin/dfatool b/bin/dfatool index dc598c5..3995af5 120000 --- a/bin/dfatool +++ b/bin/dfatool @@ -1 +1 @@ -../lib \ No newline at end of file +/mnt/c/Users/Janis/Documents/JANIS/UNI/BSc/Bachelorarbeit/aemr/dfatool/lib \ No newline at end of file -- cgit v1.2.3 From 1d2cf70216e3faf7b82d3b96df4bc3ad7cbca291 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 13:21:07 +0200 Subject: added .idea to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 25b1be5..91b6250 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ *.pyc /htmlcov/ /.coverage* +.idea/ -- cgit v1.2.3 From 8aedd0a2ec227b3bc0233ac136d46ff55c8e6af7 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 13:21:44 +0200 Subject: Initial commit ProofofConcept-WIP tool --- bin/Proof_Of_Concept_PELT.py | 297 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 bin/Proof_Of_Concept_PELT.py diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py new file mode 100644 index 0000000..643a368 --- /dev/null +++ b/bin/Proof_Of_Concept_PELT.py @@ -0,0 +1,297 @@ +def plot_data_from_json(filename, trace_num, xaxis, yaxis): + import matplotlib.pyplot as plt + import json + with open(filename, 'r') as f: + tx_data = json.load(f) + print(tx_data[trace_num]['parameter']) + plt.plot(tx_data[trace_num]['offline'][0]['uW']) + plt.xlabel(xaxis) + plt.ylabel(yaxis) + plt.show() + + +def plot_data_vs_mean(signal, xaxis, yaxis): + import matplotlib.pyplot as plt + from statistics import mean + plt.plot(signal) + average = mean(signal) + plt.hlines(average, 0, len(signal)) + plt.xlabel(xaxis) + plt.ylabel(yaxis) + plt.show() + + +def plot_data_vs_data_vs_means(signal1, signal2, xaxis, yaxis): + import matplotlib.pyplot as plt + from statistics import mean + plt.plot(signal1) + lens = max(len(signal1), len(signal2)) + average = mean(signal1) + plt.hlines(average, 0, lens, color='red') + plt.vlines(len(signal1), 0, 100000, color='red', linestyles='dashed') + plt.plot(signal2) + average = mean(signal2) + plt.hlines(average, 0, lens, color='green') + plt.vlines(len(signal2), 0, 100000, color='green', linestyles='dashed') + plt.xlabel(xaxis) + plt.ylabel(yaxis) + plt.show() + + +def get_bkps(algo, pen, q): + res = pen, len(algo.predict(pen=pen)) + q.put(pen) + return res + + +def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing', plotting=False): + from kneed import KneeLocator + kneedle = KneeLocator(data_x, data_y, S=S, curve=curve, direction=direction) + if plotting: + kneedle.plot_knee() + kneepoint = (kneedle.knee, kneedle.knee_y) + return kneepoint + + +def calc_PELT(signal, model='l1', jump=5, 
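
The find_knee_point helper above is a thin wrapper around kneed's KneeLocator, which implements the elbow/knee heuristic later used to choose the PELT penalty: as the penalty grows, the number of reported changepoints first drops steeply and then levels off, and the bend of that curve is taken as the penalty. A minimal, self-contained sketch on a synthetic curve (all values, including the knee position near 10, are invented for illustration):

    from kneed import KneeLocator

    pen_values = list(range(0, 51))
    # steep decrease up to penalty 10, then an almost flat tail -- the shape the
    # penalty sweep in calc_PELT produces
    num_changepoints = [200 - 19 * p for p in range(11)] + [10] * 40
    kneedle = KneeLocator(pen_values, num_changepoints,
                          S=1.0, curve='convex', direction='decreasing')
    print(kneedle.knee, kneedle.knee_y)  # expected to land near the bend at penalty 10
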
min_dist=2, range_min=1, range_max=50, num_processes=8, refresh_delay=1, + refresh_thresh=5, S=1.0, pen_override=None, plotting=False): + import ruptures as rpt + import time + import matplotlib.pylab as plt + from multiprocessing import Pool, Manager + + # default params in Function + if model is None: + model = 'l1' + if jump is None: + jump = 5 + if min_dist is None: + min_dist = 2 + if range_min is None: + range_min = 1 + if range_max is None: + range_max = 50 + if num_processes is None: + num_processes = 8 + if refresh_delay is None: + refresh_delay = 1 + if refresh_thresh is None: + refresh_thresh = 5 + if S is None: + S = 1.0 + if plotting is None: + plotting = False + + # change point detection. best fit seemingly with l1. rbf prods. RuntimeErr for pen > 30 + # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html + # model = "l1" #"l1" # "l2", "rbf" + algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal) + + ### CALC BKPS WITH DIFF PENALTYS + if pen_override is None: + # building args array for parallelizing + args = [] + # for displaying progression + m = Manager() + q = m.Queue() + + for i in range(range_min, range_max): + args.append((algo, i, q)) + + print('starting kneepoint calculation') + # init Pool with num_proesses + with Pool(num_processes) as p: + # collect results from pool + result = p.starmap_async(get_bkps, args) + # monitor loop + last_percentage = -1 + percentage = -100 # Force display of 0% + i = 0 + while True: + if result.ready(): + break + else: + size = q.qsize() + last_percentage = percentage + percentage = round(size / (range_max - range_min) * 100, 2) + if percentage >= last_percentage + 2 or i >= refresh_thresh: + print('Current progress: ' + str(percentage) + '%') + i = 0 + else: + i += 1 + time.sleep(refresh_delay) + res = result.get() + + # DECIDE WHICH PENALTY VALUE TO CHOOSE ACCORDING TO ELBOW/KNEE APPROACH + # split x and y coords to pass to kneedle + pen_val = [x[0] for x in res] + fittet_bkps_val = [x[1] for x in res] + # # plot to look at res + + knee = find_knee_point(pen_val, fittet_bkps_val, S=S, plotting=plotting) + plt.xlabel('Penalty') + plt.ylabel('Number of Changepoints') + plt.plot(pen_val, fittet_bkps_val) + plt.vlines(knee[0], 0, max(fittet_bkps_val), linestyles='dashed') + print("knee: " + str(knee[0])) + plt.show() + else: + # use forced pen value for plotting + knee = (pen_override, None) + + + #plt.plot(pen_val, fittet_bkps_val) + if knee[0] is not None: + bkps = algo.predict(pen=knee[0]) + if plotting: + fig, ax = rpt.display(signal, bkps) + plt.show() + return bkps + else: + print('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') + + +if __name__ == '__main__': + import numpy as np + import json + import ruptures as rpt + import matplotlib.pylab as plt + import sys + import getopt + import re + from dfatool.dfatool import RawData + opt = dict() + + optspec = ( + "filename= " + "v " + "model= " + "jump= " + "min_dist= " + "range_min= " + "range_max= " + "num_processes= " + "refresh_delay= " + "refresh_thresh= " + "S= " + "pen_override= " + "plotting= " + ) + opt_filename = None + opt_verbose = False + opt_model = None + opt_jump = None + opt_min_dist = None + opt_range_min = None + opt_range_max = None + opt_num_processes = None + opt_refresh_delay = None + opt_refresh_thresh = None + opt_S = None + opt_pen_override = None + opt_plotting = False + try: + raw_opts, args = getopt.getopt(sys.argv[1:], "", optspec.split(" ")) + + for option, 
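
calc_PELT delegates the actual changepoint search to ruptures' Pelt estimator and only adds the parallel penalty sweep and knee selection around it. A minimal sketch of the underlying ruptures call on a synthetic two-level signal (signal shape, noise level and the penalty of 10 are made up; ruptures reports the end index of every segment, so the last breakpoint equals the signal length):

    import numpy as np
    import ruptures as rpt

    rng = np.random.default_rng(0)
    # 300 noisy samples around 10, followed by 300 noisy samples around 35
    signal = np.concatenate([rng.normal(10, 1, 300), rng.normal(35, 1, 300)])

    algo = rpt.Pelt(model='l1', jump=5, min_size=2).fit(signal)
    bkps = algo.predict(pen=10)
    print(bkps)  # something like [300, 600] -- one changepoint plus the end of the signal
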
parameter in raw_opts: + optname = re.sub(r"^--", "", option) + opt[optname] = parameter + + if 'filename' not in opt: + print("No file specified!", file=sys.stderr) + sys.exit(2) + else: + opt_filename = opt['filename'] + if 'v' in opt: + opt_verbose = True + opt_plotting = True + if 'model' in opt: + opt_model = opt['model'] + if 'jump' in opt: + try: + opt_jump = int(opt['jump']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'min_dist' in opt: + try: + opt_min_dist = int(opt['min_dist']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'range_min' in opt: + try: + opt_range_min = int(opt['range_min']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'range_max' in opt: + try: + opt_range_max = int(opt['range_max']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'num_processes' in opt: + try: + opt_num_processes = int(opt['num_processes']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'refresh_delay' in opt: + try: + opt_refresh_delay = int(opt['refresh_delay']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'refresh_thresh' in opt: + try: + opt_refresh_thresh = int(opt['refresh_thresh']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'S' in opt: + try: + opt_S = float(opt['S']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'pen_override' in opt: + try: + opt_pen_override = int(opt['pen_override']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + except getopt.GetoptError as err: + print(err, file=sys.stderr) + sys.exit(2) + + if ".json" in opt_filename: + # open file with trace data from json + with open(opt['filename'], 'r') as f: + tx_data = json.load(f) + elif ".tar" in opt_filename: + # open with dfatool + raw_data_args = list() + raw_data_args.append(opt_filename) + raw_data = RawData( + raw_data_args, with_traces=True + ) + print("Preprocessing file. Depending on its size, this could take a while.") + preprocessed_data = raw_data.get_preprocessed_data() + print("File fully preprocessed") + + else: + print("Unknown dataformat", file=sys.stderr) + sys.exit(2) + + print(tx_data[1]['parameter']) + # parse json to array for PELT + signal = np.array(tx_data[1]['offline'][0]['uW']) + + for i in range(0, len(signal)): + signal[i] = signal[i]/1000 + bkps = calc_PELT(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) + fig, ax = rpt.display(signal, bkps) + plt.xlabel('Time [us]') + plt.ylabel('Power [mW]') + plt.show() -- cgit v1.2.3 From 2c50b0996563ae2eb313b3d74f762e50c8ca9f6a Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 17:48:24 +0200 Subject: Proof_Of_Concept_Pelt - Implementation of decision whether to refine state or skip it --- bin/Proof_Of_Concept_PELT.py | 137 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 124 insertions(+), 13 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 643a368..2ed7675 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -151,6 +151,89 @@ def calc_PELT(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, print('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') +# very short benchmark yielded approx. 
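
For the JSON input path, the script assumes a list of state entries, each with a 'parameter' dict and a list of 'offline' measurements whose 'uW' field holds the sampled power trace; this layout is inferred from the field accesses in the code. A small self-contained sketch of that assumed layout (the state name, parameter names and all numbers are invented):

    import json

    example = [
        {
            'name': 'TX',
            'parameter': {'datarate': 250, 'txpower': 0},
            'offline': [
                {'uW': [980.0, 1012.5, 20400.0, 20385.2, 1001.3],
                 'uW_mean': 8955.8, 'uW_std': 9541.2},
            ],
        }
    ]
    with open('example_trace.json', 'w') as f:
        json.dump(example, f)

    with open('example_trace.json', 'r') as f:
        tx_data = json.load(f)
    print(tx_data[0]['parameter'], tx_data[0]['offline'][0]['uW'])
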
1/3 of speed compared to solution with sorting +def needs_refinement_no_sort(signal, mean, thresh): + # linear search for the top 10%/ bottom 10% + # should be sufficient + length_of_signal = len(signal) + percentile_size = int() + percentile_size = length_of_signal // 100 + upper_percentile = [None] * percentile_size + lower_percentile = [None] * percentile_size + fill_index_upper = percentile_size - 1 + fill_index_lower = percentile_size - 1 + index_smallest_val = fill_index_upper + index_largest_val = fill_index_lower + + for x in signal: + if x > mean: + # will be in upper percentile + if fill_index_upper >= 0: + upper_percentile[fill_index_upper] = x + if x < upper_percentile[index_smallest_val]: + index_smallest_val = fill_index_upper + fill_index_upper = fill_index_upper - 1 + continue + + if x > upper_percentile[index_smallest_val]: + # replace smallest val. Find next smallest val + upper_percentile[index_smallest_val] = x + index_smallest_val = 0 + i = 0 + for y in upper_percentile: + if upper_percentile[i] < upper_percentile[index_smallest_val]: + index_smallest_val = i + i = i + 1 + + else: + if fill_index_lower >= 0: + lower_percentile[fill_index_lower] = x + if x > lower_percentile[index_largest_val]: + index_largest_val = fill_index_upper + fill_index_lower = fill_index_lower - 1 + continue + if x < lower_percentile[index_largest_val]: + # replace smallest val. Find next smallest val + lower_percentile[index_largest_val] = x + index_largest_val = 0 + i = 0 + for y in lower_percentile: + if lower_percentile[i] > lower_percentile[index_largest_val]: + index_largest_val = i + i = i + 1 + + # should have the percentiles + lower_percentile_mean = np.mean(lower_percentile) + upper_percentile_mean = np.mean(upper_percentile) + dist = mean - lower_percentile_mean + if dist > thresh: + return True + dist = upper_percentile_mean - mean + if dist > thresh: + return True + return False + + +# Very short benchmark yielded approx. 
3 times the speed of solution not using sort +def needs_refinement_sort(signal, thresh): + sorted_signal = sorted(signal) + length_of_signal = len(signal) + percentile_size = int() + percentile_size = length_of_signal // 100 + lower_percentile = sorted_signal[0:percentile_size] + upper_percentile = sorted_signal[length_of_signal - percentile_size : length_of_signal] + lower_percentile_mean = np.mean(lower_percentile) + upper_percentile_mean = np.mean(upper_percentile) + median = np.median(sorted_signal) + dist = median - lower_percentile_mean + if dist > thresh: + return True + dist = upper_percentile_mean - median + if dist > thresh: + return True + return False + + if __name__ == '__main__': import numpy as np import json @@ -160,6 +243,7 @@ if __name__ == '__main__': import getopt import re from dfatool.dfatool import RawData + # OPTION RECOGNITION opt = dict() optspec = ( @@ -176,6 +260,7 @@ if __name__ == '__main__': "S= " "pen_override= " "plotting= " + "refinement_thresh= " ) opt_filename = None opt_verbose = False @@ -190,6 +275,7 @@ if __name__ == '__main__': opt_S = None opt_pen_override = None opt_plotting = False + opt_refinement_thresh = None try: raw_opts, args = getopt.getopt(sys.argv[1:], "", optspec.split(" ")) @@ -261,14 +347,38 @@ if __name__ == '__main__': except ValueError as verr: print(verr, file=sys.stderr) sys.exit(2) + if 'refinement_thresh' in opt: + try: + opt_refinement_thresh = int(opt['refinement_thresh']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) except getopt.GetoptError as err: print(err, file=sys.stderr) sys.exit(2) + #OPENING DATA + import time if ".json" in opt_filename: # open file with trace data from json - with open(opt['filename'], 'r') as f: - tx_data = json.load(f) + print("[INFO] Will only refine the state which is present in " + opt_filename + " if necessary.") + with open(opt_filename, 'r') as f: + states = json.load(f) + # loop through all traces check if refinement is necessary + print("Checking if refinement is necessary...") + res = False + for measurements_by_state in states: + # loop through all occurrences of the looked at state + print("Looking at state '" + measurements_by_state['name'] + "'") + for measurement in measurements_by_state['offline']: + # loop through measurements of particular state + # an check if state needs refinement + signal = measurement['uW'] + # mean = measurement['uW_mean'] + # TODO: Decide if median is really the better baseline than mean + if needs_refinement_sort(signal, opt_refinement_thresh): + print("Refinement is necessary!") + break elif ".tar" in opt_filename: # open with dfatool raw_data_args = list() @@ -280,18 +390,19 @@ if __name__ == '__main__': preprocessed_data = raw_data.get_preprocessed_data() print("File fully preprocessed") + # TODO: Mal schauen, wie ich das mache. 
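
Both refinement checks above make the same decision: take the lowest and the highest length_of_signal // 100 samples (i.e. 1% tails, despite the "top 10%/ bottom 10%" wording in the comment), average each tail, and request refinement if either tail mean lies more than thresh away from the baseline (mean or median). A compact numpy variant of that test, shown only as an illustration (the function name, the constructed signal and the threshold are made up):

    import numpy as np

    def needs_refinement_np(signal, thresh):
        signal = np.sort(np.asarray(signal))
        k = max(1, len(signal) // 100)      # 1% tails as above (guarded for very short signals)
        lower_mean = signal[:k].mean()      # mean of the lowest 1% of samples
        upper_mean = signal[-k:].mean()     # mean of the highest 1% of samples
        median = np.median(signal)
        return (median - lower_mean) > thresh or (upper_mean - median) > thresh

    # a state whose power alternates between two plateaus should trigger refinement
    example = np.concatenate([np.full(500, 1000.0), np.full(500, 2000.0)])
    print(needs_refinement_np(example, thresh=100))  # True for this constructed signal
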
Erstmal nur mit json else: print("Unknown dataformat", file=sys.stderr) sys.exit(2) - print(tx_data[1]['parameter']) - # parse json to array for PELT - signal = np.array(tx_data[1]['offline'][0]['uW']) - - for i in range(0, len(signal)): - signal[i] = signal[i]/1000 - bkps = calc_PELT(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) - fig, ax = rpt.display(signal, bkps) - plt.xlabel('Time [us]') - plt.ylabel('Power [mW]') - plt.show() + # print(tx_data[1]['parameter']) + # # parse json to array for PELT + # signal = np.array(tx_data[1]['offline'][0]['uW']) + # + # for i in range(0, len(signal)): + # signal[i] = signal[i]/1000 + # bkps = calc_PELT(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) + # fig, ax = rpt.display(signal, bkps) + # plt.xlabel('Time [us]') + # plt.ylabel('Power [mW]') + # plt.show() -- cgit v1.2.3 From e790c0ff3372b153c582b4adfc7f06a5ba86b5f6 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 17:51:23 +0200 Subject: Proof_Of_Concept_PELT - renamed decision function --- bin/Proof_Of_Concept_PELT.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 2ed7675..6912b02 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -215,7 +215,7 @@ def needs_refinement_no_sort(signal, mean, thresh): # Very short benchmark yielded approx. 3 times the speed of solution not using sort -def needs_refinement_sort(signal, thresh): +def needs_refinement(signal, thresh): sorted_signal = sorted(signal) length_of_signal = len(signal) percentile_size = int() @@ -376,7 +376,7 @@ if __name__ == '__main__': signal = measurement['uW'] # mean = measurement['uW_mean'] # TODO: Decide if median is really the better baseline than mean - if needs_refinement_sort(signal, opt_refinement_thresh): + if needs_refinement(signal, opt_refinement_thresh): print("Refinement is necessary!") break elif ".tar" in opt_filename: -- cgit v1.2.3 From 9075b8ffdbf15425e290747603450438513bca0c Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 18:09:20 +0200 Subject: Proof_Of_Concept_PELT - Code aufgeräumt / Imports am Modulanfang / Typos fixed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 77 +++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 6912b02..452ff3f 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -1,40 +1,47 @@ -def plot_data_from_json(filename, trace_num, xaxis, yaxis): - import matplotlib.pyplot as plt - import json +import matplotlib.pyplot as plt +import json +from kneed import KneeLocator +import ruptures as rpt +import time +from multiprocessing import Pool, Manager +import numpy as np +import sys +import getopt +import re +from dfatool.dfatool import RawData + + +def plot_data_from_json(filename, trace_num, x_axis, y_axis): with open(filename, 'r') as f: tx_data = json.load(f) print(tx_data[trace_num]['parameter']) plt.plot(tx_data[trace_num]['offline'][0]['uW']) - plt.xlabel(xaxis) - plt.ylabel(yaxis) + plt.xlabel(x_axis) + plt.ylabel(y_axis) plt.show() -def plot_data_vs_mean(signal, xaxis, yaxis): - import matplotlib.pyplot as plt - from statistics import mean +def plot_data_vs_mean(signal, x_axis, y_axis): 
plt.plot(signal) - average = mean(signal) + average = np.mean(signal) plt.hlines(average, 0, len(signal)) - plt.xlabel(xaxis) - plt.ylabel(yaxis) + plt.xlabel(x_axis) + plt.ylabel(y_axis) plt.show() -def plot_data_vs_data_vs_means(signal1, signal2, xaxis, yaxis): - import matplotlib.pyplot as plt - from statistics import mean +def plot_data_vs_data_vs_means(signal1, signal2, x_axis, y_axis): plt.plot(signal1) lens = max(len(signal1), len(signal2)) - average = mean(signal1) + average = np.mean(signal1) plt.hlines(average, 0, lens, color='red') plt.vlines(len(signal1), 0, 100000, color='red', linestyles='dashed') plt.plot(signal2) - average = mean(signal2) + average = np.mean(signal2) plt.hlines(average, 0, lens, color='green') plt.vlines(len(signal2), 0, 100000, color='green', linestyles='dashed') - plt.xlabel(xaxis) - plt.ylabel(yaxis) + plt.xlabel(x_axis) + plt.ylabel(y_axis) plt.show() @@ -45,7 +52,6 @@ def get_bkps(algo, pen, q): def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing', plotting=False): - from kneed import KneeLocator kneedle = KneeLocator(data_x, data_y, S=S, curve=curve, direction=direction) if plotting: kneedle.plot_knee() @@ -53,13 +59,8 @@ def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing return kneepoint -def calc_PELT(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, num_processes=8, refresh_delay=1, +def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, pen_override=None, plotting=False): - import ruptures as rpt - import time - import matplotlib.pylab as plt - from multiprocessing import Pool, Manager - # default params in Function if model is None: model = 'l1' @@ -104,7 +105,6 @@ def calc_PELT(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, # collect results from pool result = p.starmap_async(get_bkps, args) # monitor loop - last_percentage = -1 percentage = -100 # Force display of 0% i = 0 while True: @@ -125,22 +125,21 @@ def calc_PELT(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, # DECIDE WHICH PENALTY VALUE TO CHOOSE ACCORDING TO ELBOW/KNEE APPROACH # split x and y coords to pass to kneedle pen_val = [x[0] for x in res] - fittet_bkps_val = [x[1] for x in res] + fitted_bkps_val = [x[1] for x in res] # # plot to look at res - knee = find_knee_point(pen_val, fittet_bkps_val, S=S, plotting=plotting) + knee = find_knee_point(pen_val, fitted_bkps_val, S=S, plotting=plotting) plt.xlabel('Penalty') plt.ylabel('Number of Changepoints') - plt.plot(pen_val, fittet_bkps_val) - plt.vlines(knee[0], 0, max(fittet_bkps_val), linestyles='dashed') + plt.plot(pen_val, fitted_bkps_val) + plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') print("knee: " + str(knee[0])) plt.show() else: # use forced pen value for plotting knee = (pen_override, None) - - #plt.plot(pen_val, fittet_bkps_val) + # plt.plot(pen_val, fittet_bkps_val) if knee[0] is not None: bkps = algo.predict(pen=knee[0]) if plotting: @@ -215,6 +214,7 @@ def needs_refinement_no_sort(signal, mean, thresh): # Very short benchmark yielded approx. 
3 times the speed of solution not using sort +# TODO: Decide whether median is really the better baseline than mean def needs_refinement(signal, thresh): sorted_signal = sorted(signal) length_of_signal = len(signal) @@ -235,14 +235,6 @@ def needs_refinement(signal, thresh): if __name__ == '__main__': - import numpy as np - import json - import ruptures as rpt - import matplotlib.pylab as plt - import sys - import getopt - import re - from dfatool.dfatool import RawData # OPTION RECOGNITION opt = dict() @@ -357,8 +349,7 @@ if __name__ == '__main__': print(err, file=sys.stderr) sys.exit(2) - #OPENING DATA - import time + # OPENING DATA if ".json" in opt_filename: # open file with trace data from json print("[INFO] Will only refine the state which is present in " + opt_filename + " if necessary.") @@ -401,7 +392,7 @@ if __name__ == '__main__': # # for i in range(0, len(signal)): # signal[i] = signal[i]/1000 - # bkps = calc_PELT(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) + # bkps = calc_pelt(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) # fig, ax = rpt.display(signal, bkps) # plt.xlabel('Time [us]') # plt.ylabel('Power [mW]') -- cgit v1.2.3 From 23a07bf5da14980aeadf7c0e12b422117b3680bc Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sun, 5 Jul 2020 17:29:31 +0200 Subject: bin/Proof_of_Concept_PELT: States are now calculated per Measurement per State-config. Some statistics are calculated for that. Clustering pending --- bin/Proof_Of_Concept_PELT.py | 130 +++++++++++++++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 29 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 452ff3f..d4878c1 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -59,7 +59,7 @@ def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing return kneepoint -def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, num_processes=8, refresh_delay=1, +def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, pen_override=None, plotting=False): # default params in Function if model is None: @@ -69,7 +69,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, if min_dist is None: min_dist = 2 if range_min is None: - range_min = 1 + range_min = 0 if range_max is None: range_max = 50 if num_processes is None: @@ -82,24 +82,23 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, S = 1.0 if plotting is None: plotting = False - # change point detection. best fit seemingly with l1. rbf prods. 
RuntimeErr for pen > 30 # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html # model = "l1" #"l1" # "l2", "rbf" algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal) ### CALC BKPS WITH DIFF PENALTYS - if pen_override is None: + if pen_override is None and range_max != range_min: # building args array for parallelizing args = [] # for displaying progression m = Manager() q = m.Queue() - for i in range(range_min, range_max): + for i in range(range_min, range_max + 1): args.append((algo, i, q)) - print('starting kneepoint calculation') + print('[INFO]starting kneepoint calculation.') # init Pool with num_proesses with Pool(num_processes) as p: # collect results from pool @@ -115,30 +114,32 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, last_percentage = percentage percentage = round(size / (range_max - range_min) * 100, 2) if percentage >= last_percentage + 2 or i >= refresh_thresh: - print('Current progress: ' + str(percentage) + '%') + print('[INFO]Current progress: ' + str(percentage) + '%') i = 0 else: i += 1 time.sleep(refresh_delay) res = result.get() - + print_info("Finished kneepoint calculation.") # DECIDE WHICH PENALTY VALUE TO CHOOSE ACCORDING TO ELBOW/KNEE APPROACH # split x and y coords to pass to kneedle pen_val = [x[0] for x in res] fitted_bkps_val = [x[1] for x in res] # # plot to look at res - knee = find_knee_point(pen_val, fitted_bkps_val, S=S, plotting=plotting) - plt.xlabel('Penalty') - plt.ylabel('Number of Changepoints') - plt.plot(pen_val, fitted_bkps_val) - plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') - print("knee: " + str(knee[0])) - plt.show() + # plt.xlabel('Penalty') + # plt.ylabel('Number of Changepoints') + # plt.plot(pen_val, fitted_bkps_val) + # plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') + # print("knee: " + str(knee[0])) + # plt.show() else: - # use forced pen value for plotting - knee = (pen_override, None) - + # use forced pen value for plotting if specified. Else use only pen in range + if pen_override is not None: + knee = (pen_override, None) + else: + knee = (range_min, None) + print_info("" + str(knee[0]) + " has been selected as kneepoint.") # plt.plot(pen_val, fittet_bkps_val) if knee[0] is not None: bkps = algo.predict(pen=knee[0]) @@ -147,7 +148,8 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, plt.show() return bkps else: - print('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') + print_error('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') + exit() # very short benchmark yielded approx. 
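
The penalty sweep above fans algo.predict(pen=i) out over a multiprocessing Pool and uses the Manager queue only as a progress counter: every finished worker puts one item, and the monitor loop merely reads qsize(). The same pattern reduced to a dummy task, as a self-contained sketch (task, sleep times and worker count are arbitrary):

    import time
    from multiprocessing import Pool, Manager

    def slow_task(x, q):
        time.sleep(0.1)      # stand-in for algo.predict(pen=x)
        q.put(x)             # progress ticket; the value itself is never read
        return x, x * x

    if __name__ == '__main__':
        with Manager() as manager:
            queue = manager.Queue()
            jobs = [(i, queue) for i in range(40)]
            with Pool(4) as pool:
                result = pool.starmap_async(slow_task, jobs)
                while not result.ready():
                    print('progress: ' + str(queue.qsize()) + '/' + str(len(jobs)))
                    time.sleep(0.2)
                print(result.get()[:3])
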
1/3 of speed compared to solution with sorting @@ -221,7 +223,7 @@ def needs_refinement(signal, thresh): percentile_size = int() percentile_size = length_of_signal // 100 lower_percentile = sorted_signal[0:percentile_size] - upper_percentile = sorted_signal[length_of_signal - percentile_size : length_of_signal] + upper_percentile = sorted_signal[length_of_signal - percentile_size: length_of_signal] lower_percentile_mean = np.mean(lower_percentile) upper_percentile_mean = np.mean(upper_percentile) median = np.median(sorted_signal) @@ -234,6 +236,18 @@ def needs_refinement(signal, thresh): return False +def print_info(str): + print("[INFO]" + str) + + +def print_warning(str): + print("[WARNING]" + str) + + +def print_error(str): + print("ERROR" + str, file=sys.stderr) + + if __name__ == '__main__': # OPTION RECOGNITION opt = dict() @@ -276,7 +290,7 @@ if __name__ == '__main__': opt[optname] = parameter if 'filename' not in opt: - print("No file specified!", file=sys.stderr) + print_error("No file specified!") sys.exit(2) else: opt_filename = opt['filename'] @@ -352,15 +366,16 @@ if __name__ == '__main__': # OPENING DATA if ".json" in opt_filename: # open file with trace data from json - print("[INFO] Will only refine the state which is present in " + opt_filename + " if necessary.") + print_info(" Will only refine the state which is present in " + opt_filename + " if necessary.") with open(opt_filename, 'r') as f: states = json.load(f) # loop through all traces check if refinement is necessary - print("Checking if refinement is necessary...") - res = False + print_info("Checking if refinement is necessary...") for measurements_by_state in states: # loop through all occurrences of the looked at state - print("Looking at state '" + measurements_by_state['name'] + "'") + print_info("Looking at state '" + measurements_by_state['name'] + "' with params: " + + str(measurements_by_state['parameter'])) + refine = False for measurement in measurements_by_state['offline']: # loop through measurements of particular state # an check if state needs refinement @@ -368,8 +383,65 @@ if __name__ == '__main__': # mean = measurement['uW_mean'] # TODO: Decide if median is really the better baseline than mean if needs_refinement(signal, opt_refinement_thresh): - print("Refinement is necessary!") + print_info("Refinement is necessary!") + refine = True break + if not refine: + print_info("No refinement necessary for state '" + measurements_by_state['name'] + "'") + else: + # calc and save all bkpts for the given state and param config + state_list = list() + for measurement in measurements_by_state['offline']: + signal = np.array(measurement['uW']) + normed_signal = np.zeros(shape=len(signal)) + for i in range(0, len(signal)): + normed_signal[i] = signal[i] / 1000 + bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, range_max=opt_range_max, + num_processes=opt_num_processes, jump=opt_jump, S=opt_S, + pen_override=opt_pen_override) + calced_states = list() + start_time = 0 + end_time = 0 + for bkpt in bkpts: + # start_time of state is end_time of previous one(Transitions are instantaneous) + start_time = end_time + end_time = bkpt + power_vals = signal[start_time: end_time] + mean_power = np.mean(power_vals) + std_dev = np.std(power_vals) + calced_state = (start_time, end_time, mean_power, std_dev) + calced_states.append(calced_state) + num = 0 + new_avg_std = 0 + for s in calced_states: + print_info("State " + str(num) + " starts at t=" + str(s[0]) + " and ends at t=" + str(s[1]) + + " 
while using " + str(s[2]) + "uW with sigma=" + str(s[3])) + num = num + 1 + new_avg_std = new_avg_std + s[3] + new_avg_std = new_avg_std / len(calced_states) + change_avg_std = measurement['uW_std'] - new_avg_std + print_info("The average standard deviation for the newly found states is " + str(new_avg_std) + + ".\n[INFO]That is a reduction of " + str(change_avg_std)) + state_list.append(calced_states) + num_states_array = np.zeros(shape=len(measurements_by_state['offline'])) + i = 0 + for x in state_list: + num_states_array[i] = len(x) + i = i + 1 + avg_num_states = np.mean(num_states_array) + num_states_dev = np.std(num_states_array) + print_info("On average " + str(avg_num_states) + " States have been found. The standard deviation" + + " is " + str(num_states_dev)) + # TODO: MAGIC NUMBER + if num_states_dev > 1: + print_warning("The number of states varies strongly across measurements. Consider choosing a " + "larger value for S.") + time.sleep(5) + # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? + # Einfach Durchschnitt nehmen? + # TODO: TESTING PURPOSES + exit() + elif ".tar" in opt_filename: # open with dfatool raw_data_args = list() @@ -377,13 +449,13 @@ if __name__ == '__main__': raw_data = RawData( raw_data_args, with_traces=True ) - print("Preprocessing file. Depending on its size, this could take a while.") + print_info("Preprocessing file. Depending on its size, this could take a while.") preprocessed_data = raw_data.get_preprocessed_data() - print("File fully preprocessed") + print_info("File fully preprocessed") # TODO: Mal schauen, wie ich das mache. Erstmal nur mit json else: - print("Unknown dataformat", file=sys.stderr) + print_error("Unknown dataformat") sys.exit(2) # print(tx_data[1]['parameter']) -- cgit v1.2.3 From bd1b0c578ab7049f2826c653831d700caa59f7ac Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sun, 5 Jul 2020 17:35:06 +0200 Subject: save unixsymlink --- bin/dfatool(UNIX) | 1 + 1 file changed, 1 insertion(+) create mode 120000 bin/dfatool(UNIX) diff --git a/bin/dfatool(UNIX) b/bin/dfatool(UNIX) new file mode 120000 index 0000000..dc598c5 --- /dev/null +++ b/bin/dfatool(UNIX) @@ -0,0 +1 @@ +../lib \ No newline at end of file -- cgit v1.2.3 From dd5533aca7cd8d13e23c49e3dd81141347a51dfb Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sun, 5 Jul 2020 17:47:05 +0200 Subject: bin/: Auf einmal funktioniert auch der UNIX Symlink für Windows... Nehm ich wohl. Aber wieso nicht gleich so? MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/dfatool | 2 +- bin/dfatool(UNIX) | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 120000 bin/dfatool(UNIX) diff --git a/bin/dfatool b/bin/dfatool index 3995af5..dc598c5 120000 --- a/bin/dfatool +++ b/bin/dfatool @@ -1 +1 @@ -/mnt/c/Users/Janis/Documents/JANIS/UNI/BSc/Bachelorarbeit/aemr/dfatool/lib \ No newline at end of file +../lib \ No newline at end of file diff --git a/bin/dfatool(UNIX) b/bin/dfatool(UNIX) deleted file mode 120000 index dc598c5..0000000 --- a/bin/dfatool(UNIX) +++ /dev/null @@ -1 +0,0 @@ -../lib \ No newline at end of file -- cgit v1.2.3 From bb19cc60ffad666afb7970fc36de2093be445166 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Mon, 6 Jul 2020 18:51:33 +0200 Subject: bin/Proof_Of_Concept_PELT: Added initial clustering via sklearn AgglomerativeClustering with affinity euclidean and ward linkage. added pen_modifier option, with which the penalty can be manipulated. e.g. 
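
The per-state bookkeeping introduced above turns each breakpoint list into (start_time, end_time, mean_power, std_dev) tuples and compares the averaged per-segment standard deviation with the original per-state uW_std. The same bookkeeping in isolation, as a sketch with invented numbers (the breakpoint list follows the ruptures convention of naming each segment's end index):

    import numpy as np

    signal = np.concatenate([np.full(300, 10.0), np.full(200, 35.0), np.full(100, 12.0)])
    bkps = [300, 500, 600]   # end index of each segment; the last entry equals len(signal)

    segments = []
    start = 0
    for end in bkps:
        chunk = signal[start:end]
        segments.append((start, end, float(np.mean(chunk)), float(np.std(chunk))))
        start = end          # transitions are treated as instantaneous

    for seg in segments:
        print(seg)
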
Doubled, halved or otherwise modified --- bin/Proof_Of_Concept_PELT.py | 111 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 14 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index d4878c1..80f7c04 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -10,6 +10,11 @@ import getopt import re from dfatool.dfatool import RawData +from sklearn.cluster import AgglomerativeClustering +from scipy.cluster.hierarchy import dendrogram, linkage + +# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=10 --refinement_thresh=100 + def plot_data_from_json(filename, trace_num, x_axis, y_axis): with open(filename, 'r') as f: @@ -60,7 +65,7 @@ def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, - refresh_thresh=5, S=1.0, pen_override=None, plotting=False): + refresh_thresh=5, S=1.0, pen_override=None, pen_modifier=None, plotting=False): # default params in Function if model is None: model = 'l1' @@ -82,6 +87,8 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, S = 1.0 if plotting is None: plotting = False + if pen_modifier is None: + pen_modifier = 1 # change point detection. best fit seemingly with l1. rbf prods. RuntimeErr for pen > 30 # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html # model = "l1" #"l1" # "l2", "rbf" @@ -98,7 +105,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, for i in range(range_min, range_max + 1): args.append((algo, i, q)) - print('[INFO]starting kneepoint calculation.') + print_info('starting kneepoint calculation.') # init Pool with num_proesses with Pool(num_processes) as p: # collect results from pool @@ -114,7 +121,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, last_percentage = percentage percentage = round(size / (range_max - range_min) * 100, 2) if percentage >= last_percentage + 2 or i >= refresh_thresh: - print('[INFO]Current progress: ' + str(percentage) + '%') + print_info('Current progress: ' + str(percentage) + '%') i = 0 else: i += 1 @@ -133,6 +140,8 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, # plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') # print("knee: " + str(knee[0])) # plt.show() + # modify knee according to options. Defaults to 1 * knee + knee = (knee[0] * pen_modifier, knee[1]) else: # use forced pen value for plotting if specified. 
Else use only pen in range if pen_override is not None: @@ -237,15 +246,21 @@ def needs_refinement(signal, thresh): def print_info(str): - print("[INFO]" + str) + str_lst = str.split(sep='\n') + for str in str_lst: + print("[INFO]" + str) def print_warning(str): - print("[WARNING]" + str) + str_lst = str.split(sep='\n') + for str in str_lst: + print("[WARNING]" + str) def print_error(str): - print("ERROR" + str, file=sys.stderr) + str_lst = str.split(sep='\n') + for str in str_lst: + print("[ERROR]" + str, file=sys.stderr) if __name__ == '__main__': @@ -265,6 +280,7 @@ if __name__ == '__main__': "refresh_thresh= " "S= " "pen_override= " + "pen_modifier= " "plotting= " "refinement_thresh= " ) @@ -280,6 +296,7 @@ if __name__ == '__main__': opt_refresh_thresh = None opt_S = None opt_pen_override = None + opt_pen_modifier = None opt_plotting = False opt_refinement_thresh = None try: @@ -353,6 +370,12 @@ if __name__ == '__main__': except ValueError as verr: print(verr, file=sys.stderr) sys.exit(2) + if 'pen_modifier' in opt: + try: + opt_pen_modifier = float(opt['pen_modifier']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) if 'refinement_thresh' in opt: try: opt_refinement_thresh = int(opt['refinement_thresh']) @@ -390,7 +413,7 @@ if __name__ == '__main__': print_info("No refinement necessary for state '" + measurements_by_state['name'] + "'") else: # calc and save all bkpts for the given state and param config - state_list = list() + raw_states_list = list() for measurement in measurements_by_state['offline']: signal = np.array(measurement['uW']) normed_signal = np.zeros(shape=len(signal)) @@ -398,7 +421,7 @@ if __name__ == '__main__': normed_signal[i] = signal[i] / 1000 bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S, - pen_override=opt_pen_override) + pen_override=opt_pen_override, pen_modifier=opt_pen_modifier) calced_states = list() start_time = 0 end_time = 0 @@ -420,12 +443,12 @@ if __name__ == '__main__': new_avg_std = new_avg_std + s[3] new_avg_std = new_avg_std / len(calced_states) change_avg_std = measurement['uW_std'] - new_avg_std - print_info("The average standard deviation for the newly found states is " + str(new_avg_std) - + ".\n[INFO]That is a reduction of " + str(change_avg_std)) - state_list.append(calced_states) - num_states_array = np.zeros(shape=len(measurements_by_state['offline'])) + print_info("The average standard deviation for the newly found states is " + str(new_avg_std)) + print_info("That is a reduction of " + str(change_avg_std)) + raw_states_list.append(calced_states) + num_states_array = [int()] * len(raw_states_list) i = 0 - for x in state_list: + for x in raw_states_list: num_states_array[i] = len(x) i = i + 1 avg_num_states = np.mean(num_states_array) @@ -435,10 +458,70 @@ if __name__ == '__main__': # TODO: MAGIC NUMBER if num_states_dev > 1: print_warning("The number of states varies strongly across measurements. Consider choosing a " - "larger value for S.") + "larger value for S or using the pen_modifier option.") time.sleep(5) # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? # Einfach Durchschnitt nehmen? 
+ # Preliminary decision: Further on only use the traces, which have the most frequent state count + counts = np.bincount(num_states_array) + num_raw_states = np.argmax(counts) + print_info("Choose " + str(num_raw_states) + " as number of raw_states.") + i = 0 + cluster_labels_list = [] + num_cluster_list = [] + for raw_states in raw_states_list: + # iterate through raw states from measurements + if len(raw_states) == num_raw_states: + # build array with power values to cluster these + value_to_cluster = np.zeros((num_raw_states, 2)) + j = 0 + for s in raw_states: + value_to_cluster[j][0] = s[2] + value_to_cluster[j][1] = 0 + j = j + 1 + # linked = linkage(value_to_cluster, 'single') + # + # labelList = range(1, 11) + # + # plt.figure(figsize=(10, 7)) + # dendrogram(linked, + # orientation='top', + # distance_sort='descending', + # show_leaf_counts=True) + # plt.show() + # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER + # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', + # linkage='ward', distance_threshold=opt_refinement_thresh) + cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward') + cluster.fit_predict(value_to_cluster) + print_info("Cluster labels:\n" + str(cluster.labels_)) + # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') + # plt.show() + # TODO: Problem: Der Algorithmus nummeriert die Zustände nicht immer gleich... also bspw.: + # mal ist das tatsächliche Transmit mit 1 belabelt und mal mit 3 + cluster_labels_list.append(cluster.labels_) + num_cluster_list.append(cluster.n_clusters_) + i = i + 1 + if i != len(raw_states_list): + print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + + " Measurements for state clustering. " + "Others did not recognize number of states correctly.") + num_states = np.argmax(np.bincount(num_cluster_list)) + resulting_sequence = [None] * num_raw_states + i = 0 + for x in resulting_sequence: + j = 0 + test_list = [] + for arr in cluster_labels_list: + if num_cluster_list[j] != num_states: + j = j + 1 + else: + test_list.append(arr[i]) + j = j + 1 + resulting_sequence[i] = np.argmax(np.bincount(test_list)) + i = i + 1 + print(resulting_sequence) + # TODO: TESTING PURPOSES exit() -- cgit v1.2.3 From 7dc6363cd7f17cf5a09f678da612e15a0e6bfbac Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Wed, 8 Jul 2020 14:08:13 +0200 Subject: bin/Proof_Of_Concept_PELT: Small bit of refactoring. 
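
The clustering step above assigns a label to every raw state by grouping similar mean power values, so that recurring states can be matched across measurements. A minimal sketch of the same idea; the n_clusters=None / distance_threshold variant corresponds to the commented-out call in the diff (a possible answer to the "automatic detection of number of clusters" TODO), and all power values and the threshold are invented. Ward linkage implies Euclidean distances, so the affinity argument is omitted here:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    mean_powers = np.array([1010.0, 998.0, 5020.0, 4990.0, 1005.0, 9900.0])
    # second, all-zero column mirrors the 2-D value_to_cluster array built above
    values_to_cluster = np.column_stack([mean_powers, np.zeros_like(mean_powers)])

    cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=500.0,
                                      linkage='ward')
    labels = cluster.fit_predict(values_to_cluster)
    print(labels, cluster.n_clusters_)   # states with similar mean power share a label
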
Fixed some pylint violations --- bin/Proof_Of_Concept_PELT.py | 222 +++++++++++++++++++++++-------------------- 1 file changed, 117 insertions(+), 105 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 80f7c04..dbcc7c1 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -11,7 +11,9 @@ import re from dfatool.dfatool import RawData from sklearn.cluster import AgglomerativeClustering -from scipy.cluster.hierarchy import dendrogram, linkage + + +# from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display # py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=10 --refinement_thresh=100 @@ -56,16 +58,15 @@ def get_bkps(algo, pen, q): return res -def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing', plotting=False): +def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing'): kneedle = KneeLocator(data_x, data_y, S=S, curve=curve, direction=direction) - if plotting: - kneedle.plot_knee() kneepoint = (kneedle.knee, kneedle.knee_y) return kneepoint -def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, - refresh_thresh=5, S=1.0, pen_override=None, pen_modifier=None, plotting=False): +def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, + refresh_delay=1, refresh_thresh=5, S=1.0, pen_override=None, pen_modifier=None, + plotting=False): # default params in Function if model is None: model = 'l1' @@ -116,16 +117,16 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, while True: if result.ready(): break + + size = q.qsize() + last_percentage = percentage + percentage = round(size / (range_max - range_min) * 100, 2) + if percentage >= last_percentage + 2 or i >= refresh_thresh: + print_info('Current progress: ' + str(percentage) + '%') + i = 0 else: - size = q.qsize() - last_percentage = percentage - percentage = round(size / (range_max - range_min) * 100, 2) - if percentage >= last_percentage + 2 or i >= refresh_thresh: - print_info('Current progress: ' + str(percentage) + '%') - i = 0 - else: - i += 1 - time.sleep(refresh_delay) + i += 1 + time.sleep(refresh_delay) res = result.get() print_info("Finished kneepoint calculation.") # DECIDE WHICH PENALTY VALUE TO CHOOSE ACCORDING TO ELBOW/KNEE APPROACH @@ -133,7 +134,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, pen_val = [x[0] for x in res] fitted_bkps_val = [x[1] for x in res] # # plot to look at res - knee = find_knee_point(pen_val, fitted_bkps_val, S=S, plotting=plotting) + knee = find_knee_point(pen_val, fitted_bkps_val, S=S) # plt.xlabel('Penalty') # plt.ylabel('Number of Changepoints') # plt.plot(pen_val, fitted_bkps_val) @@ -156,72 +157,73 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, fig, ax = rpt.display(signal, bkps) plt.show() return bkps - else: - print_error('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') - exit() - -# very short benchmark yielded approx. 
1/3 of speed compared to solution with sorting -def needs_refinement_no_sort(signal, mean, thresh): - # linear search for the top 10%/ bottom 10% - # should be sufficient - length_of_signal = len(signal) - percentile_size = int() - percentile_size = length_of_signal // 100 - upper_percentile = [None] * percentile_size - lower_percentile = [None] * percentile_size - fill_index_upper = percentile_size - 1 - fill_index_lower = percentile_size - 1 - index_smallest_val = fill_index_upper - index_largest_val = fill_index_lower - - for x in signal: - if x > mean: - # will be in upper percentile - if fill_index_upper >= 0: - upper_percentile[fill_index_upper] = x - if x < upper_percentile[index_smallest_val]: - index_smallest_val = fill_index_upper - fill_index_upper = fill_index_upper - 1 - continue - - if x > upper_percentile[index_smallest_val]: - # replace smallest val. Find next smallest val - upper_percentile[index_smallest_val] = x - index_smallest_val = 0 - i = 0 - for y in upper_percentile: - if upper_percentile[i] < upper_percentile[index_smallest_val]: - index_smallest_val = i - i = i + 1 + print_error('With the current thresh-hold S=' + str(S) + + ' it is not possible to select a penalty value.') + sys.exit() - else: - if fill_index_lower >= 0: - lower_percentile[fill_index_lower] = x - if x > lower_percentile[index_largest_val]: - index_largest_val = fill_index_upper - fill_index_lower = fill_index_lower - 1 - continue - if x < lower_percentile[index_largest_val]: - # replace smallest val. Find next smallest val - lower_percentile[index_largest_val] = x - index_largest_val = 0 - i = 0 - for y in lower_percentile: - if lower_percentile[i] > lower_percentile[index_largest_val]: - index_largest_val = i - i = i + 1 - # should have the percentiles - lower_percentile_mean = np.mean(lower_percentile) - upper_percentile_mean = np.mean(upper_percentile) - dist = mean - lower_percentile_mean - if dist > thresh: - return True - dist = upper_percentile_mean - mean - if dist > thresh: - return True - return False +# very short benchmark yielded approx. 1/3 of speed compared to solution with sorting +# def needs_refinement_no_sort(signal, mean, thresh): +# # linear search for the top 10%/ bottom 10% +# # should be sufficient +# length_of_signal = len(signal) +# percentile_size = int() +# percentile_size = length_of_signal // 100 +# upper_percentile = [None] * percentile_size +# lower_percentile = [None] * percentile_size +# fill_index_upper = percentile_size - 1 +# fill_index_lower = percentile_size - 1 +# index_smallest_val = fill_index_upper +# index_largest_val = fill_index_lower +# +# for x in signal: +# if x > mean: +# # will be in upper percentile +# if fill_index_upper >= 0: +# upper_percentile[fill_index_upper] = x +# if x < upper_percentile[index_smallest_val]: +# index_smallest_val = fill_index_upper +# fill_index_upper = fill_index_upper - 1 +# continue +# +# if x > upper_percentile[index_smallest_val]: +# # replace smallest val. Find next smallest val +# upper_percentile[index_smallest_val] = x +# index_smallest_val = 0 +# i = 0 +# for y in upper_percentile: +# if upper_percentile[i] < upper_percentile[index_smallest_val]: +# index_smallest_val = i +# i = i + 1 +# +# else: +# if fill_index_lower >= 0: +# lower_percentile[fill_index_lower] = x +# if x > lower_percentile[index_largest_val]: +# index_largest_val = fill_index_upper +# fill_index_lower = fill_index_lower - 1 +# continue +# if x < lower_percentile[index_largest_val]: +# # replace smallest val. 
Find next smallest val +# lower_percentile[index_largest_val] = x +# index_largest_val = 0 +# i = 0 +# for y in lower_percentile: +# if lower_percentile[i] > lower_percentile[index_largest_val]: +# index_largest_val = i +# i = i + 1 +# +# # should have the percentiles +# lower_percentile_mean = np.mean(lower_percentile) +# upper_percentile_mean = np.mean(upper_percentile) +# dist = mean - lower_percentile_mean +# if dist > thresh: +# return True +# dist = upper_percentile_mean - mean +# if dist > thresh: +# return True +# return False # Very short benchmark yielded approx. 3 times the speed of solution not using sort @@ -245,22 +247,22 @@ def needs_refinement(signal, thresh): return False -def print_info(str): - str_lst = str.split(sep='\n') - for str in str_lst: - print("[INFO]" + str) +def print_info(str_to_prt): + str_lst = str_to_prt.split(sep='\n') + for str_prt in str_lst: + print("[INFO]" + str_prt) -def print_warning(str): - str_lst = str.split(sep='\n') - for str in str_lst: - print("[WARNING]" + str) +def print_warning(str_to_prt): + str_lst = str_to_prt.split(sep='\n') + for str_prt in str_lst: + print("[WARNING]" + str_prt) -def print_error(str): - str_lst = str.split(sep='\n') - for str in str_lst: - print("[ERROR]" + str, file=sys.stderr) +def print_error(str_to_prt): + str_lst = str_to_prt.split(sep='\n') + for str_prt in str_lst: + print("[ERROR]" + str_prt, file=sys.stderr) if __name__ == '__main__': @@ -389,7 +391,8 @@ if __name__ == '__main__': # OPENING DATA if ".json" in opt_filename: # open file with trace data from json - print_info(" Will only refine the state which is present in " + opt_filename + " if necessary.") + print_info( + " Will only refine the state which is present in " + opt_filename + " if necessary.") with open(opt_filename, 'r') as f: states = json.load(f) # loop through all traces check if refinement is necessary @@ -410,7 +413,8 @@ if __name__ == '__main__': refine = True break if not refine: - print_info("No refinement necessary for state '" + measurements_by_state['name'] + "'") + print_info("No refinement necessary for state '" + measurements_by_state['name'] + + "'") else: # calc and save all bkpts for the given state and param config raw_states_list = list() @@ -419,14 +423,16 @@ if __name__ == '__main__': normed_signal = np.zeros(shape=len(signal)) for i in range(0, len(signal)): normed_signal[i] = signal[i] / 1000 - bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, range_max=opt_range_max, - num_processes=opt_num_processes, jump=opt_jump, S=opt_S, - pen_override=opt_pen_override, pen_modifier=opt_pen_modifier) + bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, + range_max=opt_range_max, num_processes=opt_num_processes, + jump=opt_jump, S=opt_S, pen_override=opt_pen_override, + pen_modifier=opt_pen_modifier) calced_states = list() start_time = 0 end_time = 0 for bkpt in bkpts: - # start_time of state is end_time of previous one(Transitions are instantaneous) + # start_time of state is end_time of previous one + # (Transitions are instantaneous) start_time = end_time end_time = bkpt power_vals = signal[start_time: end_time] @@ -437,13 +443,16 @@ if __name__ == '__main__': num = 0 new_avg_std = 0 for s in calced_states: - print_info("State " + str(num) + " starts at t=" + str(s[0]) + " and ends at t=" + str(s[1]) - + " while using " + str(s[2]) + "uW with sigma=" + str(s[3])) + print_info("State " + str(num) + " starts at t=" + str(s[0]) + + " and ends at t=" + str(s[1]) + + " while using " + 
str(s[2]) + + "uW with sigma=" + str(s[3])) num = num + 1 new_avg_std = new_avg_std + s[3] new_avg_std = new_avg_std / len(calced_states) change_avg_std = measurement['uW_std'] - new_avg_std - print_info("The average standard deviation for the newly found states is " + str(new_avg_std)) + print_info("The average standard deviation for the newly found states is " + + str(new_avg_std)) print_info("That is a reduction of " + str(change_avg_std)) raw_states_list.append(calced_states) num_states_array = [int()] * len(raw_states_list) @@ -453,12 +462,14 @@ if __name__ == '__main__': i = i + 1 avg_num_states = np.mean(num_states_array) num_states_dev = np.std(num_states_array) - print_info("On average " + str(avg_num_states) + " States have been found. The standard deviation" + print_info("On average " + str(avg_num_states) + + " States have been found. The standard deviation" + " is " + str(num_states_dev)) # TODO: MAGIC NUMBER if num_states_dev > 1: - print_warning("The number of states varies strongly across measurements. Consider choosing a " - "larger value for S or using the pen_modifier option.") + print_warning("The number of states varies strongly across measurements." + " Consider choosing a larger value for S or using the pen_modifier" + " option.") time.sleep(5) # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? # Einfach Durchschnitt nehmen? @@ -492,7 +503,8 @@ if __name__ == '__main__': # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', # linkage='ward', distance_threshold=opt_refinement_thresh) - cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward') + cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', + linkage='ward') cluster.fit_predict(value_to_cluster) print_info("Cluster labels:\n" + str(cluster.labels_)) # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') @@ -523,7 +535,7 @@ if __name__ == '__main__': print(resulting_sequence) # TODO: TESTING PURPOSES - exit() + sys.exit() elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From 56c0cd63af5e34fc2e3da64018155f715825a343 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Wed, 8 Jul 2020 17:29:56 +0200 Subject: bin/Proof_Of_Concept_PELT: Trennen von Penalty-Berechnung und PELT -> Pro Messreihe/Paramkonig. nur noch einmaliges bestimmen der Penalty. Bestimmen der Penalty über KNEEDLE Ab Kneepoint dann suche nach Plateau -> Wahl der Mitte des Plateaus. Automatisches Clustern funktioniert jetzt auch scheinbar für alle Messreihen aus TX.json. Mit anderen nicht getestet. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 193 +++++++++++++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 54 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index dbcc7c1..0e63b78 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -1,16 +1,16 @@ -import matplotlib.pyplot as plt import json -from kneed import KneeLocator -import ruptures as rpt import time -from multiprocessing import Pool, Manager -import numpy as np import sys import getopt import re -from dfatool.dfatool import RawData - +from multiprocessing import Pool, Manager +from kneed import KneeLocator from sklearn.cluster import AgglomerativeClustering +from scipy.signal import find_peaks +import matplotlib.pyplot as plt +import ruptures as rpt +import numpy as np +from dfatool.dfatool import RawData # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display @@ -19,8 +19,8 @@ from sklearn.cluster import AgglomerativeClustering def plot_data_from_json(filename, trace_num, x_axis, y_axis): - with open(filename, 'r') as f: - tx_data = json.load(f) + with open(filename, 'r') as file: + tx_data = json.load(file) print(tx_data[trace_num]['parameter']) plt.plot(tx_data[trace_num]['offline'][0]['uW']) plt.xlabel(x_axis) @@ -64,12 +64,38 @@ def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing return kneepoint -def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, - refresh_delay=1, refresh_thresh=5, S=1.0, pen_override=None, pen_modifier=None, - plotting=False): +def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): + # default params in Function + if model is None: + model = "l1" + if jump is None: + jump = 5 + if min_dist is None: + min_dist = 2 + if plotting is None: + plotting = False + # change point detection. best fit seemingly with l1. rbf prods. RuntimeErr for pen > 30 + # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html + # model = "l1" #"l1" # "l2", "rbf" + algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal) + + if penalty is not None: + bkps = algo.predict(pen=penalty) + if plotting: + fig, ax = rpt.display(signal, bkps) + plt.show() + return bkps + + print_error("No Penalty specified.") + sys.exit() + + +def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, range_max=50, + num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, + pen_modifier=None): # default params in Function if model is None: - model = 'l1' + model = "l1" if jump is None: jump = 5 if min_dist is None: @@ -86,8 +112,6 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, refresh_thresh = 5 if S is None: S = 1.0 - if plotting is None: - plotting = False if pen_modifier is None: pen_modifier = 1 # change point detection. best fit seemingly with l1. rbf prods. 
RuntimeErr for pen > 30 @@ -96,7 +120,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal) ### CALC BKPS WITH DIFF PENALTYS - if pen_override is None and range_max != range_min: + if range_max != range_min: # building args array for parallelizing args = [] # for displaying progression @@ -106,7 +130,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, for i in range(range_min, range_max + 1): args.append((algo, i, q)) - print_info('starting kneepoint calculation.') + print_info("starting kneepoint calculation.") # init Pool with num_proesses with Pool(num_processes) as p: # collect results from pool @@ -122,7 +146,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, last_percentage = percentage percentage = round(size / (range_max - range_min) * 100, 2) if percentage >= last_percentage + 2 or i >= refresh_thresh: - print_info('Current progress: ' + str(percentage) + '%') + print_info("Current progress: " + str(percentage) + "%") i = 0 else: i += 1 @@ -135,31 +159,68 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, fitted_bkps_val = [x[1] for x in res] # # plot to look at res knee = find_knee_point(pen_val, fitted_bkps_val, S=S) + + # TODO: Find plateau on pen_val vs fitted_bkps_val + # scipy.find_peaks() does not find plateaus if they extend through the end of the data. + # to counter that, add one extremely large value to the right side of the data + # after negating it is extremely small -> Almost certainly smaller than the + # found plateau therefore the plateau does not extend through the border -> scipy.find_peaks + # finds it. Choose value from within that plateau. + # fitted_bkps_val.append(100000000) + # TODO: Approaching over find_peaks might not work if the initial decrease step to the + # "correct" number of changepoints and additional decrease steps e.g. underfitting + # take place within the given penalty interval. find_peak only finds plateaus + # of peaks. If the number of chpts decreases after the wanted plateau the condition + # for local peaks is not satisfied anymore. Therefore this approach will only work + # if the plateau extends over the right border of the penalty interval. + # peaks, peak_plateaus = find_peaks(- np.array(fitted_bkps_val), plateau_size=1) + # Since the data is monotonously decreasing only one plateau can be found. 
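The find_peaks() based variant that this TODO block discusses (and leaves commented out) can be illustrated as follows; this is only a sketch of the stated idea, assuming a monotonically decreasing list of change-point counts, and plateau_midpoint() together with the sentinel value are illustrative placeholders rather than code from this patch.

    import numpy as np
    from scipy.signal import find_peaks

    def plateau_midpoint(fitted_bkps_val):
        # Append one extremely large value so that a plateau which extends
        # through the right border of the penalty range still has a falling
        # edge after negation and is therefore reported by find_peaks().
        padded = np.array(list(fitted_bkps_val) + [100000000])
        # After negation the lowest change-point counts form a flat local
        # maximum; plateau_size=1 makes find_peaks() report its edges.
        peaks, props = find_peaks(-padded, plateau_size=1)
        if len(peaks) == 0:
            return None
        # With monotonically decreasing data only one plateau is found.
        left, right = props["left_edges"][-1], props["right_edges"][-1]
        return left + (right - left) // 2

    # plateau_midpoint([50, 40, 12, 4, 4, 4, 4]) returns index 4,
    # the middle of the plateau of fours.

The patch itself instead scans fitted_bkps_val from the knee point onwards for the longest constant run and takes its midpoint.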
+ + # assuming the plateau is constant + start_index = -1 + end_index = -1 + longest_start = -1 + longest_end = -1 + prev_val = -1 + for i, num_bkpts in enumerate(fitted_bkps_val[knee[0]:]): + if num_bkpts != prev_val: + end_index = i - 1 + if end_index - start_index > longest_end - longest_start: + # currently found sequence is the longest found yet + longest_start = start_index + longest_end = end_index + start_index = i + if i == len(fitted_bkps_val[knee[0]:]) - 1: + # end sequence with last value + end_index = i + if end_index - start_index > longest_end - longest_start: + # last found sequence is the longest found yet + longest_start = start_index + longest_end = end_index + start_index = i + prev_val = num_bkpts # plt.xlabel('Penalty') # plt.ylabel('Number of Changepoints') # plt.plot(pen_val, fitted_bkps_val) - # plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') - # print("knee: " + str(knee[0])) + # plt.vlines(longest_start + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') + # plt.vlines(longest_end + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') # plt.show() + # choosing pen from plateau + mid_of_plat = longest_start + (longest_end - longest_start) // 2 + knee = (mid_of_plat + knee[0], fitted_bkps_val[mid_of_plat + knee[0]]) + # modify knee according to options. Defaults to 1 * knee knee = (knee[0] * pen_modifier, knee[1]) + else: - # use forced pen value for plotting if specified. Else use only pen in range - if pen_override is not None: - knee = (pen_override, None) - else: - knee = (range_min, None) - print_info("" + str(knee[0]) + " has been selected as kneepoint.") - # plt.plot(pen_val, fittet_bkps_val) + # range_min == range_max. has the same effect as pen_override + knee = (range_min, None) + print_info(str(knee[0]) + " has been selected as kneepoint.") if knee[0] is not None: - bkps = algo.predict(pen=knee[0]) - if plotting: - fig, ax = rpt.display(signal, bkps) - plt.show() - return bkps + return knee - print_error('With the current thresh-hold S=' + str(S) - + ' it is not possible to select a penalty value.') + print_error("With the current thresh-hold S=" + str(S) + + " it is not possible to select a penalty value.") sys.exit() @@ -265,6 +326,14 @@ def print_error(str_to_prt): print("[ERROR]" + str_prt, file=sys.stderr) +def norm_signal(signal): + # TODO: maybe refine normalisation of signal + normed_signal = np.zeros(shape=len(signal)) + for i, signal_i in enumerate(signal): + normed_signal[i] = signal_i / 1000 + return normed_signal + + if __name__ == '__main__': # OPTION RECOGNITION opt = dict() @@ -414,19 +483,28 @@ if __name__ == '__main__': break if not refine: print_info("No refinement necessary for state '" + measurements_by_state['name'] - + "'") + + "' with params: " + str(measurements_by_state['parameter'])) else: + # assume that all measurements of the same param configuration are fundamentally + # similar -> calculate penalty for first measurement, use it for all + if opt_pen_override is None: + signal = np.array(measurements_by_state['offline'][0]['uW']) + normed_signal = norm_signal(signal) + penalty = calculate_penalty_value(normed_signal, model=opt_model, + range_min=opt_range_min, + range_max=opt_range_max, + num_processes=opt_num_processes, + jump=opt_jump, S=opt_S, + pen_modifier=opt_pen_modifier) + penalty = penalty[0] + else: + penalty = opt_pen_override # calc and save all bkpts for the given state and param config raw_states_list = list() for measurement in measurements_by_state['offline']: signal = 
np.array(measurement['uW']) - normed_signal = np.zeros(shape=len(signal)) - for i in range(0, len(signal)): - normed_signal[i] = signal[i] / 1000 - bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, - range_max=opt_range_max, num_processes=opt_num_processes, - jump=opt_jump, S=opt_S, pen_override=opt_pen_override, - pen_modifier=opt_pen_modifier) + normed_signal = norm_signal(signal) + bkpts = calc_pelt(normed_signal, penalty, model=opt_model, jump=opt_jump) calced_states = list() start_time = 0 end_time = 0 @@ -468,8 +546,8 @@ if __name__ == '__main__': # TODO: MAGIC NUMBER if num_states_dev > 1: print_warning("The number of states varies strongly across measurements." - " Consider choosing a larger value for S or using the pen_modifier" - " option.") + " Consider choosing a larger value for S or using the " + "pen_modifier option.") time.sleep(5) # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? # Einfach Durchschnitt nehmen? @@ -501,10 +579,10 @@ if __name__ == '__main__': # show_leaf_counts=True) # plt.show() # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER - # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', - # linkage='ward', distance_threshold=opt_refinement_thresh) - cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', - linkage='ward') + cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', + linkage='ward', distance_threshold=opt_refinement_thresh*100) + # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', + # linkage='ward') cluster.fit_predict(value_to_cluster) print_info("Cluster labels:\n" + str(cluster.labels_)) # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') @@ -515,9 +593,19 @@ if __name__ == '__main__': num_cluster_list.append(cluster.n_clusters_) i = i + 1 if i != len(raw_states_list): - print_info("Used " + str(i) + "/" + str(len(raw_states_list)) - + " Measurements for state clustering. " - "Others did not recognize number of states correctly.") + if i / len(raw_states_list) <= 0.5: + print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) + + " Measurements for refinement. " + "Others did not recognize number of states correctly." + "\nYou should verify the integrity of the measurements.") + else: + print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + + " Measurements for refinement. " + "Others did not recognize number of states correctly.") + sys.exit() + else: + print_info("Used all available measurements.") + num_states = np.argmax(np.bincount(num_cluster_list)) resulting_sequence = [None] * num_raw_states i = 0 @@ -534,9 +622,6 @@ if __name__ == '__main__': i = i + 1 print(resulting_sequence) - # TODO: TESTING PURPOSES - sys.exit() - elif ".tar" in opt_filename: # open with dfatool raw_data_args = list() -- cgit v1.2.3 From 71b981c13c007d33f4042823703f98e41ff56770 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 10 Jul 2020 14:19:44 +0200 Subject: Proof_Of-Concept_PELT: Fixed Typo --- bin/Proof_Of_Concept_PELT.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 0e63b78..92d09fa 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -579,6 +579,7 @@ if __name__ == '__main__': # show_leaf_counts=True) # plt.show() # TODO: Automatic detection of number of clusters. 
Aktuell noch MAGIC NUMBER + # im distance_threshold cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', linkage='ward', distance_threshold=opt_refinement_thresh*100) # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', -- cgit v1.2.3 From 98a7873ec1ce265e6d229af4fa8416b3a9ef018a Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 10 Jul 2020 16:19:08 +0200 Subject: bin/Proof_Of_Concept_PELT.py: Calculation of raw_states is now parallelized. --- bin/Proof_Of_Concept_PELT.py | 121 ++++++++++++++++++++++++++++--------------- 1 file changed, 79 insertions(+), 42 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 92d09fa..bcbd53e 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -6,7 +6,6 @@ import re from multiprocessing import Pool, Manager from kneed import KneeLocator from sklearn.cluster import AgglomerativeClustering -from scipy.signal import find_peaks import matplotlib.pyplot as plt import ruptures as rpt import numpy as np @@ -287,6 +286,50 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, # return False +# raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model +# , opt_jump)) +def calc_raw_states_func(num_trace, measurement, penalty, model, jump): + signal = np.array(measurement['uW']) + normed_signal = norm_signal(signal) + bkpts = calc_pelt(normed_signal, penalty, model=model, jump=jump) + calced_states = list() + start_time = 0 + end_time = 0 + for bkpt in bkpts: + # start_time of state is end_time of previous one + # (Transitions are instantaneous) + start_time = end_time + end_time = bkpt + power_vals = signal[start_time: end_time] + mean_power = np.mean(power_vals) + std_dev = np.std(power_vals) + calced_state = (start_time, end_time, mean_power, std_dev) + calced_states.append(calced_state) + num = 0 + new_avg_std = 0 + for s in calced_states: + # print_info("State " + str(num) + " starts at t=" + str(s[0]) + # + " and ends at t=" + str(s[1]) + # + " while using " + str(s[2]) + # + "uW with sigma=" + str(s[3])) + num = num + 1 + new_avg_std = new_avg_std + s[3] + new_avg_std = new_avg_std / len(calced_states) + change_avg_std = measurement['uW_std'] - new_avg_std + # print_info("The average standard deviation for the newly found states is " + # + str(new_avg_std)) + # print_info("That is a reduction of " + str(change_avg_std)) + return num_trace, calced_states, new_avg_std, change_avg_std + + +def calc_raw_states(arg_list, num_processes=8): + m = Manager() + with Pool(processes=num_processes) as p: + # collect results from pool + result = p.starmap(calc_raw_states_func, arg_list) + return result + + # Very short benchmark yielded approx. 
3 times the speed of solution not using sort # TODO: Decide whether median is really the better baseline than mean def needs_refinement(signal, thresh): @@ -477,10 +520,9 @@ if __name__ == '__main__': signal = measurement['uW'] # mean = measurement['uW_mean'] # TODO: Decide if median is really the better baseline than mean - if needs_refinement(signal, opt_refinement_thresh): + if needs_refinement(signal, opt_refinement_thresh) and not refine: print_info("Refinement is necessary!") refine = True - break if not refine: print_info("No refinement necessary for state '" + measurements_by_state['name'] + "' with params: " + str(measurements_by_state['parameter'])) @@ -499,45 +541,34 @@ if __name__ == '__main__': penalty = penalty[0] else: penalty = opt_pen_override - # calc and save all bkpts for the given state and param config - raw_states_list = list() - for measurement in measurements_by_state['offline']: - signal = np.array(measurement['uW']) - normed_signal = norm_signal(signal) - bkpts = calc_pelt(normed_signal, penalty, model=opt_model, jump=opt_jump) - calced_states = list() - start_time = 0 - end_time = 0 - for bkpt in bkpts: - # start_time of state is end_time of previous one - # (Transitions are instantaneous) - start_time = end_time - end_time = bkpt - power_vals = signal[start_time: end_time] - mean_power = np.mean(power_vals) - std_dev = np.std(power_vals) - calced_state = (start_time, end_time, mean_power, std_dev) - calced_states.append(calced_state) - num = 0 - new_avg_std = 0 - for s in calced_states: - print_info("State " + str(num) + " starts at t=" + str(s[0]) - + " and ends at t=" + str(s[1]) - + " while using " + str(s[2]) - + "uW with sigma=" + str(s[3])) - num = num + 1 - new_avg_std = new_avg_std + s[3] - new_avg_std = new_avg_std / len(calced_states) - change_avg_std = measurement['uW_std'] - new_avg_std - print_info("The average standard deviation for the newly found states is " - + str(new_avg_std)) + # build arguments for parallel excecution + print_info("Starting raw_states calculation.") + raw_states_calc_args = [] + for num_measurement, measurement in enumerate(measurements_by_state['offline']): + raw_states_calc_args.append((num_measurement, measurement, penalty, + opt_model, opt_jump)) + + raw_states_list = [None] * len(measurements_by_state['offline']) + raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) + # extracting result and putting it in correct order -> index of raw_states_list + # entry still corresponds with index of measurement in measurements_by_states + # -> If measurements are discarded the correct ones are easily recognized + for ret_val in raw_states_res: + num_trace = ret_val[0] + raw_states = ret_val[1] + avg_std = ret_val[2] + change_avg_std = ret_val[3] + # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch + # int sein oder nicht? Es scheint auch vernünftig zu klappen... + raw_states_list[num_trace] = raw_states + print_info("The average standard deviation for the newly found states in " + + "measurement No. 
" + str(num_trace) + " is " + str(avg_std)) print_info("That is a reduction of " + str(change_avg_std)) - raw_states_list.append(calced_states) + print_info("Finished raw_states calculation.") num_states_array = [int()] * len(raw_states_list) i = 0 - for x in raw_states_list: + for i, x in enumerate(raw_states_list): num_states_array[i] = len(x) - i = i + 1 avg_num_states = np.mean(num_states_array) num_states_dev = np.std(num_states_array) print_info("On average " + str(avg_num_states) @@ -558,7 +589,7 @@ if __name__ == '__main__': i = 0 cluster_labels_list = [] num_cluster_list = [] - for raw_states in raw_states_list: + for num_trace, raw_states in enumerate(raw_states_list): # iterate through raw states from measurements if len(raw_states) == num_raw_states: # build array with power values to cluster these @@ -580,12 +611,14 @@ if __name__ == '__main__': # plt.show() # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER # im distance_threshold - cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', - linkage='ward', distance_threshold=opt_refinement_thresh*100) + cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, + affinity='euclidean', + linkage='ward', + distance_threshold=opt_refinement_thresh * 100) # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', # linkage='ward') cluster.fit_predict(value_to_cluster) - print_info("Cluster labels:\n" + str(cluster.labels_)) + # print_info("Cluster labels:\n" + str(cluster.labels_)) # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') # plt.show() # TODO: Problem: Der Algorithmus nummeriert die Zustände nicht immer gleich... also bspw.: @@ -593,6 +626,9 @@ if __name__ == '__main__': cluster_labels_list.append(cluster.labels_) num_cluster_list.append(cluster.n_clusters_) i = i + 1 + else: + print_info("Discarding measurement No. " + str(num_trace) + " because it " + + "did not recognize the number of raw_states correctly.") if i != len(raw_states_list): if i / len(raw_states_list) <= 0.5: print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) @@ -603,6 +639,7 @@ if __name__ == '__main__': print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + " Measurements for refinement. " "Others did not recognize number of states correctly.") + # TODO: DEBUG Kram sys.exit() else: print_info("Used all available measurements.") -- cgit v1.2.3 From 4dc7c23ada35fc2b64685f1eb9df18a5104aaa2c Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 10 Jul 2020 20:02:07 +0200 Subject: bin/Proof_Of_Concept_PELT: exits haben jetzt errorcodes. 
Anfang für Vereinheitlichung der Labels ist gemacht MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 89 +++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 25 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index bcbd53e..dde99d8 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -3,7 +3,7 @@ import time import sys import getopt import re -from multiprocessing import Pool, Manager +from multiprocessing import Pool, Manager, cpu_count from kneed import KneeLocator from sklearn.cluster import AgglomerativeClustering import matplotlib.pyplot as plt @@ -86,7 +86,7 @@ def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): return bkps print_error("No Penalty specified.") - sys.exit() + sys.exit(-1) def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, range_max=50, @@ -220,7 +220,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, print_error("With the current thresh-hold S=" + str(S) + " it is not possible to select a penalty value.") - sys.exit() + sys.exit(-1) # very short benchmark yielded approx. 1/3 of speed compared to solution with sorting @@ -405,7 +405,7 @@ if __name__ == '__main__': opt_min_dist = None opt_range_min = None opt_range_max = None - opt_num_processes = None + opt_num_processes = cpu_count() opt_refresh_delay = None opt_refresh_thresh = None opt_S = None @@ -422,7 +422,7 @@ if __name__ == '__main__': if 'filename' not in opt: print_error("No file specified!") - sys.exit(2) + sys.exit(-1) else: opt_filename = opt['filename'] if 'v' in opt: @@ -435,70 +435,70 @@ if __name__ == '__main__': opt_jump = int(opt['jump']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'min_dist' in opt: try: opt_min_dist = int(opt['min_dist']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'range_min' in opt: try: opt_range_min = int(opt['range_min']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'range_max' in opt: try: opt_range_max = int(opt['range_max']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'num_processes' in opt: try: opt_num_processes = int(opt['num_processes']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'refresh_delay' in opt: try: opt_refresh_delay = int(opt['refresh_delay']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'refresh_thresh' in opt: try: opt_refresh_thresh = int(opt['refresh_thresh']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'S' in opt: try: opt_S = float(opt['S']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'pen_override' in opt: try: opt_pen_override = int(opt['pen_override']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'pen_modifier' in opt: try: opt_pen_modifier = float(opt['pen_modifier']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'refinement_thresh' in opt: try: opt_refinement_thresh = int(opt['refinement_thresh']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) except getopt.GetoptError as err: print(err, file=sys.stderr) - sys.exit(2) + 
sys.exit(-1) # OPENING DATA if ".json" in opt_filename: @@ -623,8 +623,8 @@ if __name__ == '__main__': # plt.show() # TODO: Problem: Der Algorithmus nummeriert die Zustände nicht immer gleich... also bspw.: # mal ist das tatsächliche Transmit mit 1 belabelt und mal mit 3 - cluster_labels_list.append(cluster.labels_) - num_cluster_list.append(cluster.n_clusters_) + cluster_labels_list.append((num_trace, cluster.labels_)) + num_cluster_list.append((num_trace, cluster.n_clusters_)) i = i + 1 else: print_info("Discarding measurement No. " + str(num_trace) + " because it " @@ -640,18 +640,55 @@ if __name__ == '__main__': + " Measurements for refinement. " "Others did not recognize number of states correctly.") # TODO: DEBUG Kram - sys.exit() + sys.exit(0) else: print_info("Used all available measurements.") - num_states = np.argmax(np.bincount(num_cluster_list)) + num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) + avg_per_state_list = [None] * len(cluster_labels_list) + used_clusters = 0 + for number, (num_trace, labels) in enumerate(cluster_labels_list): + if num_cluster_list[number][1] == num_states: + avg_per_state = [0] * num_states + count_per_state = [0] * num_states + raw_states = raw_states_list[num_trace] + for num_label, label in enumerate(labels): + count_per_state[label] = count_per_state[label] + 1 + avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] + for i,_ in enumerate(avg_per_state): + avg_per_state[i] = avg_per_state[i] / count_per_state[i] + avg_per_state_list[number] = avg_per_state + used_clusters = used_clusters + 1 + + # flattend version for clustering: + values_to_cluster = np.zeros((num_states * used_clusters, 2)) + index = 0 + for avg_per_state in avg_per_state_list: + if None not in avg_per_state: + for avg in avg_per_state: + values_to_cluster[index][0] = avg + values_to_cluster[index][1] = 0 + index = index + 1 + # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) + # plt.show() + cluster = AgglomerativeClustering(n_clusters=num_states) + cluster.fit_predict(values_to_cluster) + # HIER WEITER: + # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die + # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. + # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting + # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. + resulting_sequence = [None] * num_raw_states i = 0 for x in resulting_sequence: j = 0 test_list = [] - for arr in cluster_labels_list: - if num_cluster_list[j] != num_states: + for arr in [elem[1] for elem in cluster_labels_list]: + if num_cluster_list[j][1] != num_states: + # hopefully this does not happen regularly + print_info("Discarding measurement " + str(j) + + " because the clustering yielded not matching results.") j = j + 1 else: test_list.append(arr[i]) @@ -659,6 +696,7 @@ if __name__ == '__main__': resulting_sequence[i] = np.argmax(np.bincount(test_list)) i = i + 1 print(resulting_sequence) + sys.exit() elif ".tar" in opt_filename: # open with dfatool @@ -670,11 +708,12 @@ if __name__ == '__main__': print_info("Preprocessing file. Depending on its size, this could take a while.") preprocessed_data = raw_data.get_preprocessed_data() print_info("File fully preprocessed") - # TODO: Mal schauen, wie ich das mache. Erstmal nur mit json + print_error("Not implemented yet. 
Please generate .json files first with dfatool and use" + " those.") else: print_error("Unknown dataformat") - sys.exit(2) + sys.exit(-1) # print(tx_data[1]['parameter']) # # parse json to array for PELT -- cgit v1.2.3 From 5b5fb3103d8305eed9d6828858509013bfe60e97 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sat, 11 Jul 2020 17:23:03 +0200 Subject: Removed black from gitlab-ci for the moment --- .gitlab-ci.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 52d6e1c..f397fcd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,13 +3,6 @@ image: debian:bullseye stages: - test -lint_python: - stage: test - script: - - apt-get update -qy - - apt-get install -y black - - black --check --diff bin - run_tests: stage: test script: -- cgit v1.2.3 From e1f0618fb04e42b7d3e49055af83f58a803b28b8 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sat, 11 Jul 2020 17:24:09 +0200 Subject: bin/Proof_of_Concept_PELT.py: Vereinheitlichen der Zustandsbezeichner für eine Paramkonfig funktioniert jetzt sehr gut. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 98 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 22 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index dde99d8..0d5be54 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -504,17 +504,18 @@ if __name__ == '__main__': if ".json" in opt_filename: # open file with trace data from json print_info( - " Will only refine the state which is present in " + opt_filename + " if necessary.") + "Will only refine the state which is present in " + opt_filename + " if necessary.") with open(opt_filename, 'r') as f: - states = json.load(f) + configurations = json.load(f) # loop through all traces check if refinement is necessary - print_info("Checking if refinement is necessary...") - for measurements_by_state in states: + resulting_sequence_list = [] + for num_config, measurements_by_configuration in enumerate(configurations): # loop through all occurrences of the looked at state - print_info("Looking at state '" + measurements_by_state['name'] + "' with params: " - + str(measurements_by_state['parameter'])) + print_info("Looking at state '" + measurements_by_configuration['name'] + "' with params: " + + str(measurements_by_configuration['parameter'])) refine = False - for measurement in measurements_by_state['offline']: + print_info("Checking if refinement is necessary...") + for measurement in measurements_by_configuration['offline']: # loop through measurements of particular state # an check if state needs refinement signal = measurement['uW'] @@ -524,13 +525,13 @@ if __name__ == '__main__': print_info("Refinement is necessary!") refine = True if not refine: - print_info("No refinement necessary for state '" + measurements_by_state['name'] - + "' with params: " + str(measurements_by_state['parameter'])) + print_info("No refinement necessary for state '" + measurements_by_configuration['name'] + + "' with params: " + str(measurements_by_configuration['parameter'])) else: # assume that all measurements of the same param configuration are fundamentally # similar -> calculate penalty for first measurement, use it for all if opt_pen_override is None: - signal = np.array(measurements_by_state['offline'][0]['uW']) + signal = np.array(measurements_by_configuration['offline'][0]['uW']) normed_signal = norm_signal(signal) penalty = 
calculate_penalty_value(normed_signal, model=opt_model, range_min=opt_range_min, @@ -544,11 +545,11 @@ if __name__ == '__main__': # build arguments for parallel excecution print_info("Starting raw_states calculation.") raw_states_calc_args = [] - for num_measurement, measurement in enumerate(measurements_by_state['offline']): + for num_measurement, measurement in enumerate(measurements_by_configuration['offline']): raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model, opt_jump)) - raw_states_list = [None] * len(measurements_by_state['offline']) + raw_states_list = [None] * len(measurements_by_configuration['offline']) raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) # extracting result and putting it in correct order -> index of raw_states_list # entry still corresponds with index of measurement in measurements_by_states @@ -629,6 +630,7 @@ if __name__ == '__main__': else: print_info("Discarding measurement No. " + str(num_trace) + " because it " + "did not recognize the number of raw_states correctly.") + num_used_measurements = len(raw_states_list) if i != len(raw_states_list): if i / len(raw_states_list) <= 0.5: print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) @@ -639,6 +641,7 @@ if __name__ == '__main__': print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + " Measurements for refinement. " "Others did not recognize number of states correctly.") + num_used_measurements = i # TODO: DEBUG Kram sys.exit(0) else: @@ -655,16 +658,24 @@ if __name__ == '__main__': for num_label, label in enumerate(labels): count_per_state[label] = count_per_state[label] + 1 avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] - for i,_ in enumerate(avg_per_state): + for i, _ in enumerate(avg_per_state): avg_per_state[i] = avg_per_state[i] / count_per_state[i] avg_per_state_list[number] = avg_per_state used_clusters = used_clusters + 1 - + else: + # hopefully this does not happen regularly + print_info("Discarding measurement " + str(number) + + " because the clustering yielded not matching results.") + num_used_measurements = num_used_measurements - 1 + if num_used_measurements == 0: + print_error("Something went terribly wrong. Discarded all measurements.") + # continue + sys.exit(-1) # flattend version for clustering: values_to_cluster = np.zeros((num_states * used_clusters, 2)) index = 0 for avg_per_state in avg_per_state_list: - if None not in avg_per_state: + if avg_per_state is not None: for avg in avg_per_state: values_to_cluster[index][0] = avg values_to_cluster[index][1] = 0 @@ -673,30 +684,73 @@ if __name__ == '__main__': # plt.show() cluster = AgglomerativeClustering(n_clusters=num_states) cluster.fit_predict(values_to_cluster) - # HIER WEITER: # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. - + new_labels_list = [] + new_labels = [] + i = 0 + for label in cluster.labels_: + new_labels.append(label) + i = i + 1 + if i == num_states: + new_labels_list.append(new_labels) + new_labels = [] + i = 0 + # only the selected measurements are present in new_labels. 
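The relabeling step here can be summarised as: cluster the per-state mean powers of all usable measurements in a single pass, so that physically identical states end up with identical labels across measurements. A self-contained sketch of that idea, where unify_state_labels() is a hypothetical helper and avg_per_state_list is assumed to contain only measurements whose own clustering yielded num_states clusters (as in the surrounding code):

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    def unify_state_labels(avg_per_state_list, num_states):
        # One row per (measurement, state type): mean power plus a dummy
        # second coordinate, mirroring the values_to_cluster array above.
        flat = np.array([[avg, 0.0]
                         for avgs in avg_per_state_list for avg in avgs])
        joint_labels = AgglomerativeClustering(n_clusters=num_states).fit_predict(flat)
        # Split the flat label array back into one label list per measurement.
        return [list(joint_labels[i:i + num_states])
                for i in range(0, len(joint_labels), num_states)]

    # Two measurements with num_states=2 state types each:
    # unify_state_labels([[10.1, 55.0], [9.9, 54.8]], 2)
    # -> both low-power entries share one label, both high-power entries the other.

In the patch the joint labels are then written back into cluster_labels_list, so that the later majority vote over the state sequence compares like with like.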
+ # new_labels_index should not be incremented, if not selected_measurement is skipped + new_labels_index = 0 + # cluster_labels_list contains all measurements -> if measurement is skipped + # still increment the index + index = 0 + for elem in avg_per_state_list: + if elem is not None: + for number, label in enumerate(cluster_labels_list[index][1]): + cluster_labels_list[index][1][number] = \ + new_labels_list[new_labels_index][label] + new_labels_index = new_labels_index + 1 + else: + # override not selected measurement labels to avoid choosing the wrong ones. + for number, label in enumerate(cluster_labels_list[index][1]): + cluster_labels_list[index][1][number] = -1 + index = index + 1 resulting_sequence = [None] * num_raw_states i = 0 + confidence = 0 for x in resulting_sequence: j = 0 test_list = [] for arr in [elem[1] for elem in cluster_labels_list]: if num_cluster_list[j][1] != num_states: - # hopefully this does not happen regularly - print_info("Discarding measurement " + str(j) - + " because the clustering yielded not matching results.") j = j + 1 else: + if -1 in arr: + print_error("Bei Janis beschweren! Fehler beim Umbenennen der" + " Zustände wahrscheinlich.") + sys.exit(-1) test_list.append(arr[i]) j = j + 1 - resulting_sequence[i] = np.argmax(np.bincount(test_list)) + bincount = np.bincount(test_list) + resulting_sequence[i] = np.argmax(bincount) + confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) i = i + 1 + confidence = confidence / len(resulting_sequence) + print_info("Confidence of resulting sequence is " + str(confidence) + + " while using " + str(num_used_measurements) + "/" + + str(len(raw_states_list)) + " measurements.") print(resulting_sequence) - sys.exit() + resulting_sequence_list.append((num_config, resulting_sequence)) + # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat + # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die + # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen + # auftreten. + # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. + # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: + # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche + # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, + # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines + # Zustands ja nicht mehr ändern. elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From a00ffc0e32ddc72a8faceec4344432cdbf3b90c7 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 16 Jul 2020 16:34:20 +0200 Subject: bin/Proof_Of_Concept_PELT: kleine kosmetische Änderungen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 0d5be54..7726f53 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -160,11 +160,11 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, knee = find_knee_point(pen_val, fitted_bkps_val, S=S) # TODO: Find plateau on pen_val vs fitted_bkps_val - # scipy.find_peaks() does not find plateaus if they extend through the end of the data. 
- # to counter that, add one extremely large value to the right side of the data - # after negating it is extremely small -> Almost certainly smaller than the - # found plateau therefore the plateau does not extend through the border -> scipy.find_peaks - # finds it. Choose value from within that plateau. + # scipy.find_peaks() does not find plateaus if they extend through the end of the data. + # to counter that, add one extremely large value to the right side of the data + # after negating it is extremely small -> Almost certainly smaller than the + # found plateau therefore the plateau does not extend through the border + # -> scipy.find_peaks finds it. Choose value from within that plateau. # fitted_bkps_val.append(100000000) # TODO: Approaching over find_peaks might not work if the initial decrease step to the # "correct" number of changepoints and additional decrease steps e.g. underfitting @@ -331,7 +331,6 @@ def calc_raw_states(arg_list, num_processes=8): # Very short benchmark yielded approx. 3 times the speed of solution not using sort -# TODO: Decide whether median is really the better baseline than mean def needs_refinement(signal, thresh): sorted_signal = sorted(signal) length_of_signal = len(signal) @@ -509,29 +508,28 @@ if __name__ == '__main__': configurations = json.load(f) # loop through all traces check if refinement is necessary resulting_sequence_list = [] - for num_config, measurements_by_configuration in enumerate(configurations): + for num_config, measurements_by_config in enumerate(configurations): # loop through all occurrences of the looked at state - print_info("Looking at state '" + measurements_by_configuration['name'] + "' with params: " - + str(measurements_by_configuration['parameter'])) + print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " + + str(measurements_by_config['parameter'])) refine = False print_info("Checking if refinement is necessary...") - for measurement in measurements_by_configuration['offline']: + for measurement in measurements_by_config['offline']: # loop through measurements of particular state # an check if state needs refinement signal = measurement['uW'] # mean = measurement['uW_mean'] - # TODO: Decide if median is really the better baseline than mean if needs_refinement(signal, opt_refinement_thresh) and not refine: print_info("Refinement is necessary!") refine = True if not refine: - print_info("No refinement necessary for state '" + measurements_by_configuration['name'] - + "' with params: " + str(measurements_by_configuration['parameter'])) + print_info("No refinement necessary for state '" + measurements_by_config['name'] + + "' with params: " + str(measurements_by_config['parameter'])) else: # assume that all measurements of the same param configuration are fundamentally # similar -> calculate penalty for first measurement, use it for all if opt_pen_override is None: - signal = np.array(measurements_by_configuration['offline'][0]['uW']) + signal = np.array(measurements_by_config['offline'][0]['uW']) normed_signal = norm_signal(signal) penalty = calculate_penalty_value(normed_signal, model=opt_model, range_min=opt_range_min, @@ -545,11 +543,11 @@ if __name__ == '__main__': # build arguments for parallel excecution print_info("Starting raw_states calculation.") raw_states_calc_args = [] - for num_measurement, measurement in enumerate(measurements_by_configuration['offline']): + for num_measurement, measurement in enumerate(measurements_by_config['offline']): 
raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model, opt_jump)) - raw_states_list = [None] * len(measurements_by_configuration['offline']) + raw_states_list = [None] * len(measurements_by_config['offline']) raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) # extracting result and putting it in correct order -> index of raw_states_list # entry still corresponds with index of measurement in measurements_by_states @@ -622,8 +620,6 @@ if __name__ == '__main__': # print_info("Cluster labels:\n" + str(cluster.labels_)) # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') # plt.show() - # TODO: Problem: Der Algorithmus nummeriert die Zustände nicht immer gleich... also bspw.: - # mal ist das tatsächliche Transmit mit 1 belabelt und mal mit 3 cluster_labels_list.append((num_trace, cluster.labels_)) num_cluster_list.append((num_trace, cluster.n_clusters_)) i = i + 1 @@ -739,7 +735,7 @@ if __name__ == '__main__': print_info("Confidence of resulting sequence is " + str(confidence) + " while using " + str(num_used_measurements) + "/" + str(len(raw_states_list)) + " measurements.") - print(resulting_sequence) + #print(resulting_sequence) resulting_sequence_list.append((num_config, resulting_sequence)) # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die @@ -750,7 +746,10 @@ if __name__ == '__main__': # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines - # Zustands ja nicht mehr ändern. + # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... + for num_config, sequence in resulting_sequence_list: + print_info("NO. config:" + str(num_config)) + print_info(sequence) elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From e15ac967c7e9b1b9f781ee9478f3b1e723d6177a Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 16 Jul 2020 16:41:19 +0200 Subject: Proof_Of_Concept_PELT: Fixed imports after merge --- bin/Proof_Of_Concept_PELT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 7726f53..de47d4a 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -9,7 +9,7 @@ from sklearn.cluster import AgglomerativeClustering import matplotlib.pyplot as plt import ruptures as rpt import numpy as np -from dfatool.dfatool import RawData +from dfatool.loader import RawData # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display -- cgit v1.2.3 From bf49cf3ccee8c6d3c91c6a2ac81d7923a35b198e Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Mon, 20 Jul 2020 23:48:21 +0200 Subject: bin/Proof_Of_Concept_PELT: Parametrisierung von raw_states sollte eigentlich vernünftig klappen. Für mindestens TX klappt das aber nicht. 
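In English, this commit parametrises the raw states: per-state mean power and duration are aggregated per parameter configuration and handed to dfatool for parameter fitting; per the subject this should work in principle, but does not yet for the TX data. The aggregate built in the patch below has roughly the following shape, shown with made-up parameter names and values for two raw states and three configurations; the dfatool calls mirror the ones used in the patch:

    from dfatool.utils import by_name_to_by_param
    from dfatool.model import ParallelParamFit
    from dfatool import parameters

    # Illustrative aggregate: one entry per raw state, one power/duration value
    # per usable parameter configuration (all names and numbers are made up).
    param_names = ["datarate", "txbytes", "txpower"]
    param_list = [[100, 16, 2], [100, 32, 2], [250, 16, 10]]
    by_name = {
        "state_0": {
            "param": param_list,
            "power": [9800.0, 9750.0, 10100.0],  # mean power per config, uW
            "duration": [1200, 1180, 1210],      # mean duration per config, trace samples
            "attributes": ["power", "duration"],
        },
        "state_1": {
            "param": param_list,
            "power": [54100.0, 60200.0, 78900.0],
            "duration": [2500, 4900, 2450],
            "attributes": ["power", "duration"],
        },
    }
    by_param = by_name_to_by_param(by_name)
    stats = parameters.ParamStats(by_name, by_param, param_names, dict())
    paramfit = ParallelParamFit(by_param)

Configurations whose number of raw states deviates from the majority are skipped, which is why the patch warns about a possible correlation between parameters and automaton structure.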
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 694 ++++++++++++++++++++++++++++--------------- 1 file changed, 456 insertions(+), 238 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index de47d4a..75cdce6 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -1,4 +1,5 @@ import json +import os import time import sys import getopt @@ -9,7 +10,12 @@ from sklearn.cluster import AgglomerativeClustering import matplotlib.pyplot as plt import ruptures as rpt import numpy as np + +from dfatool.functions import analytic from dfatool.loader import RawData +from dfatool import parameters +from dfatool.model import ParallelParamFit +from dfatool.utils import by_name_to_by_param # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display @@ -396,6 +402,8 @@ if __name__ == '__main__': "pen_modifier= " "plotting= " "refinement_thresh= " + "cache_dicts " + "cache_loc= " ) opt_filename = None opt_verbose = False @@ -412,6 +420,7 @@ if __name__ == '__main__': opt_pen_modifier = None opt_plotting = False opt_refinement_thresh = None + opt_cache_loc = None try: raw_opts, args = getopt.getopt(sys.argv[1:], "", optspec.split(" ")) @@ -495,6 +504,12 @@ if __name__ == '__main__': except ValueError as verr: print(verr, file=sys.stderr) sys.exit(-1) + if 'cache_dicts' in opt: + if 'cache_loc' in opt: + opt_cache_loc = opt['cache_loc'] + else: + print_error("If \"cache_dicts\" is set, \"cache_loc\" must be provided.") + sys.exit(-1) except getopt.GetoptError as err: print(err, file=sys.stderr) sys.exit(-1) @@ -506,250 +521,453 @@ if __name__ == '__main__': "Will only refine the state which is present in " + opt_filename + " if necessary.") with open(opt_filename, 'r') as f: configurations = json.load(f) + + # for i in range(0, 7): + # signal = np.array(configurations[i]['offline'][0]['uW']) + # plt.plot(signal) + # plt.xlabel('Time [us]') + # plt.ylabel('Power [mW]') + # plt.show() + # sys.exit() + # loop through all traces check if refinement is necessary - resulting_sequence_list = [] - for num_config, measurements_by_config in enumerate(configurations): - # loop through all occurrences of the looked at state - print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " - + str(measurements_by_config['parameter'])) - refine = False - print_info("Checking if refinement is necessary...") - for measurement in measurements_by_config['offline']: - # loop through measurements of particular state - # an check if state needs refinement - signal = measurement['uW'] - # mean = measurement['uW_mean'] - if needs_refinement(signal, opt_refinement_thresh) and not refine: - print_info("Refinement is necessary!") - refine = True - if not refine: - print_info("No refinement necessary for state '" + measurements_by_config['name'] - + "' with params: " + str(measurements_by_config['parameter'])) + # resulting_sequence_list = [] + # search for param_names, by_param and by_name files + by_param_file = None + by_name_file = None + param_names_file = None + if opt_cache_loc is not None: + flag = False + by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") + by_param_loc = os.path.join(opt_cache_loc, "by_param.txt") + param_names_loc = os.path.join(opt_cache_loc, "param_names.txt") + if os.path.isfile(by_name_loc) and os.path.getsize(by_name_loc) > 0: + by_name_file = open(by_name_loc, "r") + else: + print_error("In " + opt_cache_loc + " is no 
by_name.txt.") + flag = True + if os.path.isfile(by_param_loc) and os.path.getsize(by_param_loc) > 0: + by_param_file = open(by_param_loc, "r") + else: + print_error("In " + opt_cache_loc + " is no by_param.txt.") + flag = True + if os.path.isfile(param_names_loc) and os.path.getsize(param_names_loc) > 0: + param_names_file = open(param_names_loc, "r") else: - # assume that all measurements of the same param configuration are fundamentally - # similar -> calculate penalty for first measurement, use it for all - if opt_pen_override is None: - signal = np.array(measurements_by_config['offline'][0]['uW']) - normed_signal = norm_signal(signal) - penalty = calculate_penalty_value(normed_signal, model=opt_model, - range_min=opt_range_min, - range_max=opt_range_max, - num_processes=opt_num_processes, - jump=opt_jump, S=opt_S, - pen_modifier=opt_pen_modifier) - penalty = penalty[0] + print_error("In " + opt_cache_loc + " is no param_names.txt.") + flag = True + if flag: + print_info("The cache will be build.") + + if None in (by_param_file, by_name_file, param_names_file): + state_durations_by_config = [] + state_consumptions_by_config = [] + for num_config, measurements_by_config in enumerate(configurations): + # loop through all occurrences of the looked at state + print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " + + str(measurements_by_config['parameter']) + "(" + str(num_config + 1) + "/" + + str(len(configurations)) + ")") + refine = False + print_info("Checking if refinement is necessary...") + for measurement in measurements_by_config['offline']: + # loop through measurements of particular state + # an check if state needs refinement + signal = measurement['uW'] + # mean = measurement['uW_mean'] + if needs_refinement(signal, opt_refinement_thresh) and not refine: + print_info("Refinement is necessary!") + refine = True + if not refine: + print_info("No refinement necessary for state '" + measurements_by_config['name'] + + "' with params: " + str(measurements_by_config['parameter'])) else: - penalty = opt_pen_override - # build arguments for parallel excecution - print_info("Starting raw_states calculation.") - raw_states_calc_args = [] - for num_measurement, measurement in enumerate(measurements_by_config['offline']): - raw_states_calc_args.append((num_measurement, measurement, penalty, - opt_model, opt_jump)) - - raw_states_list = [None] * len(measurements_by_config['offline']) - raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) - # extracting result and putting it in correct order -> index of raw_states_list - # entry still corresponds with index of measurement in measurements_by_states - # -> If measurements are discarded the correct ones are easily recognized - for ret_val in raw_states_res: - num_trace = ret_val[0] - raw_states = ret_val[1] - avg_std = ret_val[2] - change_avg_std = ret_val[3] - # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch - # int sein oder nicht? Es scheint auch vernünftig zu klappen... - raw_states_list[num_trace] = raw_states - print_info("The average standard deviation for the newly found states in " - + "measurement No. 
" + str(num_trace) + " is " + str(avg_std)) - print_info("That is a reduction of " + str(change_avg_std)) - print_info("Finished raw_states calculation.") - num_states_array = [int()] * len(raw_states_list) - i = 0 - for i, x in enumerate(raw_states_list): - num_states_array[i] = len(x) - avg_num_states = np.mean(num_states_array) - num_states_dev = np.std(num_states_array) - print_info("On average " + str(avg_num_states) - + " States have been found. The standard deviation" - + " is " + str(num_states_dev)) - # TODO: MAGIC NUMBER - if num_states_dev > 1: - print_warning("The number of states varies strongly across measurements." - " Consider choosing a larger value for S or using the " - "pen_modifier option.") - time.sleep(5) - # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? - # Einfach Durchschnitt nehmen? - # Preliminary decision: Further on only use the traces, which have the most frequent state count - counts = np.bincount(num_states_array) - num_raw_states = np.argmax(counts) - print_info("Choose " + str(num_raw_states) + " as number of raw_states.") - i = 0 - cluster_labels_list = [] - num_cluster_list = [] - for num_trace, raw_states in enumerate(raw_states_list): - # iterate through raw states from measurements - if len(raw_states) == num_raw_states: - # build array with power values to cluster these - value_to_cluster = np.zeros((num_raw_states, 2)) - j = 0 - for s in raw_states: - value_to_cluster[j][0] = s[2] - value_to_cluster[j][1] = 0 - j = j + 1 - # linked = linkage(value_to_cluster, 'single') - # - # labelList = range(1, 11) - # - # plt.figure(figsize=(10, 7)) - # dendrogram(linked, - # orientation='top', - # distance_sort='descending', - # show_leaf_counts=True) - # plt.show() - # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER - # im distance_threshold - cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, - affinity='euclidean', - linkage='ward', - distance_threshold=opt_refinement_thresh * 100) - # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', - # linkage='ward') - cluster.fit_predict(value_to_cluster) - # print_info("Cluster labels:\n" + str(cluster.labels_)) - # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') - # plt.show() - cluster_labels_list.append((num_trace, cluster.labels_)) - num_cluster_list.append((num_trace, cluster.n_clusters_)) - i = i + 1 + # assume that all measurements of the same param configuration are fundamentally + # similar -> calculate penalty for first measurement, use it for all + if opt_pen_override is None: + signal = np.array(measurements_by_config['offline'][0]['uW']) + normed_signal = norm_signal(signal) + penalty = calculate_penalty_value(normed_signal, model=opt_model, + range_min=opt_range_min, + range_max=opt_range_max, + num_processes=opt_num_processes, + jump=opt_jump, S=opt_S, + pen_modifier=opt_pen_modifier) + penalty = penalty[0] else: - print_info("Discarding measurement No. " + str(num_trace) + " because it " - + "did not recognize the number of raw_states correctly.") - num_used_measurements = len(raw_states_list) - if i != len(raw_states_list): - if i / len(raw_states_list) <= 0.5: - print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) - + " Measurements for refinement. " - "Others did not recognize number of states correctly." 
- "\nYou should verify the integrity of the measurements.") + penalty = opt_pen_override + # build arguments for parallel excecution + print_info("Starting raw_states calculation.") + raw_states_calc_args = [] + for num_measurement, measurement in enumerate(measurements_by_config['offline']): + raw_states_calc_args.append((num_measurement, measurement, penalty, + opt_model, opt_jump)) + + raw_states_list = [None] * len(measurements_by_config['offline']) + raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) + # extracting result and putting it in correct order -> index of raw_states_list + # entry still corresponds with index of measurement in measurements_by_states + # -> If measurements are discarded the correct ones are easily recognized + for ret_val in raw_states_res: + num_trace = ret_val[0] + raw_states = ret_val[1] + avg_std = ret_val[2] + change_avg_std = ret_val[3] + # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch + # int sein oder nicht? Es scheint auch vernünftig zu klappen... + raw_states_list[num_trace] = raw_states + print_info("The average standard deviation for the newly found states in " + + "measurement No. " + str(num_trace) + " is " + str(avg_std)) + print_info("That is a reduction of " + str(change_avg_std)) + print_info("Finished raw_states calculation.") + num_states_array = [int()] * len(raw_states_list) + i = 0 + for i, x in enumerate(raw_states_list): + num_states_array[i] = len(x) + avg_num_states = np.mean(num_states_array) + num_states_dev = np.std(num_states_array) + print_info("On average " + str(avg_num_states) + + " States have been found. The standard deviation" + + " is " + str(num_states_dev)) + # TODO: MAGIC NUMBER + if num_states_dev > 1: + print_warning("The number of states varies strongly across measurements." + " Consider choosing a larger value for S or using the " + "pen_modifier option.") + time.sleep(5) + # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? + # Einfach Durchschnitt nehmen? + # Preliminary decision: Further on only use the traces, which have the most + # frequent state count + counts = np.bincount(num_states_array) + num_raw_states = np.argmax(counts) + print_info("Choose " + str(num_raw_states) + " as number of raw_states.") + # iterate through all found breakpoints and determine start and end points as well + # as power consumption + states_duration_list = [0] * num_raw_states + states_consumption_list = [0] * num_raw_states + num_used_measurements = 0 + for num_trace, raw_states in enumerate(raw_states_list): + if len(raw_states) == num_raw_states: + num_used_measurements = num_used_measurements + 1 + # calced_state = (start_time, end_time, mean_power, std_dev) + for num_state, s in enumerate(raw_states): + state_duration = s[1] - s[0] + state_consumption = s[2] + states_duration_list[num_state] = \ + states_duration_list[num_state] + state_duration + states_consumption_list[num_state] = \ + states_consumption_list[num_state] + state_consumption + else: + print_info("Discarding measurement No. 
" + str(num_trace) + " because it " + + "did not recognize the number of raw_states correctly.") + for i, x in enumerate(states_duration_list): + states_duration_list[i] = x / num_used_measurements + for i, x in enumerate(states_consumption_list): + states_consumption_list[i] = x / num_used_measurements + if num_used_measurements != len(raw_states_list): + if num_used_measurements / len(raw_states_list) <= 0.5: + print_warning("Only used " + str(num_used_measurements) + "/" + + str(len(raw_states_list)) + " Measurements for refinement. " + + "Others did not recognize number of states correctly." + + "\nYou should verify the integrity of the measurements.") + else: + print_info("Used " + str(num_used_measurements) + "/" + + str(len(raw_states_list)) + " Measurements for refinement. " + + "Others did not recognize number of states correctly.") + num_used_measurements = i + # TODO: DEBUG Kram + sys.exit(0) else: - print_info("Used " + str(i) + "/" + str(len(raw_states_list)) - + " Measurements for refinement. " - "Others did not recognize number of states correctly.") - num_used_measurements = i - # TODO: DEBUG Kram - sys.exit(0) + print_info("Used all available measurements.") + + state_durations_by_config.append((num_config, states_duration_list)) + state_consumptions_by_config.append((num_config, states_consumption_list)) + # # TODO: + # if num_config == 6: + # print("BRECHE AUS") + # break + + # combine all state durations and consumptions to parametrized model + + # this is only necessary because at this state only linear automatons can be modeled. + num_states_array = [int()] * len(state_consumptions_by_config) + for i, (_, states_consumption_list) in enumerate(state_consumptions_by_config): + num_states_array[i] = len(states_consumption_list) + counts = np.bincount(num_states_array) + num_raw_states = np.argmax(counts) + usable_configs = len(state_consumptions_by_config) + # param_list identical for each raw_state + # TODO: Kann man die echt einfach rausziehen aus der json? Ich hab sie nicht gefunden... + # Nur für jede Messung. Aber da sind die ja ohnehin identisch. + param_list = [] + param_names = configurations[0]['offline_aggregates']['paramkeys'][0] + print_info("param_names: " + str(param_names)) + for num_config, states_consumption_list in state_consumptions_by_config: + if len(states_consumption_list) != num_raw_states: + print_warning("Config No." + str(num_config) + " not usable yet due to different " + + "number of states. This hints a correlation between parameters and " + + "the structure of the resulting automaton. 
This will be possibly be" + + " supported in a future version of this tool.") + usable_configs = usable_configs - 1 else: - print_info("Used all available measurements.") - - num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) - avg_per_state_list = [None] * len(cluster_labels_list) - used_clusters = 0 - for number, (num_trace, labels) in enumerate(cluster_labels_list): - if num_cluster_list[number][1] == num_states: - avg_per_state = [0] * num_states - count_per_state = [0] * num_states - raw_states = raw_states_list[num_trace] - for num_label, label in enumerate(labels): - count_per_state[label] = count_per_state[label] + 1 - avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] - for i, _ in enumerate(avg_per_state): - avg_per_state[i] = avg_per_state[i] / count_per_state[i] - avg_per_state_list[number] = avg_per_state - used_clusters = used_clusters + 1 - else: - # hopefully this does not happen regularly - print_info("Discarding measurement " + str(number) - + " because the clustering yielded not matching results.") - num_used_measurements = num_used_measurements - 1 - if num_used_measurements == 0: - print_error("Something went terribly wrong. Discarded all measurements.") - # continue - sys.exit(-1) - # flattend version for clustering: - values_to_cluster = np.zeros((num_states * used_clusters, 2)) - index = 0 - for avg_per_state in avg_per_state_list: - if avg_per_state is not None: - for avg in avg_per_state: - values_to_cluster[index][0] = avg - values_to_cluster[index][1] = 0 - index = index + 1 - # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) - # plt.show() - cluster = AgglomerativeClustering(n_clusters=num_states) - cluster.fit_predict(values_to_cluster) - # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die - # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. - # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting - # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. - new_labels_list = [] - new_labels = [] - i = 0 - for label in cluster.labels_: - new_labels.append(label) - i = i + 1 - if i == num_states: - new_labels_list.append(new_labels) - new_labels = [] - i = 0 - # only the selected measurements are present in new_labels. - # new_labels_index should not be incremented, if not selected_measurement is skipped - new_labels_index = 0 - # cluster_labels_list contains all measurements -> if measurement is skipped - # still increment the index - index = 0 - for elem in avg_per_state_list: - if elem is not None: - for number, label in enumerate(cluster_labels_list[index][1]): - cluster_labels_list[index][1][number] = \ - new_labels_list[new_labels_index][label] - new_labels_index = new_labels_index + 1 - else: - # override not selected measurement labels to avoid choosing the wrong ones. - for number, label in enumerate(cluster_labels_list[index][1]): - cluster_labels_list[index][1][number] = -1 - index = index + 1 - resulting_sequence = [None] * num_raw_states - i = 0 - confidence = 0 - for x in resulting_sequence: - j = 0 - test_list = [] - for arr in [elem[1] for elem in cluster_labels_list]: - if num_cluster_list[j][1] != num_states: - j = j + 1 - else: - if -1 in arr: - print_error("Bei Janis beschweren! 
Fehler beim Umbenennen der" - " Zustände wahrscheinlich.") - sys.exit(-1) - test_list.append(arr[i]) - j = j + 1 - bincount = np.bincount(test_list) - resulting_sequence[i] = np.argmax(bincount) - confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) - i = i + 1 - confidence = confidence / len(resulting_sequence) - print_info("Confidence of resulting sequence is " + str(confidence) - + " while using " + str(num_used_measurements) + "/" - + str(len(raw_states_list)) + " measurements.") - #print(resulting_sequence) - resulting_sequence_list.append((num_config, resulting_sequence)) - # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat - # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die - # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen - # auftreten. - # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. - # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: - # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche - # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, - # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines - # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... - for num_config, sequence in resulting_sequence_list: - print_info("NO. config:" + str(num_config)) - print_info(sequence) + param_list.append(configurations[num_config]['offline_aggregates']['param'][0]) + print_info("param_list: " + str(param_list)) + + if usable_configs == len(state_consumptions_by_config): + print_info("All configs usable.") + else: + print_info("Using only " + str(usable_configs) + " Configs.") + by_name = {} + for i in range(num_raw_states): + consumptions_for_state = [] + durations_for_state = [] + for j, (_, states_consumption_list) in enumerate(state_consumptions_by_config): + consumptions_for_state.append(states_consumption_list[i]) + durations_for_state.append(state_durations_by_config[j][1][i]) + name = "state_" + str(i) + state_dict = { + "param": param_list, + "power": consumptions_for_state, + "duration": durations_for_state, + "attributes": ["power", "duration"] + } + by_name[name] = state_dict + by_param = by_name_to_by_param(by_name) + if opt_cache_loc is not None: + by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") + by_param_loc = os.path.join(opt_cache_loc, "by_param.txt") + param_names_loc = os.path.join(opt_cache_loc, "param_names.txt") + f = open(by_name_loc, "w") + f.write(str(by_name)) + f.close() + f = open(by_param_loc, "w") + f.write(str(by_param)) + f.close() + f = open(param_names_loc, "w") + f.write(str(param_names)) + f.close() + else: + by_name_text = str(by_name_file.read()) + by_name = eval(by_name_text) + by_param_text = str(by_param_file.read()) + by_param = eval(by_param_text) + param_names_text = str(param_names_file.read()) + param_names = eval(param_names_text) + + # t = 0 + # last_pow = 0 + # for key in by_name.keys(): + # end_t = t + np.mean(by_name[key]["duration"]) + # power = np.mean(by_name[key]["power"]) + # plt.vlines(t, min(last_pow, power), max(last_pow, power)) + # plt.hlines(power, t, end_t) + # t = end_t + # last_pow = power + # plt.show() + stats = parameters.ParamStats(by_name, by_param, param_names, dict()) + paramfit = ParallelParamFit(by_param) + for state_name in by_name.keys(): + for num_param, param_name 
in enumerate(param_names): + if stats.depends_on_param(state_name, "power", param_name): + paramfit.enqueue(state_name, "power", num_param, param_name) + if stats.depends_on_param(state_name, "duration", param_name): + paramfit.enqueue(state_name, "duration", num_param, param_name) + print_info("State " + state_name + "s power depends on param " + param_name + ":" + + str(stats.depends_on_param(state_name, "power", param_name)) + ) + print_info("State " + state_name + "s duration depends on param " + param_name + ":" + + str(stats.depends_on_param(state_name, "duration", param_name)) + ) + paramfit.fit() + fit_res_dur_list = [] + fit_res_pow_list = [] + for state_name in by_name.keys(): + fit_power = paramfit.get_result(state_name, "power") + fit_duration = paramfit.get_result(state_name, "duration") + combined_fit_power = analytic.function_powerset(fit_power, param_names, 0) + combined_fit_duration = analytic.function_powerset(fit_duration, param_names, 0) + combined_fit_power.fit(by_param, state_name, "power") + if not combined_fit_power.fit_success: + print_warning("Fitting(power) for state " + state_name + " was not succesful!") + combined_fit_duration.fit(by_param, state_name, "duration") + if not combined_fit_duration.fit_success: + print_warning("Fitting(duration) for state " + state_name + " was not succesful!") + fit_res_pow_list.append(combined_fit_power) + fit_res_dur_list.append(combined_fit_duration) + + + # TODO: removed clustering (temporarily), since it provided too much dificultys + # at the current state + # i = 0 + # cluster_labels_list = [] + # num_cluster_list = [] + # for num_trace, raw_states in enumerate(raw_states_list): + # # iterate through raw states from measurements + # if len(raw_states) == num_raw_states: + # # build array with power values to cluster these + # value_to_cluster = np.zeros((num_raw_states, 2)) + # j = 0 + # for s in raw_states: + # value_to_cluster[j][0] = s[2] + # value_to_cluster[j][1] = 0 + # j = j + 1 + # # linked = linkage(value_to_cluster, 'single') + # # + # # labelList = range(1, 11) + # # + # # plt.figure(figsize=(10, 7)) + # # dendrogram(linked, + # # orientation='top', + # # distance_sort='descending', + # # show_leaf_counts=True) + # # plt.show() + # # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER + # # im distance_threshold + # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, + # affinity='euclidean', + # linkage='ward', + # distance_threshold=opt_refinement_thresh * 100) + # # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', + # # linkage='ward') + # cluster.fit_predict(value_to_cluster) + # # print_info("Cluster labels:\n" + str(cluster.labels_)) + # # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') + # # plt.show() + # cluster_labels_list.append((num_trace, cluster.labels_)) + # num_cluster_list.append((num_trace, cluster.n_clusters_)) + # i = i + 1 + # else: + # print_info("Discarding measurement No. " + str(num_trace) + " because it " + # + "did not recognize the number of raw_states correctly.") + # num_used_measurements = len(raw_states_list) + # if i != len(raw_states_list): + # if i / len(raw_states_list) <= 0.5: + # print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) + # + " Measurements for refinement. " + # "Others did not recognize number of states correctly." 
+ # "\nYou should verify the integrity of the measurements.") + # else: + # print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + # + " Measurements for refinement. " + # "Others did not recognize number of states correctly.") + # num_used_measurements = i + # # TODO: DEBUG Kram + # sys.exit(0) + # else: + # print_info("Used all available measurements.") + # + # num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) + # avg_per_state_list = [None] * len(cluster_labels_list) + # used_clusters = 0 + # for number, (num_trace, labels) in enumerate(cluster_labels_list): + # if num_cluster_list[number][1] == num_states: + # avg_per_state = [0] * num_states + # count_per_state = [0] * num_states + # raw_states = raw_states_list[num_trace] + # for num_label, label in enumerate(labels): + # count_per_state[label] = count_per_state[label] + 1 + # avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] + # for i, _ in enumerate(avg_per_state): + # avg_per_state[i] = avg_per_state[i] / count_per_state[i] + # avg_per_state_list[number] = avg_per_state + # used_clusters = used_clusters + 1 + # else: + # # hopefully this does not happen regularly + # print_info("Discarding measurement " + str(number) + # + " because the clustering yielded not matching results.") + # num_used_measurements = num_used_measurements - 1 + # if num_used_measurements == 0: + # print_error("Something went terribly wrong. Discarded all measurements.") + # # continue + # sys.exit(-1) + # # flattend version for clustering: + # values_to_cluster = np.zeros((num_states * used_clusters, 2)) + # index = 0 + # for avg_per_state in avg_per_state_list: + # if avg_per_state is not None: + # for avg in avg_per_state: + # values_to_cluster[index][0] = avg + # values_to_cluster[index][1] = 0 + # index = index + 1 + # # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) + # # plt.show() + # cluster = AgglomerativeClustering(n_clusters=num_states) + # cluster.fit_predict(values_to_cluster) + # # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die + # # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. + # # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting + # # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. + # new_labels_list = [] + # new_labels = [] + # i = 0 + # for label in cluster.labels_: + # new_labels.append(label) + # i = i + 1 + # if i == num_states: + # new_labels_list.append(new_labels) + # new_labels = [] + # i = 0 + # # only the selected measurements are present in new_labels. + # # new_labels_index should not be incremented, if not selected_measurement is skipped + # new_labels_index = 0 + # # cluster_labels_list contains all measurements -> if measurement is skipped + # # still increment the index + # index = 0 + # for elem in avg_per_state_list: + # if elem is not None: + # for number, label in enumerate(cluster_labels_list[index][1]): + # cluster_labels_list[index][1][number] = \ + # new_labels_list[new_labels_index][label] + # new_labels_index = new_labels_index + 1 + # else: + # # override not selected measurement labels to avoid choosing the wrong ones. 
+ # for number, label in enumerate(cluster_labels_list[index][1]): + # cluster_labels_list[index][1][number] = -1 + # index = index + 1 + # resulting_sequence = [None] * num_raw_states + # i = 0 + # confidence = 0 + # for x in resulting_sequence: + # j = 0 + # test_list = [] + # for arr in [elem[1] for elem in cluster_labels_list]: + # if num_cluster_list[j][1] != num_states: + # j = j + 1 + # else: + # if -1 in arr: + # print_error("Bei Janis beschweren! Fehler beim Umbenennen der" + # " Zustände wahrscheinlich.") + # sys.exit(-1) + # test_list.append(arr[i]) + # j = j + 1 + # bincount = np.bincount(test_list) + # resulting_sequence[i] = np.argmax(bincount) + # confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) + # i = i + 1 + # confidence = confidence / len(resulting_sequence) + # print_info("Confidence of resulting sequence is " + str(confidence) + # + " while using " + str(num_used_measurements) + "/" + # + str(len(raw_states_list)) + " measurements.") + # #print(resulting_sequence) + # resulting_sequence_list.append((num_config, resulting_sequence)) + # # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat + # # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die + # # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen + # # auftreten. + # # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. + # # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: + # # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche + # # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, + # # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines + # # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... + # for num_config, sequence in resulting_sequence_list: + # print_info("NO. config:" + str(num_config)) + # print_info(sequence) + # + # + # + # + elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From 61fb6094a33c4855c763f1925e61aec90294daa3 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sun, 9 Aug 2020 15:11:42 +0200 Subject: Parametrisierung scheint vernünftig zu klappen. Vermutlich fertig. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 637 ++++++++++++++++++++++++++++--------------- 1 file changed, 418 insertions(+), 219 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 75cdce6..40c405d 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -14,13 +14,14 @@ import numpy as np from dfatool.functions import analytic from dfatool.loader import RawData from dfatool import parameters -from dfatool.model import ParallelParamFit +from dfatool.model import ParallelParamFit, PTAModel from dfatool.utils import by_name_to_by_param # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display -# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=10 --refinement_thresh=100 +# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 +# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 --cache_dicts --cache_loc="..\data\TX_cache" def plot_data_from_json(filename, trace_num, x_axis, y_axis): @@ -294,7 +295,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, # raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model # , opt_jump)) -def calc_raw_states_func(num_trace, measurement, penalty, model, jump): +def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): signal = np.array(measurement['uW']) normed_signal = norm_signal(signal) bkpts = calc_pelt(normed_signal, penalty, model=model, jump=jump) @@ -325,7 +326,7 @@ def calc_raw_states_func(num_trace, measurement, penalty, model, jump): # print_info("The average standard deviation for the newly found states is " # + str(new_avg_std)) # print_info("That is a reduction of " + str(change_avg_std)) - return num_trace, calced_states, new_avg_std, change_avg_std + return num_measurement, calced_states, new_avg_std, change_avg_std def calc_raw_states(arg_list, num_processes=8): @@ -382,6 +383,27 @@ def norm_signal(signal): return normed_signal +def norm_values_to_cluster(values_to_cluster): + new_vals = np.array(values_to_cluster) + num_samples = len(values_to_cluster) + num_params = len(values_to_cluster[0]) + for i in range(num_params): + param_vals = [] + for sample in new_vals: + param_vals.append(sample[i]) + max_val = np.max(np.abs(param_vals)) + for num_sample, sample in enumerate(new_vals): + values_to_cluster[num_sample][i] = sample[i] / max_val + return new_vals + + +def get_state_num(state_name, distinct_states): + for state_num, states in enumerate(distinct_states): + if state_name in states: + return state_num + return -1 + + if __name__ == '__main__': # OPTION RECOGNITION opt = dict() @@ -536,6 +558,7 @@ if __name__ == '__main__': by_param_file = None by_name_file = None param_names_file = None + from_cache = False if opt_cache_loc is not None: flag = False by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") @@ -558,6 +581,12 @@ if __name__ == '__main__': flag = True if flag: print_info("The cache will be build.") + else: + print_warning("THE OPTION \"cache_dicts\" IS FOR DEBUGGING PURPOSES ONLY! " + "\nDO NOT USE FOR REGULAR APPLICATIONS!" + "\nThe script will not run to the end properly." 
+ "\nNo final parametrization will be done.") + from_cache = True if None in (by_param_file, by_name_file, param_names_file): state_durations_by_config = [] @@ -565,7 +594,8 @@ if __name__ == '__main__': for num_config, measurements_by_config in enumerate(configurations): # loop through all occurrences of the looked at state print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " - + str(measurements_by_config['parameter']) + "(" + str(num_config + 1) + "/" + + str(measurements_by_config['parameter']) + "(" + str( + num_config + 1) + "/" + str(len(configurations)) + ")") refine = False print_info("Checking if refinement is necessary...") @@ -578,8 +608,9 @@ if __name__ == '__main__': print_info("Refinement is necessary!") refine = True if not refine: - print_info("No refinement necessary for state '" + measurements_by_config['name'] - + "' with params: " + str(measurements_by_config['parameter'])) + print_info( + "No refinement necessary for state '" + measurements_by_config['name'] + + "' with params: " + str(measurements_by_config['parameter'])) else: # assume that all measurements of the same param configuration are fundamentally # similar -> calculate penalty for first measurement, use it for all @@ -598,7 +629,8 @@ if __name__ == '__main__': # build arguments for parallel excecution print_info("Starting raw_states calculation.") raw_states_calc_args = [] - for num_measurement, measurement in enumerate(measurements_by_config['offline']): + for num_measurement, measurement in enumerate( + measurements_by_config['offline']): raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model, opt_jump)) @@ -608,15 +640,16 @@ if __name__ == '__main__': # entry still corresponds with index of measurement in measurements_by_states # -> If measurements are discarded the correct ones are easily recognized for ret_val in raw_states_res: - num_trace = ret_val[0] + num_measurement = ret_val[0] raw_states = ret_val[1] avg_std = ret_val[2] change_avg_std = ret_val[3] # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch # int sein oder nicht? Es scheint auch vernünftig zu klappen... - raw_states_list[num_trace] = raw_states + raw_states_list[num_measurement] = raw_states print_info("The average standard deviation for the newly found states in " - + "measurement No. " + str(num_trace) + " is " + str(avg_std)) + + "measurement No. 
" + str(num_measurement) + " is " + str( + avg_std)) print_info("That is a reduction of " + str(change_avg_std)) print_info("Finished raw_states calculation.") num_states_array = [int()] * len(raw_states_list) @@ -643,37 +676,46 @@ if __name__ == '__main__': print_info("Choose " + str(num_raw_states) + " as number of raw_states.") # iterate through all found breakpoints and determine start and end points as well # as power consumption - states_duration_list = [0] * num_raw_states - states_consumption_list = [0] * num_raw_states + num_measurements = len(raw_states_list) + states_duration_list = [list()] * num_raw_states + states_consumption_list = [list()] * num_raw_states + for num_elem, _ in enumerate(states_duration_list): + states_duration_list[num_elem] = [0] * num_measurements + states_consumption_list[num_elem] = [0] * num_measurements num_used_measurements = 0 - for num_trace, raw_states in enumerate(raw_states_list): + for num_measurement, raw_states in enumerate(raw_states_list): if len(raw_states) == num_raw_states: num_used_measurements = num_used_measurements + 1 - # calced_state = (start_time, end_time, mean_power, std_dev) for num_state, s in enumerate(raw_states): - state_duration = s[1] - s[0] - state_consumption = s[2] - states_duration_list[num_state] = \ - states_duration_list[num_state] + state_duration - states_consumption_list[num_state] = \ - states_consumption_list[num_state] + state_consumption + states_duration_list[num_state][num_measurement] = s[1] - s[0] + states_consumption_list[num_state][num_measurement] = s[2] + # calced_state = (start_time, end_time, mean_power, std_dev) + # for num_state, s in enumerate(raw_states): + # state_duration = s[1] - s[0] + # state_consumption = s[2] + # states_duration_list[num_state] = \ + # states_duration_list[num_state] + state_duration + # states_consumption_list[num_state] = \ + # states_consumption_list[num_state] + state_consumption else: - print_info("Discarding measurement No. " + str(num_trace) + " because it " - + "did not recognize the number of raw_states correctly.") - for i, x in enumerate(states_duration_list): - states_duration_list[i] = x / num_used_measurements - for i, x in enumerate(states_consumption_list): - states_consumption_list[i] = x / num_used_measurements + print_info("Discarding measurement No. " + str(num_measurement) + + " because it did not recognize the number of " + "raw_states correctly.") + # for i, x in enumerate(states_duration_list): + # states_duration_list[i] = x / num_used_measurements + # for i, x in enumerate(states_consumption_list): + # states_consumption_list[i] = x / num_used_measurements if num_used_measurements != len(raw_states_list): if num_used_measurements / len(raw_states_list) <= 0.5: print_warning("Only used " + str(num_used_measurements) + "/" - + str(len(raw_states_list)) + " Measurements for refinement. " + + str( + len(raw_states_list)) + " Measurements for refinement. " + "Others did not recognize number of states correctly." + "\nYou should verify the integrity of the measurements.") else: print_info("Used " + str(num_used_measurements) + "/" - + str(len(raw_states_list)) + " Measurements for refinement. " - + "Others did not recognize number of states correctly.") + + str(len(raw_states_list)) + " Measurements for refinement." 
+ + " Others did not recognize number of states correctly.") num_used_measurements = i # TODO: DEBUG Kram sys.exit(0) @@ -697,20 +739,19 @@ if __name__ == '__main__': num_raw_states = np.argmax(counts) usable_configs = len(state_consumptions_by_config) # param_list identical for each raw_state - # TODO: Kann man die echt einfach rausziehen aus der json? Ich hab sie nicht gefunden... - # Nur für jede Messung. Aber da sind die ja ohnehin identisch. param_list = [] param_names = configurations[0]['offline_aggregates']['paramkeys'][0] print_info("param_names: " + str(param_names)) for num_config, states_consumption_list in state_consumptions_by_config: if len(states_consumption_list) != num_raw_states: - print_warning("Config No." + str(num_config) + " not usable yet due to different " - + "number of states. This hints a correlation between parameters and " - + "the structure of the resulting automaton. This will be possibly be" - + " supported in a future version of this tool.") + print_warning( + "Config No." + str(num_config) + " not usable yet due to different " + + "number of states. This hints a correlation between parameters and " + + "the structure of the resulting automaton. This will be possibly" + + " supported in a future version of this tool.") usable_configs = usable_configs - 1 else: - param_list.append(configurations[num_config]['offline_aggregates']['param'][0]) + param_list.extend(configurations[num_config]['offline_aggregates']['param']) print_info("param_list: " + str(param_list)) if usable_configs == len(state_consumptions_by_config): @@ -722,16 +763,16 @@ if __name__ == '__main__': consumptions_for_state = [] durations_for_state = [] for j, (_, states_consumption_list) in enumerate(state_consumptions_by_config): - consumptions_for_state.append(states_consumption_list[i]) - durations_for_state.append(state_durations_by_config[j][1][i]) - name = "state_" + str(i) + consumptions_for_state.extend(states_consumption_list[i]) + durations_for_state.extend(state_durations_by_config[j][1][i]) + state_name = "state_" + str(i) state_dict = { "param": param_list, "power": consumptions_for_state, "duration": durations_for_state, "attributes": ["power", "duration"] } - by_name[name] = state_dict + by_name[state_name] = state_dict by_param = by_name_to_by_param(by_name) if opt_cache_loc is not None: by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") @@ -779,8 +820,8 @@ if __name__ == '__main__': + str(stats.depends_on_param(state_name, "duration", param_name)) ) paramfit.fit() - fit_res_dur_list = [] - fit_res_pow_list = [] + fit_res_dur_dict = {} + fit_res_pow_dict = {} for state_name in by_name.keys(): fit_power = paramfit.get_result(state_name, "power") fit_duration = paramfit.get_result(state_name, "duration") @@ -792,182 +833,340 @@ if __name__ == '__main__': combined_fit_duration.fit(by_param, state_name, "duration") if not combined_fit_duration.fit_success: print_warning("Fitting(duration) for state " + state_name + " was not succesful!") - fit_res_pow_list.append(combined_fit_power) - fit_res_dur_list.append(combined_fit_duration) - - - # TODO: removed clustering (temporarily), since it provided too much dificultys - # at the current state - # i = 0 - # cluster_labels_list = [] - # num_cluster_list = [] - # for num_trace, raw_states in enumerate(raw_states_list): - # # iterate through raw states from measurements - # if len(raw_states) == num_raw_states: - # # build array with power values to cluster these - # value_to_cluster = np.zeros((num_raw_states, 2)) - # j = 0 - # 
for s in raw_states: - # value_to_cluster[j][0] = s[2] - # value_to_cluster[j][1] = 0 - # j = j + 1 - # # linked = linkage(value_to_cluster, 'single') - # # - # # labelList = range(1, 11) - # # - # # plt.figure(figsize=(10, 7)) - # # dendrogram(linked, - # # orientation='top', - # # distance_sort='descending', - # # show_leaf_counts=True) - # # plt.show() - # # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER - # # im distance_threshold - # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, - # affinity='euclidean', - # linkage='ward', - # distance_threshold=opt_refinement_thresh * 100) - # # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', - # # linkage='ward') - # cluster.fit_predict(value_to_cluster) - # # print_info("Cluster labels:\n" + str(cluster.labels_)) - # # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') - # # plt.show() - # cluster_labels_list.append((num_trace, cluster.labels_)) - # num_cluster_list.append((num_trace, cluster.n_clusters_)) - # i = i + 1 - # else: - # print_info("Discarding measurement No. " + str(num_trace) + " because it " - # + "did not recognize the number of raw_states correctly.") - # num_used_measurements = len(raw_states_list) - # if i != len(raw_states_list): - # if i / len(raw_states_list) <= 0.5: - # print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) - # + " Measurements for refinement. " - # "Others did not recognize number of states correctly." - # "\nYou should verify the integrity of the measurements.") - # else: - # print_info("Used " + str(i) + "/" + str(len(raw_states_list)) - # + " Measurements for refinement. " - # "Others did not recognize number of states correctly.") - # num_used_measurements = i - # # TODO: DEBUG Kram - # sys.exit(0) - # else: - # print_info("Used all available measurements.") - # - # num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) - # avg_per_state_list = [None] * len(cluster_labels_list) - # used_clusters = 0 - # for number, (num_trace, labels) in enumerate(cluster_labels_list): - # if num_cluster_list[number][1] == num_states: - # avg_per_state = [0] * num_states - # count_per_state = [0] * num_states - # raw_states = raw_states_list[num_trace] - # for num_label, label in enumerate(labels): - # count_per_state[label] = count_per_state[label] + 1 - # avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] - # for i, _ in enumerate(avg_per_state): - # avg_per_state[i] = avg_per_state[i] / count_per_state[i] - # avg_per_state_list[number] = avg_per_state - # used_clusters = used_clusters + 1 - # else: - # # hopefully this does not happen regularly - # print_info("Discarding measurement " + str(number) - # + " because the clustering yielded not matching results.") - # num_used_measurements = num_used_measurements - 1 - # if num_used_measurements == 0: - # print_error("Something went terribly wrong. 
Discarded all measurements.") - # # continue - # sys.exit(-1) - # # flattend version for clustering: - # values_to_cluster = np.zeros((num_states * used_clusters, 2)) - # index = 0 - # for avg_per_state in avg_per_state_list: - # if avg_per_state is not None: - # for avg in avg_per_state: - # values_to_cluster[index][0] = avg - # values_to_cluster[index][1] = 0 - # index = index + 1 - # # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) - # # plt.show() - # cluster = AgglomerativeClustering(n_clusters=num_states) - # cluster.fit_predict(values_to_cluster) - # # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die - # # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. - # # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting - # # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. - # new_labels_list = [] - # new_labels = [] - # i = 0 - # for label in cluster.labels_: - # new_labels.append(label) - # i = i + 1 - # if i == num_states: - # new_labels_list.append(new_labels) - # new_labels = [] - # i = 0 - # # only the selected measurements are present in new_labels. - # # new_labels_index should not be incremented, if not selected_measurement is skipped - # new_labels_index = 0 - # # cluster_labels_list contains all measurements -> if measurement is skipped - # # still increment the index - # index = 0 - # for elem in avg_per_state_list: - # if elem is not None: - # for number, label in enumerate(cluster_labels_list[index][1]): - # cluster_labels_list[index][1][number] = \ - # new_labels_list[new_labels_index][label] - # new_labels_index = new_labels_index + 1 - # else: - # # override not selected measurement labels to avoid choosing the wrong ones. - # for number, label in enumerate(cluster_labels_list[index][1]): - # cluster_labels_list[index][1][number] = -1 - # index = index + 1 - # resulting_sequence = [None] * num_raw_states - # i = 0 - # confidence = 0 - # for x in resulting_sequence: - # j = 0 - # test_list = [] - # for arr in [elem[1] for elem in cluster_labels_list]: - # if num_cluster_list[j][1] != num_states: - # j = j + 1 - # else: - # if -1 in arr: - # print_error("Bei Janis beschweren! Fehler beim Umbenennen der" - # " Zustände wahrscheinlich.") - # sys.exit(-1) - # test_list.append(arr[i]) - # j = j + 1 - # bincount = np.bincount(test_list) - # resulting_sequence[i] = np.argmax(bincount) - # confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) - # i = i + 1 - # confidence = confidence / len(resulting_sequence) - # print_info("Confidence of resulting sequence is " + str(confidence) - # + " while using " + str(num_used_measurements) + "/" - # + str(len(raw_states_list)) + " measurements.") - # #print(resulting_sequence) - # resulting_sequence_list.append((num_config, resulting_sequence)) - # # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat - # # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die - # # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen - # # auftreten. - # # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. - # # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: - # # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche - # # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 
1Dim = Leistungsaufnahme, - # # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines - # # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... - # for num_config, sequence in resulting_sequence_list: - # print_info("NO. config:" + str(num_config)) - # print_info(sequence) - # - # - # - # - + fit_res_pow_dict[state_name] = combined_fit_power + fit_res_dur_dict[state_name] = combined_fit_duration + # only raw_states with the same number of function parameters can be similar + num_param_pow_dict = {} + num_param_dur_dict = {} + for state_name in by_name.keys(): + model_function = str(fit_res_pow_dict[state_name].model_function) + model_args = fit_res_pow_dict[state_name].model_args + num_param_pow_dict[state_name] = len(model_args) + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print_info("Power-Function for state " + state_name + ": " + + model_function) + for state_name in by_name.keys(): + model_function = str(fit_res_dur_dict[state_name].model_function) + model_args = fit_res_dur_dict[state_name].model_args + num_param_dur_dict[state_name] = len(model_args) + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print_info("Duration-Function for state " + state_name + ": " + + model_function) + similar_raw_state_buckets = {} + for state_name in by_name.keys(): + pow_model_function = str(fit_res_pow_dict[state_name].model_function) + dur_model_function = str(fit_res_dur_dict[state_name].model_function) + key_tuple = (pow_model_function, dur_model_function) + if key_tuple not in similar_raw_state_buckets: + similar_raw_state_buckets[key_tuple] = [] + similar_raw_state_buckets[key_tuple].append(state_name) + + # cluster for each Key-Tuple using the function parameters + distinct_states = [] + for key_tuple in similar_raw_state_buckets.keys(): + print_info("Key-Tuple " + str(key_tuple) + ": " + + str(similar_raw_state_buckets[key_tuple])) + similar_states = similar_raw_state_buckets[key_tuple] + if len(similar_states) > 1: + # functions are identical -> num_params is identical + num_params = num_param_dur_dict[similar_states[0]] + num_param_pow_dict[ + similar_states[0]] + values_to_cluster = np.zeros((len(similar_states), num_params)) + for num_state, state_name in enumerate(similar_states): + dur_params = fit_res_dur_dict[state_name].model_args + pow_params = fit_res_pow_dict[state_name].model_args + j = 0 + for param in pow_params: + values_to_cluster[num_state][j] = param + j = j + 1 + for param in dur_params: + values_to_cluster[num_state][j] = param + j = j + 1 + normed_vals_to_cluster = norm_values_to_cluster(values_to_cluster) + cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, + affinity='euclidean', + linkage='ward', + # TODO: Magic Number. 
Beim Evaluieren finetunen + distance_threshold=1) + cluster.fit_predict(values_to_cluster) + cluster_labels = cluster.labels_ + print_info("Cluster labels:\n" + str(cluster_labels)) + if cluster.n_clusters_ > 1: + # more than one distinct state found + distinct_state_dict = {} + for num_state, label in enumerate(cluster_labels): + if label not in distinct_state_dict.keys(): + distinct_state_dict[label] = [] + distinct_state_dict[label].append(similar_states[num_state]) + for distinct_state_key in distinct_state_dict.keys(): + distinct_states.append(distinct_state_dict[distinct_state_key]) + else: + distinct_states.append(similar_states) + else: + distinct_states.append(similar_states) + for num_state, distinct_state in enumerate(distinct_states): + print("State " + str(num_state) + ": " + str(distinct_state)) + num_raw_states = len(by_name.keys()) + resulting_sequence = [int] * num_raw_states + for i in range(num_raw_states): + state_name = "state_" + str(i) + state_num = get_state_num(state_name, distinct_states) + if state_num == -1: + print_error("Critical Error when creating the resulting sequence. raw_state state_" + + str(i) + " could not be mapped to a state.") + sys.exit(-1) + resulting_sequence[i] = state_num + print("Resulting sequence is: " + str(resulting_sequence)) + # if from_cache: + # print_warning( + # "YOU USED THE OPTION \"cache_dicts\". THIS IS FOR DEBUGGING PURPOSES ONLY!" + # "\nTHE SCRIPT WILL NOW STOP PREMATURELY," + # "SINCE DATA FOR FURTHER COMPUTATION IS MISSING!") + # sys.exit(0) + + new_by_name = {} + for num_state, distinct_state in enumerate(distinct_states): + state_name = "State_" + str(num_state) + consumptions_for_state = [] + durations_for_state = [] + param_list = [] + for raw_state in distinct_state: + original_state_dict = by_name[raw_state] + param_list.extend(original_state_dict["param"]) + consumptions_for_state.extend(original_state_dict["power"]) + durations_for_state.extend(original_state_dict["duration"]) + new_state_dict = { + "param": param_list, + "power": consumptions_for_state, + "duration": durations_for_state, + "attributes": ["power", "duration"] + } + new_by_name[state_name] = new_state_dict + new_by_param = by_name_to_by_param(new_by_name) + new_stats = parameters.ParamStats(new_by_name, new_by_param, param_names, dict()) + new_paramfit = ParallelParamFit(new_by_param) + for state_name in new_by_name.keys(): + for num_param, param_name in enumerate(param_names): + if new_stats.depends_on_param(state_name, "power", param_name): + new_paramfit.enqueue(state_name, "power", num_param, param_name) + if new_stats.depends_on_param(state_name, "duration", param_name): + new_paramfit.enqueue(state_name, "duration", num_param, param_name) + print_info("State " + state_name + "s power depends on param " + param_name + ":" + + str(new_stats.depends_on_param(state_name, "power", param_name)) + ) + print_info("State " + state_name + "s duration depends on param " + param_name + ":" + + str(new_stats.depends_on_param(state_name, "duration", param_name)) + ) + new_paramfit.fit() + new_fit_res_dur_dict = {} + new_fit_res_pow_dict = {} + for state_name in new_by_name.keys(): + fit_power = new_paramfit.get_result(state_name, "power") + fit_duration = new_paramfit.get_result(state_name, "duration") + combined_fit_power = analytic.function_powerset(fit_power, param_names, 0) + combined_fit_duration = analytic.function_powerset(fit_duration, param_names, 0) + combined_fit_power.fit(new_by_param, state_name, "power") + if not combined_fit_power.fit_success: 
+ print_warning("Fitting(power) for state " + state_name + " was not succesful!") + combined_fit_duration.fit(new_by_param, state_name, "duration") + if not combined_fit_duration.fit_success: + print_warning("Fitting(duration) for state " + state_name + " was not succesful!") + new_fit_res_pow_dict[state_name] = combined_fit_power + new_fit_res_dur_dict[state_name] = combined_fit_duration + for state_name in new_by_name.keys(): + model_function = str(new_fit_res_pow_dict[state_name].model_function) + model_args = new_fit_res_pow_dict[state_name].model_args + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print("Power-Function for state " + state_name + ": " + + model_function) + for state_name in new_by_name.keys(): + model_function = str(new_fit_res_dur_dict[state_name].model_function) + model_args = new_fit_res_dur_dict[state_name].model_args + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print("Duration-Function for state " + state_name + ": " + + model_function) + model = PTAModel(by_name, param_names, dict()) + + + # TODO: removed clustering (temporarily), since it provided too much dificultys + # at the current state + # i = 0 + # cluster_labels_list = [] + # num_cluster_list = [] + # for num_trace, raw_states in enumerate(raw_states_list): + # # iterate through raw states from measurements + # if len(raw_states) == num_raw_states: + # # build array with power values to cluster these + # value_to_cluster = np.zeros((num_raw_states, 2)) + # j = 0 + # for s in raw_states: + # value_to_cluster[j][0] = s[2] + # value_to_cluster[j][1] = 0 + # j = j + 1 + # # linked = linkage(value_to_cluster, 'single') + # # + # # labelList = range(1, 11) + # # + # # plt.figure(figsize=(10, 7)) + # # dendrogram(linked, + # # orientation='top', + # # distance_sort='descending', + # # show_leaf_counts=True) + # # plt.show() + # # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER + # # im distance_threshold + # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, + # affinity='euclidean', + # linkage='ward', + # distance_threshold=opt_refinement_thresh * 100) + # # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', + # # linkage='ward') + # cluster.fit_predict(value_to_cluster) + # # print_info("Cluster labels:\n" + str(cluster.labels_)) + # # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') + # # plt.show() + # cluster_labels_list.append((num_trace, cluster.labels_)) + # num_cluster_list.append((num_trace, cluster.n_clusters_)) + # i = i + 1 + # else: + # print_info("Discarding measurement No. " + str(num_trace) + " because it " + # + "did not recognize the number of raw_states correctly.") + # num_used_measurements = len(raw_states_list) + # if i != len(raw_states_list): + # if i / len(raw_states_list) <= 0.5: + # print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) + # + " Measurements for refinement. " + # "Others did not recognize number of states correctly." + # "\nYou should verify the integrity of the measurements.") + # else: + # print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + # + " Measurements for refinement. 
" + # "Others did not recognize number of states correctly.") + # num_used_measurements = i + # # TODO: DEBUG Kram + # sys.exit(0) + # else: + # print_info("Used all available measurements.") + # + # num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) + # avg_per_state_list = [None] * len(cluster_labels_list) + # used_clusters = 0 + # for number, (num_trace, labels) in enumerate(cluster_labels_list): + # if num_cluster_list[number][1] == num_states: + # avg_per_state = [0] * num_states + # count_per_state = [0] * num_states + # raw_states = raw_states_list[num_trace] + # for num_label, label in enumerate(labels): + # count_per_state[label] = count_per_state[label] + 1 + # avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] + # for i, _ in enumerate(avg_per_state): + # avg_per_state[i] = avg_per_state[i] / count_per_state[i] + # avg_per_state_list[number] = avg_per_state + # used_clusters = used_clusters + 1 + # else: + # # hopefully this does not happen regularly + # print_info("Discarding measurement " + str(number) + # + " because the clustering yielded not matching results.") + # num_used_measurements = num_used_measurements - 1 + # if num_used_measurements == 0: + # print_error("Something went terribly wrong. Discarded all measurements.") + # # continue + # sys.exit(-1) + # # flattend version for clustering: + # values_to_cluster = np.zeros((num_states * used_clusters, 2)) + # index = 0 + # for avg_per_state in avg_per_state_list: + # if avg_per_state is not None: + # for avg in avg_per_state: + # values_to_cluster[index][0] = avg + # values_to_cluster[index][1] = 0 + # index = index + 1 + # # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) + # # plt.show() + # cluster = AgglomerativeClustering(n_clusters=num_states) + # cluster.fit_predict(values_to_cluster) + # # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die + # # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. + # # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting + # # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. + # new_labels_list = [] + # new_labels = [] + # i = 0 + # for label in cluster.labels_: + # new_labels.append(label) + # i = i + 1 + # if i == num_states: + # new_labels_list.append(new_labels) + # new_labels = [] + # i = 0 + # # only the selected measurements are present in new_labels. + # # new_labels_index should not be incremented, if not selected_measurement is skipped + # new_labels_index = 0 + # # cluster_labels_list contains all measurements -> if measurement is skipped + # # still increment the index + # index = 0 + # for elem in avg_per_state_list: + # if elem is not None: + # for number, label in enumerate(cluster_labels_list[index][1]): + # cluster_labels_list[index][1][number] = \ + # new_labels_list[new_labels_index][label] + # new_labels_index = new_labels_index + 1 + # else: + # # override not selected measurement labels to avoid choosing the wrong ones. + # for number, label in enumerate(cluster_labels_list[index][1]): + # cluster_labels_list[index][1][number] = -1 + # index = index + 1 + # resulting_sequence = [None] * num_raw_states + # i = 0 + # confidence = 0 + # for x in resulting_sequence: + # j = 0 + # test_list = [] + # for arr in [elem[1] for elem in cluster_labels_list]: + # if num_cluster_list[j][1] != num_states: + # j = j + 1 + # else: + # if -1 in arr: + # print_error("Bei Janis beschweren! 
Fehler beim Umbenennen der" + # " Zustände wahrscheinlich.") + # sys.exit(-1) + # test_list.append(arr[i]) + # j = j + 1 + # bincount = np.bincount(test_list) + # resulting_sequence[i] = np.argmax(bincount) + # confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) + # i = i + 1 + # confidence = confidence / len(resulting_sequence) + # print_info("Confidence of resulting sequence is " + str(confidence) + # + " while using " + str(num_used_measurements) + "/" + # + str(len(raw_states_list)) + " measurements.") + # #print(resulting_sequence) + # resulting_sequence_list.append((num_config, resulting_sequence)) + # # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat + # # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die + # # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen + # # auftreten. + # # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. + # # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: + # # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche + # # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, + # # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines + # # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... + # for num_config, sequence in resulting_sequence_list: + # print_info("NO. config:" + str(num_config)) + # print_info(sequence) + # + # + # + # elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From 2a1aee9b92085e50050ea22b547db450da820eab Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Mon, 10 Aug 2020 16:40:46 +0200 Subject: Proof_Of_Concept_PELT: Kleine Bugfixes, für den Fall dass nicht alle Messungen verwendet werden können. Verbesserung der Normierung des Signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 69 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 11 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 40c405d..4819f64 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -221,7 +221,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, else: # range_min == range_max. has the same effect as pen_override knee = (range_min, None) - print_info(str(knee[0]) + " has been selected as kneepoint.") + print_info(str(knee[0]) + " has been selected as penalty.") if knee[0] is not None: return knee @@ -375,11 +375,15 @@ def print_error(str_to_prt): print("[ERROR]" + str_prt, file=sys.stderr) -def norm_signal(signal): +def norm_signal(signal, scaler=50): # TODO: maybe refine normalisation of signal + max_val = max(signal) normed_signal = np.zeros(shape=len(signal)) for i, signal_i in enumerate(signal): - normed_signal[i] = signal_i / 1000 + normed_signal[i] = signal_i / max_val + normed_signal[i] = normed_signal[i] * scaler + # plt.plot(normed_signal) + # plt.show() return normed_signal @@ -559,6 +563,7 @@ if __name__ == '__main__': by_name_file = None param_names_file = None from_cache = False + not_accurate = False if opt_cache_loc is not None: flag = False by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") @@ -701,6 +706,10 @@ if __name__ == '__main__': print_info("Discarding measurement No. 
" + str(num_measurement) + " because it did not recognize the number of " "raw_states correctly.") + # l_signal = measurements_by_config['offline'][num_measurement]['uW'] + # l_bkpts = [s[1] for s in raw_states] + # fig, ax = rpt.display(np.array(l_signal), l_bkpts) + # plt.show() # for i, x in enumerate(states_duration_list): # states_duration_list[i] = x / num_used_measurements # for i, x in enumerate(states_consumption_list): @@ -718,7 +727,7 @@ if __name__ == '__main__': + " Others did not recognize number of states correctly.") num_used_measurements = i # TODO: DEBUG Kram - sys.exit(0) + #sys.exit(0) else: print_info("Used all available measurements.") @@ -730,7 +739,25 @@ if __name__ == '__main__': # break # combine all state durations and consumptions to parametrized model - + if len(state_durations_by_config) == 0: + print("No refinement necessary for this state. The macromodel is usable.") + sys.exit() + if len(state_durations_by_config) / len(configurations) > 1 / 2 \ + and len(state_durations_by_config) != len(configurations): + print_warning( + "Some measurements(>50%) need to be refined, however that is not true for" + " all measurements. This hints a correlation between the structure of" + " the underlying automaton and parameters. Only the ones which need to" + " be refined will be refined. THE RESULT WILL NOT ACCURATELY DEPICT " + " THE REAL WORLD.") + not_accurate = True + if len(state_durations_by_config) / len(configurations) < 1 / 2: + print_warning( + "Some measurements(<50%) need to be refined, however that is not true for" + " all measurements. This hints a correlation between the structure of" + " the underlying automaton and parameters. Or a poor quality of measurements." + " No Refinement will be done.") + sys.exit(-1) # this is only necessary because at this state only linear automatons can be modeled. num_states_array = [int()] * len(state_consumptions_by_config) for i, (_, states_consumption_list) in enumerate(state_consumptions_by_config): @@ -748,7 +775,9 @@ if __name__ == '__main__': "Config No." + str(num_config) + " not usable yet due to different " + "number of states. This hints a correlation between parameters and " + "the structure of the resulting automaton. This will be possibly" - + " supported in a future version of this tool.") + + " supported in a future version of this tool. HOWEVER AT THE MOMENT" + " THIS WILL LEAD TO INACCURATE RESULTS!") + not_accurate = True usable_configs = usable_configs - 1 else: param_list.extend(configurations[num_config]['offline_aggregates']['param']) @@ -759,18 +788,28 @@ if __name__ == '__main__': else: print_info("Using only " + str(usable_configs) + " Configs.") by_name = {} + usable_configs_2 = len(state_consumptions_by_config) for i in range(num_raw_states): consumptions_for_state = [] durations_for_state = [] for j, (_, states_consumption_list) in enumerate(state_consumptions_by_config): - consumptions_for_state.extend(states_consumption_list[i]) - durations_for_state.extend(state_durations_by_config[j][1][i]) + if len(states_consumption_list) == num_raw_states: + consumptions_for_state.extend(states_consumption_list[i]) + durations_for_state.extend(state_durations_by_config[j][1][i]) + else: + not_accurate = True + usable_configs_2 = usable_configs_2 - 1 + if usable_configs_2 != usable_configs: + print_error("an zwei unterschiedlichen Stellen wurden unterschiedlich viele " + "Messungen rausgeworfen. 
Bei Janis beschweren.") state_name = "state_" + str(i) state_dict = { "param": param_list, "power": consumptions_for_state, "duration": durations_for_state, - "attributes": ["power", "duration"] + "attributes": ["power", "duration"], + # Da kein richtiger Automat generiert wird, gibt es auch keine Transitionen + "isa": "state" } by_name[state_name] = state_dict by_param = by_name_to_by_param(by_name) @@ -943,7 +982,9 @@ if __name__ == '__main__': "param": param_list, "power": consumptions_for_state, "duration": durations_for_state, - "attributes": ["power", "duration"] + "attributes": ["power", "duration"], + # Da kein richtiger Automat generiert wird, gibt es auch keine Transitionen + "isa": "state" } new_by_name[state_name] = new_state_dict new_by_param = by_name_to_by_param(new_by_name) @@ -993,7 +1034,13 @@ if __name__ == '__main__': model_function = model_function.replace(replace_string, str(arg)) print("Duration-Function for state " + state_name + ": " + model_function) - model = PTAModel(by_name, param_names, dict()) + model = PTAModel(new_by_name, param_names, dict()) + model_json = model.to_json() + print(model_json) + if not_accurate: + print_warning( + "THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" + " WHY.") # TODO: removed clustering (temporarily), since it provided too much dificultys -- cgit v1.2.3 From 42f0d36796f6535e484426a1ffa221bca4ea593a Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 14 Aug 2020 14:50:39 +0200 Subject: bin/ProffOfConcept: Kleine Fehlerkorrekturen. --- bin/Proof_Of_Concept_PELT.py | 52 ++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 4819f64..ac32d88 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -4,6 +4,7 @@ import time import sys import getopt import re +import pprint from multiprocessing import Pool, Manager, cpu_count from kneed import KneeLocator from sklearn.cluster import AgglomerativeClustering @@ -21,7 +22,8 @@ from dfatool.utils import by_name_to_by_param # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display # py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 -# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 --cache_dicts --cache_loc="..\data\TX_cache" +# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 --cache_dicts --cache_loc="..\data\TX2_cache" +from dfatool.validation import CrossValidator def plot_data_from_json(filename, trace_num, x_axis, y_axis): @@ -98,7 +100,7 @@ def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, - pen_modifier=None): + pen_modifier=None, show_plots=False): # default params in Function if model is None: model = "l1" @@ -138,7 +140,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, print_info("starting kneepoint calculation.") # init Pool with num_proesses - with Pool(num_processes) as p: + with Pool(min(num_processes, len(args))) as p: # collect results from pool result = p.starmap_async(get_bkps, args) # monitor loop @@ -199,18 +201,24 @@ def calculate_penalty_value(signal, model="l1", jump=5, 
min_dist=2, range_min=0, if i == len(fitted_bkps_val[knee[0]:]) - 1: # end sequence with last value end_index = i + # # since it is not guaranteed that this is the end of the plateau, assume the mid + # # of the plateau was hit. + # size = end_index - start_index + # end_index = end_index + size + # However this is not the clean solution. Better if search interval is widened if end_index - start_index > longest_end - longest_start: # last found sequence is the longest found yet longest_start = start_index longest_end = end_index start_index = i prev_val = num_bkpts - # plt.xlabel('Penalty') - # plt.ylabel('Number of Changepoints') - # plt.plot(pen_val, fitted_bkps_val) - # plt.vlines(longest_start + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') - # plt.vlines(longest_end + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') - # plt.show() + if show_plots: + plt.xlabel('Penalty') + plt.ylabel('Number of Changepoints') + plt.plot(pen_val, fitted_bkps_val) + plt.vlines(longest_start + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') + plt.vlines(longest_end + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') + plt.show() # choosing pen from plateau mid_of_plat = longest_start + (longest_end - longest_start) // 2 knee = (mid_of_plat + knee[0], fitted_bkps_val[mid_of_plat + knee[0]]) @@ -331,7 +339,7 @@ def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): def calc_raw_states(arg_list, num_processes=8): m = Manager() - with Pool(processes=num_processes) as p: + with Pool(processes=min(num_processes, len(arg_list))) as p: # collect results from pool result = p.starmap(calc_raw_states_func, arg_list) return result @@ -375,7 +383,7 @@ def print_error(str_to_prt): print("[ERROR]" + str_prt, file=sys.stderr) -def norm_signal(signal, scaler=50): +def norm_signal(signal, scaler=25): # TODO: maybe refine normalisation of signal max_val = max(signal) normed_signal = np.zeros(shape=len(signal)) @@ -656,6 +664,10 @@ if __name__ == '__main__': + "measurement No. " + str(num_measurement) + " is " + str( avg_std)) print_info("That is a reduction of " + str(change_avg_std)) + # l_signal = measurements_by_config['offline'][num_measurement]['uW'] + # l_bkpts = [s[1] for s in raw_states] + # fig, ax = rpt.display(np.array(l_signal), l_bkpts) + # plt.show() print_info("Finished raw_states calculation.") num_states_array = [int()] * len(raw_states_list) i = 0 @@ -787,6 +799,10 @@ if __name__ == '__main__': print_info("All configs usable.") else: print_info("Using only " + str(usable_configs) + " Configs.") + if num_raw_states == 1: + print_info("Upon further inspection it is clear that no refinement is necessary." + " The macromodel is usable.") + sys.exit(-1) by_name = {} usable_configs_2 = len(state_consumptions_by_config) for i in range(num_raw_states): @@ -1034,9 +1050,17 @@ if __name__ == '__main__': model_function = model_function.replace(replace_string, str(arg)) print("Duration-Function for state " + state_name + ": " + model_function) - model = PTAModel(new_by_name, param_names, dict()) - model_json = model.to_json() - print(model_json) + # model = PTAModel(new_by_name, param_names, dict()) + # model_json = model.to_json() + # param_model, _ = model.get_fitted() + # param_quality = model.assess(param_model) + # pprint.pprint(param_quality) + # # model = PTAModel(by_name, ...) + # # validator = CrossValidator(PTAModel, by_name, ...) 
+ # # param_quality = validator.kfold(lambda m: m.get_fitted()[0], 10) + # validator = CrossValidator(PTAModel, new_by_name, param_names, dict()) + # param_quality = validator.kfold(lambda m: m.get_fitted()[0], 10) + # pprint.pprint(param_quality) if not_accurate: print_warning( "THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" -- cgit v1.2.3 From 98de5d25ce583b285965e6fd8c79ab74d3bb6db3 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 14 Aug 2020 15:19:13 +0200 Subject: bin/ProofOfConceptPELT: added resultexport to filesystem. --- bin/Proof_Of_Concept_PELT.py | 64 ++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index ac32d88..cba7009 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -547,7 +547,7 @@ if __name__ == '__main__': except getopt.GetoptError as err: print(err, file=sys.stderr) sys.exit(-1) - + filepath = os.path.dirname(opt_filename) # OPENING DATA if ".json" in opt_filename: # open file with trace data from json @@ -1034,37 +1034,37 @@ if __name__ == '__main__': print_warning("Fitting(duration) for state " + state_name + " was not succesful!") new_fit_res_pow_dict[state_name] = combined_fit_power new_fit_res_dur_dict[state_name] = combined_fit_duration - for state_name in new_by_name.keys(): - model_function = str(new_fit_res_pow_dict[state_name].model_function) - model_args = new_fit_res_pow_dict[state_name].model_args - for num_arg, arg in enumerate(model_args): - replace_string = "regression_arg(" + str(num_arg) + ")" - model_function = model_function.replace(replace_string, str(arg)) - print("Power-Function for state " + state_name + ": " - + model_function) - for state_name in new_by_name.keys(): - model_function = str(new_fit_res_dur_dict[state_name].model_function) - model_args = new_fit_res_dur_dict[state_name].model_args - for num_arg, arg in enumerate(model_args): - replace_string = "regression_arg(" + str(num_arg) + ")" - model_function = model_function.replace(replace_string, str(arg)) - print("Duration-Function for state " + state_name + ": " - + model_function) - # model = PTAModel(new_by_name, param_names, dict()) - # model_json = model.to_json() - # param_model, _ = model.get_fitted() - # param_quality = model.assess(param_model) - # pprint.pprint(param_quality) - # # model = PTAModel(by_name, ...) - # # validator = CrossValidator(PTAModel, by_name, ...) - # # param_quality = validator.kfold(lambda m: m.get_fitted()[0], 10) - # validator = CrossValidator(PTAModel, new_by_name, param_names, dict()) - # param_quality = validator.kfold(lambda m: m.get_fitted()[0], 10) - # pprint.pprint(param_quality) - if not_accurate: - print_warning( - "THIS RESULT IS NOT ACCURATE. 
SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" - " WHY.") + result_loc = os.path.join(filepath, "result.txt") + with open(result_loc, "w") as f: + f.write("Resulting Sequence: " + str(resulting_sequence)) + f.write("\n\n") + for state_name in new_by_name.keys(): + model_function = str(new_fit_res_pow_dict[state_name].model_function) + model_args = new_fit_res_pow_dict[state_name].model_args + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print("Power-Function for state " + state_name + ": " + + model_function) + f.write("Power-Function for state " + state_name + ": " + + model_function + "\n") + f.write("\n\n") + for state_name in new_by_name.keys(): + model_function = str(new_fit_res_dur_dict[state_name].model_function) + model_args = new_fit_res_dur_dict[state_name].model_args + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print("Duration-Function for state " + state_name + ": " + + model_function) + f.write("Duration-Function for state " + state_name + ": " + + model_function + "\n") + if not_accurate: + print_warning( + "THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" + " WHY.") + f.write("THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" + " WHY.") # TODO: removed clustering (temporarily), since it provided too much dificultys -- cgit v1.2.3 From 91a42d937a0a5e50d5ac2e6369d26b23146f15e2 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 14 Aug 2020 15:52:58 +0200 Subject: bin/ProofOfConceptPELT: better nameing for result file --- bin/Proof_Of_Concept_PELT.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index cba7009..605ed7e 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -600,7 +600,7 @@ if __name__ == '__main__': "\nThe script will not run to the end properly." 
"\nNo final parametrization will be done.") from_cache = True - + big_state_name = configurations[0]['name'] if None in (by_param_file, by_name_file, param_names_file): state_durations_by_config = [] state_consumptions_by_config = [] @@ -1034,7 +1034,7 @@ if __name__ == '__main__': print_warning("Fitting(duration) for state " + state_name + " was not succesful!") new_fit_res_pow_dict[state_name] = combined_fit_power new_fit_res_dur_dict[state_name] = combined_fit_duration - result_loc = os.path.join(filepath, "result.txt") + result_loc = os.path.join(filepath, "result" + big_state_name + ".txt") with open(result_loc, "w") as f: f.write("Resulting Sequence: " + str(resulting_sequence)) f.write("\n\n") -- cgit v1.2.3 From e1c6e734b2d2725a0e29af6795c18a2575fe3d5d Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Tue, 22 Sep 2020 23:10:26 +0200 Subject: bin/plot_generator --- bin/plot_generator.py | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 bin/plot_generator.py diff --git a/bin/plot_generator.py b/bin/plot_generator.py new file mode 100644 index 0000000..458271d --- /dev/null +++ b/bin/plot_generator.py @@ -0,0 +1,123 @@ +import getopt +import sys +import re +import os +import numpy as np +import pprint +import json +import matplotlib.pyplot as plt + +if __name__ == '__main__': + # OPTION RECOGNITION + opt = dict() + + optspec = ( + "bench_filename= " + "result_filename= " + ) + opt_bench_filename = None + opt_result_filename = None + try: + raw_opts, args = getopt.getopt(sys.argv[1:], "", optspec.split(" ")) + + for option, parameter in raw_opts: + optname = re.sub(r"^--", "", option) + opt[optname] = parameter + except getopt.GetoptError as err: + print(err, file=sys.stderr) + sys.exit(-1) + + if "bench_filename" in opt: + opt_bench_filename = opt['bench_filename'] + else: + + sys.exit(-1) + if "result_filename" in opt: + opt_result_filename = opt['result_filename'] + else: + print("wth") + sys.exit(-1) + + with open(opt_bench_filename, 'r') as f: + configurations = json.load(f) + with open(opt_result_filename, 'r') as f: + sequence_line = f.readline() + begin_sequence = sequence_line.rfind("Resulting Sequence: ") + 20 + + if begin_sequence < 20: + print("nicht gefunden!") + sys.exit(-1) + sequence_substr = sequence_line[begin_sequence:] + resulting_sequence = eval(sequence_substr) + new_line = f.readline() + while new_line == "\n": + new_line = f.readline() + function_line = new_line + pow_function_dict = dict() + while function_line != "\n": + state_name_pos = function_line.find("Power-Function for state ") + 25 + state_name_end = function_line.find(":") + state_name = function_line[state_name_pos:state_name_end] + function_string = function_line[state_name_end+1:-1] + pow_function_dict[state_name] = function_string + function_line = f.readline() + new_line = "\n" + while new_line == "\n": + new_line = f.readline() + function_line = new_line + dur_function_dict = dict() + while function_line != "\n" and function_line != "" and "THIS RESULT IS NOT ACCURATE." 
not in function_line: + state_name_pos = function_line.find("Duration-Function for state ") + 28 + state_name_end = function_line.find(":") + state_name = function_line[state_name_pos:state_name_end] + function_string = function_line[state_name_end+1:-1] + dur_function_dict[state_name] = function_string + function_line = f.readline() + + param_names = configurations[0]['offline_aggregates']['paramkeys'][0] + + for num_fig in range(0, min(4, len(configurations))): + rand_config_no = np.random.randint(0, len(configurations), 1)[0] + rand_conf = configurations[rand_config_no] + rand_signal = np.array(rand_conf['offline'][0]['uW']) + rand_param = rand_conf['offline_aggregates']['param'][0] + rand_max_pow = max(rand_signal) + # pprint.pprint(rand_param) + pretty_rand_param = pprint.pformat(rand_param) + print(str(param_names) + "(" + str(rand_config_no) + ")" + "\n" + pretty_rand_param) + time = 0 + next_time = 0 + rand_stepper = 0 + pow = 0 + resulting_coords = list() + while rand_stepper < len(resulting_sequence): + curr_state = resulting_sequence[rand_stepper] + curr_state_name = "State_" + str(curr_state) + curr_pow_func = pow_function_dict[curr_state_name] + curr_dur_func = dur_function_dict[curr_state_name] + for num_param, name in enumerate(param_names): + replace_string = "parameter(" + name + ")" + curr_pow_func = curr_pow_func.replace(replace_string, str(rand_param[num_param])) + curr_dur_func = curr_dur_func.replace(replace_string, str(rand_param[num_param])) + pow = eval(curr_pow_func) + dur = eval(curr_dur_func) + next_time = time + dur + start_coord = (time, pow) + end_coord = (next_time, pow) + resulting_coords.append(start_coord) + resulting_coords.append(end_coord) + rand_stepper = rand_stepper + 1 + time = next_time + + with open("res_conf_" + str(num_fig) + "_signal.txt", 'w') as f: + f.write("x,y\n") + for x, y in enumerate(rand_signal): + f.write(str(x) + "," + str(y) + "\n") + with open("res_conf_" + str(num_fig) + "_fit.txt", 'w') as f: + f.write("x,y\n") + for x, y in resulting_coords: + f.write(str(x) + "," + str(y) + "\n") + plt.plot(rand_signal) + plt.plot([x for x, y in resulting_coords], [y for x, y in resulting_coords]) + plt.savefig("res_conf_" + str(num_fig) + "_pic.pdf", format='pdf', dpi=300) + plt.clf() -- cgit v1.2.3 From 522d8280cf95f43ca6d5904ae5d79a9a9c502af3 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Tue, 22 Sep 2020 23:45:43 +0200 Subject: bin/Proof_Of_Conecpt_PELT.py: Schöner kommentiert. Eigentlich für die Abgabe bereit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 213 +++++++++++++++++++------------------------ 1 file changed, 93 insertions(+), 120 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 605ed7e..688c5a7 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -18,7 +18,6 @@ from dfatool import parameters from dfatool.model import ParallelParamFit, PTAModel from dfatool.utils import by_name_to_by_param - # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display # py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 @@ -26,6 +25,7 @@ from dfatool.utils import by_name_to_by_param from dfatool.validation import CrossValidator +# helper functions. 
Not used def plot_data_from_json(filename, trace_num, x_axis, y_axis): with open(filename, 'r') as file: tx_data = json.load(file) @@ -60,18 +60,26 @@ def plot_data_vs_data_vs_means(signal1, signal2, x_axis, y_axis): plt.show() +# returns the found changepoints by algo for the specific penalty pen. +# algo should be the return value of Pelt(...).fit(signal) +# Also puts a token in container q to let the progressmeter know the changepoints for penalty pen +# have been calculated. +# used for parallel calculation of changepoints vs penalty def get_bkps(algo, pen, q): res = pen, len(algo.predict(pen=pen)) q.put(pen) return res +# Wrapper for kneedle def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing'): kneedle = KneeLocator(data_x, data_y, S=S, curve=curve, direction=direction) kneepoint = (kneedle.knee, kneedle.knee_y) return kneepoint +# returns the changepoints found on signal with penalty penalty. +# model, jump and min_dist are directly passed to PELT def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): # default params in Function if model is None: @@ -98,6 +106,11 @@ def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): sys.exit(-1) +# calculates and returns the necessary penalty for signal. Parallel execution with num_processes many processes +# jump, min_dist are passed directly to PELT. S is directly passed to kneedle. +# pen_modifier is used as a factor on the resulting penalty. +# the interval [range_min, range_max] is used for searching. +# refresh_delay and refresh_thresh are used to configure the progress "bar". def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, pen_modifier=None, show_plots=False): @@ -136,6 +149,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, q = m.Queue() for i in range(range_min, range_max + 1): + # same calculation for all except other penalty args.append((algo, i, q)) print_info("starting kneepoint calculation.") @@ -184,7 +198,9 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, # peaks, peak_plateaus = find_peaks(- np.array(fitted_bkps_val), plateau_size=1) # Since the data is monotonously decreasing only one plateau can be found. - # assuming the plateau is constant + # assuming the plateau is constant, i.e. no noise. OK to assume this here, since num_bkpts + # is monotonously decreasing. If the number of bkpts decreases inside a considered + # plateau, it means that the stable configuration is not yet met. -> Search further start_index = -1 end_index = -1 longest_start = -1 @@ -206,6 +222,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, # size = end_index - start_index # end_index = end_index + size # However this is not the clean solution. Better if search interval is widened + # with range_min and range_max if end_index - start_index > longest_end - longest_start: # last found sequence is the longest found yet longest_start = start_index @@ -238,78 +255,20 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, sys.exit(-1) -# very short benchmark yielded approx. 
1/3 of speed compared to solution with sorting -# def needs_refinement_no_sort(signal, mean, thresh): -# # linear search for the top 10%/ bottom 10% -# # should be sufficient -# length_of_signal = len(signal) -# percentile_size = int() -# percentile_size = length_of_signal // 100 -# upper_percentile = [None] * percentile_size -# lower_percentile = [None] * percentile_size -# fill_index_upper = percentile_size - 1 -# fill_index_lower = percentile_size - 1 -# index_smallest_val = fill_index_upper -# index_largest_val = fill_index_lower -# -# for x in signal: -# if x > mean: -# # will be in upper percentile -# if fill_index_upper >= 0: -# upper_percentile[fill_index_upper] = x -# if x < upper_percentile[index_smallest_val]: -# index_smallest_val = fill_index_upper -# fill_index_upper = fill_index_upper - 1 -# continue -# -# if x > upper_percentile[index_smallest_val]: -# # replace smallest val. Find next smallest val -# upper_percentile[index_smallest_val] = x -# index_smallest_val = 0 -# i = 0 -# for y in upper_percentile: -# if upper_percentile[i] < upper_percentile[index_smallest_val]: -# index_smallest_val = i -# i = i + 1 -# -# else: -# if fill_index_lower >= 0: -# lower_percentile[fill_index_lower] = x -# if x > lower_percentile[index_largest_val]: -# index_largest_val = fill_index_upper -# fill_index_lower = fill_index_lower - 1 -# continue -# if x < lower_percentile[index_largest_val]: -# # replace smallest val. Find next smallest val -# lower_percentile[index_largest_val] = x -# index_largest_val = 0 -# i = 0 -# for y in lower_percentile: -# if lower_percentile[i] > lower_percentile[index_largest_val]: -# index_largest_val = i -# i = i + 1 -# -# # should have the percentiles -# lower_percentile_mean = np.mean(lower_percentile) -# upper_percentile_mean = np.mean(upper_percentile) -# dist = mean - lower_percentile_mean -# if dist > thresh: -# return True -# dist = upper_percentile_mean - mean -# if dist > thresh: -# return True -# return False - - -# raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model -# , opt_jump)) +# calculates the raw_states for measurement measurement. 
num_measurement is used to identify the +# return value +# penalty, model and jump are directly passed to pelt def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): + # extract signal signal = np.array(measurement['uW']) + # norm signal to remove dependency on absolute values normed_signal = norm_signal(signal) + # calculate the breakpoints bkpts = calc_pelt(normed_signal, penalty, model=model, jump=jump) calced_states = list() start_time = 0 end_time = 0 + # calc metrics for all states for bkpt in bkpts: # start_time of state is end_time of previous one # (Transitions are instantaneous) @@ -322,6 +281,7 @@ def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): calced_states.append(calced_state) num = 0 new_avg_std = 0 + # calc avg std for all states from this measurement for s in calced_states: # print_info("State " + str(num) + " starts at t=" + str(s[0]) # + " and ends at t=" + str(s[1]) @@ -329,7 +289,11 @@ def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): # + "uW with sigma=" + str(s[3])) num = num + 1 new_avg_std = new_avg_std + s[3] - new_avg_std = new_avg_std / len(calced_states) + # check case if no state has been found to avoid crashing + if len(calced_states) != 0: + new_avg_std = new_avg_std / len(calced_states) + else: + new_avg_std = 0 change_avg_std = measurement['uW_std'] - new_avg_std # print_info("The average standard deviation for the newly found states is " # + str(new_avg_std)) @@ -337,6 +301,7 @@ def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): return num_measurement, calced_states, new_avg_std, change_avg_std +# parallelize calc over all measurements def calc_raw_states(arg_list, num_processes=8): m = Manager() with Pool(processes=min(num_processes, len(arg_list))) as p: @@ -346,6 +311,7 @@ def calc_raw_states(arg_list, num_processes=8): # Very short benchmark yielded approx. 
3 times the speed of solution not using sort +# checks the percentiles if refinement is necessary def needs_refinement(signal, thresh): sorted_signal = sorted(signal) length_of_signal = len(signal) @@ -364,7 +330,8 @@ def needs_refinement(signal, thresh): return True return False - +# helper functions for user output +# TODO: maybe switch with python logging feature def print_info(str_to_prt): str_lst = str_to_prt.split(sep='\n') for str_prt in str_lst: @@ -383,18 +350,17 @@ def print_error(str_to_prt): print("[ERROR]" + str_prt, file=sys.stderr) +# norms the signal and apply scaler to all values as a factor def norm_signal(signal, scaler=25): - # TODO: maybe refine normalisation of signal max_val = max(signal) normed_signal = np.zeros(shape=len(signal)) for i, signal_i in enumerate(signal): normed_signal[i] = signal_i / max_val normed_signal[i] = normed_signal[i] * scaler - # plt.plot(normed_signal) - # plt.show() return normed_signal +# norms the values to prepare them for clustering def norm_values_to_cluster(values_to_cluster): new_vals = np.array(values_to_cluster) num_samples = len(values_to_cluster) @@ -409,6 +375,7 @@ def norm_values_to_cluster(values_to_cluster): return new_vals +# finds state_num using state name def get_state_num(state_name, distinct_states): for state_num, states in enumerate(distinct_states): if state_name in states: @@ -564,9 +531,9 @@ if __name__ == '__main__': # plt.show() # sys.exit() - # loop through all traces check if refinement is necessary # resulting_sequence_list = [] # search for param_names, by_param and by_name files + # cachingopts by_param_file = None by_name_file = None param_names_file = None @@ -597,34 +564,44 @@ if __name__ == '__main__': else: print_warning("THE OPTION \"cache_dicts\" IS FOR DEBUGGING PURPOSES ONLY! " "\nDO NOT USE FOR REGULAR APPLICATIONS!" - "\nThe script will not run to the end properly." - "\nNo final parametrization will be done.") + "\nThis will possibly not be maintained in further development.") from_cache = True big_state_name = configurations[0]['name'] if None in (by_param_file, by_name_file, param_names_file): state_durations_by_config = [] state_consumptions_by_config = [] + # loop through all traces check if refinement is necessary and if necessary refine it. 
for num_config, measurements_by_config in enumerate(configurations): # loop through all occurrences of the looked at state print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " + str(measurements_by_config['parameter']) + "(" + str( num_config + 1) + "/" + str(len(configurations)) + ")") - refine = False + num_needs_refine = 0 print_info("Checking if refinement is necessary...") for measurement in measurements_by_config['offline']: # loop through measurements of particular state # an check if state needs refinement signal = measurement['uW'] # mean = measurement['uW_mean'] - if needs_refinement(signal, opt_refinement_thresh) and not refine: - print_info("Refinement is necessary!") - refine = True - if not refine: + if needs_refinement(signal, opt_refinement_thresh): + num_needs_refine = num_needs_refine + 1 + if num_needs_refine == 0: + print_info( + "No refinement necessary for state '" + measurements_by_config['name'] + + "' with params: " + str(measurements_by_config['parameter'])) + elif num_needs_refine < len(measurements_by_config['offline']) / 2: print_info( "No refinement necessary for state '" + measurements_by_config['name'] + "' with params: " + str(measurements_by_config['parameter'])) + print_warning( + "However this decision was not unanimously. This could hint a poor" + "measurement quality.") else: + if num_needs_refine != len(measurements_by_config['parameter']): + print_warning( + "However this decision was not unanimously. This could hint a poor" + "measurement quality.") # assume that all measurements of the same param configuration are fundamentally # similar -> calculate penalty for first measurement, use it for all if opt_pen_override is None: @@ -651,18 +628,18 @@ if __name__ == '__main__': raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) # extracting result and putting it in correct order -> index of raw_states_list # entry still corresponds with index of measurement in measurements_by_states - # -> If measurements are discarded the correct ones are easily recognized + # -> If measurements are discarded the used ones are easily recognized for ret_val in raw_states_res: num_measurement = ret_val[0] raw_states = ret_val[1] avg_std = ret_val[2] change_avg_std = ret_val[3] - # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch + # FIXME: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch # int sein oder nicht? Es scheint auch vernünftig zu klappen... raw_states_list[num_measurement] = raw_states print_info("The average standard deviation for the newly found states in " - + "measurement No. " + str(num_measurement) + " is " + str( - avg_std)) + + "measurement No. " + str(num_measurement) + " is " + + str(avg_std)) print_info("That is a reduction of " + str(change_avg_std)) # l_signal = measurements_by_config['offline'][num_measurement]['uW'] # l_bkpts = [s[1] for s in raw_states] @@ -681,8 +658,9 @@ if __name__ == '__main__': # TODO: MAGIC NUMBER if num_states_dev > 1: print_warning("The number of states varies strongly across measurements." - " Consider choosing a larger value for S or using the " - "pen_modifier option.") + " Consider choosing a larger range for penalty detection." + " It is also possible, that the processed data is not accurate" + " enough to produce proper results.") time.sleep(5) # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? # Einfach Durchschnitt nehmen? 
@@ -691,6 +669,11 @@ if __name__ == '__main__': counts = np.bincount(num_states_array) num_raw_states = np.argmax(counts) print_info("Choose " + str(num_raw_states) + " as number of raw_states.") + if num_raw_states == 1: + print_info( + "Upon further inspection it is clear that no refinement is necessary." + " The macromodel is usable for this configuration.") + continue # iterate through all found breakpoints and determine start and end points as well # as power consumption num_measurements = len(raw_states_list) @@ -729,8 +712,8 @@ if __name__ == '__main__': if num_used_measurements != len(raw_states_list): if num_used_measurements / len(raw_states_list) <= 0.5: print_warning("Only used " + str(num_used_measurements) + "/" - + str( - len(raw_states_list)) + " Measurements for refinement. " + + str(len(raw_states_list)) + + " Measurements for refinement. " + "Others did not recognize number of states correctly." + "\nYou should verify the integrity of the measurements.") else: @@ -738,22 +721,16 @@ if __name__ == '__main__': + str(len(raw_states_list)) + " Measurements for refinement." + " Others did not recognize number of states correctly.") num_used_measurements = i - # TODO: DEBUG Kram - #sys.exit(0) else: print_info("Used all available measurements.") state_durations_by_config.append((num_config, states_duration_list)) state_consumptions_by_config.append((num_config, states_consumption_list)) - # # TODO: - # if num_config == 6: - # print("BRECHE AUS") - # break # combine all state durations and consumptions to parametrized model if len(state_durations_by_config) == 0: print("No refinement necessary for this state. The macromodel is usable.") - sys.exit() + sys.exit(1) if len(state_durations_by_config) / len(configurations) > 1 / 2 \ and len(state_durations_by_config) != len(configurations): print_warning( @@ -799,10 +776,7 @@ if __name__ == '__main__': print_info("All configs usable.") else: print_info("Using only " + str(usable_configs) + " Configs.") - if num_raw_states == 1: - print_info("Upon further inspection it is clear that no refinement is necessary." 
- " The macromodel is usable.") - sys.exit(-1) + # build by_name by_name = {} usable_configs_2 = len(state_consumptions_by_config) for i in range(num_raw_states): @@ -824,7 +798,7 @@ if __name__ == '__main__': "power": consumptions_for_state, "duration": durations_for_state, "attributes": ["power", "duration"], - # Da kein richtiger Automat generiert wird, gibt es auch keine Transitionen + # Da kein "richtiger" Automat generiert wird, gibt es auch keine Transitionen "isa": "state" } by_name[state_name] = state_dict @@ -877,6 +851,7 @@ if __name__ == '__main__': paramfit.fit() fit_res_dur_dict = {} fit_res_pow_dict = {} + # fit functions and check if successful for state_name in by_name.keys(): fit_power = paramfit.get_result(state_name, "power") fit_duration = paramfit.get_result(state_name, "duration") @@ -893,6 +868,7 @@ if __name__ == '__main__': # only raw_states with the same number of function parameters can be similar num_param_pow_dict = {} num_param_dur_dict = {} + # print found substate_results for state_name in by_name.keys(): model_function = str(fit_res_pow_dict[state_name].model_function) model_args = fit_res_pow_dict[state_name].model_args @@ -911,6 +887,7 @@ if __name__ == '__main__': model_function = model_function.replace(replace_string, str(arg)) print_info("Duration-Function for state " + state_name + ": " + model_function) + # sort states in buckets for clustering similar_raw_state_buckets = {} for state_name in by_name.keys(): pow_model_function = str(fit_res_pow_dict[state_name].model_function) @@ -927,7 +904,9 @@ if __name__ == '__main__': + str(similar_raw_state_buckets[key_tuple])) similar_states = similar_raw_state_buckets[key_tuple] if len(similar_states) > 1: - # functions are identical -> num_params is identical + # only necessary to cluster if more than one raw_state has the same function + # configuration + # functions are identical -> num_params and used params are identical num_params = num_param_dur_dict[similar_states[0]] + num_param_pow_dict[ similar_states[0]] values_to_cluster = np.zeros((len(similar_states), num_params)) @@ -951,7 +930,7 @@ if __name__ == '__main__': cluster_labels = cluster.labels_ print_info("Cluster labels:\n" + str(cluster_labels)) if cluster.n_clusters_ > 1: - # more than one distinct state found + # more than one distinct state found -> seperation of raw_states necessary distinct_state_dict = {} for num_state, label in enumerate(cluster_labels): if label not in distinct_state_dict.keys(): @@ -960,6 +939,7 @@ if __name__ == '__main__': for distinct_state_key in distinct_state_dict.keys(): distinct_states.append(distinct_state_dict[distinct_state_key]) else: + # all raw_states make up this state distinct_states.append(similar_states) else: distinct_states.append(similar_states) @@ -968,6 +948,7 @@ if __name__ == '__main__': num_raw_states = len(by_name.keys()) resulting_sequence = [int] * num_raw_states for i in range(num_raw_states): + # apply the projection from raw_states to states state_name = "state_" + str(i) state_num = get_state_num(state_name, distinct_states) if state_num == -1: @@ -982,7 +963,7 @@ if __name__ == '__main__': # "\nTHE SCRIPT WILL NOW STOP PREMATURELY," # "SINCE DATA FOR FURTHER COMPUTATION IS MISSING!") # sys.exit(0) - + # parameterize all new states new_by_name = {} for num_state, distinct_state in enumerate(distinct_states): state_name = "State_" + str(num_state) @@ -1034,6 +1015,7 @@ if __name__ == '__main__': print_warning("Fitting(duration) for state " + state_name + " was not succesful!") 
new_fit_res_pow_dict[state_name] = combined_fit_power new_fit_res_dur_dict[state_name] = combined_fit_duration + # output results result_loc = os.path.join(filepath, "result" + big_state_name + ".txt") with open(result_loc, "w") as f: f.write("Resulting Sequence: " + str(resulting_sequence)) @@ -1066,9 +1048,11 @@ if __name__ == '__main__': f.write("THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" " WHY.") - - # TODO: removed clustering (temporarily), since it provided too much dificultys - # at the current state + # Removed clustering at this point, since it provided too much difficulties + # at the current state. Clustering is still used, but at another point of execution. + # Now parametrization is done first. raw_states are grouped by their using a dict + # where the key is [power_function, duration_dunction]. Then all raw_states from + # each bucket are clustered by their parameters # i = 0 # cluster_labels_list = [] # num_cluster_list = [] @@ -1249,21 +1233,10 @@ if __name__ == '__main__': print_info("Preprocessing file. Depending on its size, this could take a while.") preprocessed_data = raw_data.get_preprocessed_data() print_info("File fully preprocessed") - # TODO: Mal schauen, wie ich das mache. Erstmal nur mit json + # TODO: Mal schauen, wie ich das mache. Erstmal nur mit json. Ist erstmal raus. Wird nicht + # umgesetzt. print_error("Not implemented yet. Please generate .json files first with dfatool and use" " those.") else: print_error("Unknown dataformat") - sys.exit(-1) - - # print(tx_data[1]['parameter']) - # # parse json to array for PELT - # signal = np.array(tx_data[1]['offline'][0]['uW']) - # - # for i in range(0, len(signal)): - # signal[i] = signal[i]/1000 - # bkps = calc_pelt(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) - # fig, ax = rpt.display(signal, bkps) - # plt.xlabel('Time [us]') - # plt.ylabel('Power [mW]') - # plt.show() + sys.exit(-1) \ No newline at end of file -- cgit v1.2.3
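
The grouping-and-clustering strategy outlined in the final Proof_Of_Concept_PELT.py commit above (collect raw states whose fitted power and duration functions are structurally identical into buckets, then split each bucket by clustering the fitted parameter values) can be sketched roughly as follows. This is not the tool's actual code; the helper name, the column-wise normalisation and the distance threshold are illustrative assumptions only:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    def cluster_similar_raw_states(fitted_funcs, param_vectors, distance_threshold=1.0):
        # fitted_funcs:  state_name -> (power_function_str, duration_function_str)
        # param_vectors: state_name -> list of fitted regression arguments
        # Returns a list of groups; raw states in one group are treated as the same state.
        buckets = {}
        for name, func_pair in fitted_funcs.items():
            # bucket key is the pair of fitted function shapes
            buckets.setdefault(func_pair, []).append(name)

        distinct_states = []
        for names in buckets.values():
            if len(names) < 2:
                # a single raw state with this function shape cannot be merged further
                distinct_states.append(names)
                continue
            # normalise each parameter column so no single argument dominates the distance
            values = np.array([param_vectors[n] for n in names], dtype=float)
            col_max = np.abs(values).max(axis=0)
            col_max[col_max == 0] = 1.0
            values = values / col_max
            # agglomerative clustering with a distance threshold instead of a fixed
            # cluster count; each resulting label becomes one distinct state
            clustering = AgglomerativeClustering(
                n_clusters=None, distance_threshold=distance_threshold
            ).fit(values)
            for label in range(clustering.n_clusters_):
                distinct_states.append(
                    [n for n, lab in zip(names, clustering.labels_) if lab == label]
                )
        return distinct_states

    if __name__ == "__main__":
        funcs = {
            "state_0": ("a + b * x", "c"),
            "state_1": ("a + b * x", "c"),
            "state_2": ("a", "c"),
        }
        params = {
            "state_0": [10.0, 2.0, 5.0],
            "state_1": [10.2, 2.1, 5.0],
            "state_2": [7.0, 4.0],
        }
        print(cluster_similar_raw_states(funcs, params))
        # -> [['state_0', 'state_1'], ['state_2']]

The patch itself also relies on sklearn's AgglomerativeClustering over normalised parameter values; the exact normalisation and threshold used by the tool may differ from this sketch.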