1 files changed, 1368 insertions, 0 deletions
diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py
new file mode 100755
index 0000000..3206b68
--- /dev/null
+++ b/bin/Proof_Of_Concept_PELT.py
@@ -0,0 +1,1368 @@
+#!/usr/bin/env python3
+import json
+import os
+import time
+import sys
+import getopt
+import re
+import pprint
+from multiprocessing import Pool, Manager, cpu_count
+from kneed import KneeLocator
+from sklearn.cluster import AgglomerativeClustering
+import matplotlib.pyplot as plt
+import ruptures as rpt
+import numpy as np
+
+from dfatool.functions import analytic
+from dfatool.loader import RawData
+from dfatool import parameters
+from dfatool.model import ParallelParamFit, PTAModel
+from dfatool.utils import by_name_to_by_param
+
+# from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display
+
+# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100
+# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 --cache_dicts --cache_loc="..\data\TX2_cache"
+from dfatool.validation import CrossValidator
+
+
+# returns the found changepoints by algo for the specific penalty pen.
+# algo should be the return value of Pelt(...).fit(signal)
+# Also puts a token in container q to let the progressmeter know the changepoints for penalty pen
+# have been calculated.
+# used for parallel calculation of changepoints vs penalty
+def get_bkps(algo, pen, q):
+    res = pen, len(algo.predict(pen=pen))
+    q.put(pen)
+    return res
+
+
+# Wrapper for kneedle
+def find_knee_point(data_x, data_y, S=1.0, curve="convex", direction="decreasing"):
+    kneedle = KneeLocator(data_x, data_y, S=S, curve=curve, direction=direction)
+    kneepoint = (kneedle.knee, kneedle.knee_y)
+    return kneepoint
+
+
+# returns the changepoints found on signal with penalty penalty.
+# model, jump and min_dist are directly passed to PELT
+def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False):
+    # default params in Function
+    if model is None:
+        model = "l1"
+    if jump is None:
+        jump = 5
+    if min_dist is None:
+        min_dist = 2
+    if plotting is None:
+        plotting = False
+    # change point detection. best fit seemingly with l1. rbf prods. RuntimeErr for pen > 30
+    # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html
+    # model = "l1"   #"l1"  # "l2", "rbf"
+    algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal)
+
+    if penalty is not None:
+        bkps = algo.predict(pen=penalty)
+        if plotting:
+            fig, ax = rpt.display(signal, bkps)
+            plt.show()
+        return bkps
+
+    print_error("No Penalty specified.")
+    sys.exit(-1)
+
+
+# calculates and returns the necessary penalty for signal. Parallel execution with num_processes many processes
+# jump, min_dist are passed directly to PELT. S is directly passed to kneedle.
+# pen_modifier is used as a factor on the resulting penalty.
+# the interval [range_min, range_max] is used for searching.
+# refresh_delay and refresh_thresh are used to configure the progress "bar".
+def calculate_penalty_value(
+    signal,
+    model="l1",
+    jump=5,
+    min_dist=2,
+    range_min=0,
+    range_max=50,
+    num_processes=8,
+    refresh_delay=1,
+    refresh_thresh=5,
+    S=1.0,
+    pen_modifier=None,
+    show_plots=False,
+):
+    # default params in Function
+    if model is None:
+        model = "l1"
+    if jump is None:
+        jump = 5
+    if min_dist is None:
+        min_dist = 2
+    if range_min is None:
+        range_min = 0
+    if range_max is None:
+        range_max = 50
+    if num_processes is None:
+        num_processes = 8
+    if refresh_delay is None:
+        refresh_delay = 1
+    if refresh_thresh is None:
+        refresh_thresh = 5
+    if S is None:
+        S = 1.0
+    if pen_modifier is None:
+        pen_modifier = 1
+    # change point detection. best fit seemingly with l1. rbf prods. RuntimeErr for pen > 30
+    # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html
+    # model = "l1"   #"l1"  # "l2", "rbf"
+    algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal)
+
+    ### CALC BKPS WITH DIFF PENALTYS
+    if range_max != range_min:
+        # building args array for parallelizing
+        args = []
+        # for displaying progression
+        m = Manager()
+        q = m.Queue()
+
+        for i in range(range_min, range_max + 1):
+            # same calculation for all except other penalty
+            args.append((algo, i, q))
+
+        print_info("starting kneepoint calculation.")
+        # init Pool with num_proesses
+        with Pool(min(num_processes, len(args))) as p:
+            # collect results from pool
+            result = p.starmap_async(get_bkps, args)
+            # monitor loop
+            percentage = -100  # Force display of 0%
+            i = 0
+            while True:
+                if result.ready():
+                    break
+
+                size = q.qsize()
+                last_percentage = percentage
+                percentage = round(size / (range_max - range_min) * 100, 2)
+                if percentage >= last_percentage + 2 or i >= refresh_thresh:
+                    print_info("Current progress: " + str(percentage) + "%")
+                    i = 0
+                else:
+                    i += 1
+                time.sleep(refresh_delay)
+            res = result.get()
+        print_info("Finished kneepoint calculation.")
+        # DECIDE WHICH PENALTY VALUE TO CHOOSE ACCORDING TO ELBOW/KNEE APPROACH
+        # split x and y coords to pass to kneedle
+        pen_val = [x[0] for x in res]
+        fitted_bkps_val = [x[1] for x in res]
+        # # plot to look at res
+        knee = find_knee_point(pen_val, fitted_bkps_val, S=S)
+
+        # TODO: Find plateau on pen_val vs fitted_bkps_val
+        #   scipy.find_peaks() does not find plateaus if they extend through the end of the data.
+        #   to counter that, add one extremely large value to the right side of the data
+        #   after negating it is extremely small -> Almost certainly smaller than the
+        #   found plateau therefore the plateau does not extend through the border
+        #   -> scipy.find_peaks finds it. Choose value from within that plateau.
+        # fitted_bkps_val.append(100000000)
+        # TODO: Approaching over find_peaks might not work if the initial decrease step to the
+        #   "correct" number of changepoints and additional decrease steps e.g. underfitting
+        #   take place within the given penalty interval. find_peak only finds plateaus
+        #   of peaks. If the number of chpts decreases after the wanted plateau the condition
+        #   for local peaks is not satisfied anymore. Therefore this approach will only work
+        #   if the plateau extends over the right border of the penalty interval.
+        # peaks, peak_plateaus = find_peaks(- np.array(fitted_bkps_val), plateau_size=1)
+        # Since the data is monotonously decreasing only one plateau can be found.
+
+        # assuming the plateau is constant, i.e. no noise. OK to assume this here, since num_bkpts
+        # is monotonously decreasing. If the number of bkpts decreases inside a considered
+        # plateau, it means that the stable configuration is not yet met. -> Search further
+        start_index = -1
+        end_index = -1
+        longest_start = -1
+        longest_end = -1
+        prev_val = -1
+        for i, num_bkpts in enumerate(fitted_bkps_val[knee[0] :]):
+            if num_bkpts != prev_val:
+                end_index = i - 1
+                if end_index - start_index > longest_end - longest_start:
+                    # currently found sequence is the longest found yet
+                    longest_start = start_index
+                    longest_end = end_index
+                start_index = i
+            if i == len(fitted_bkps_val[knee[0] :]) - 1:
+                # end sequence with last value
+                end_index = i
+                # # since it is not guaranteed that this is the end of the plateau, assume the mid
+                # # of the plateau was hit.
+                # size = end_index - start_index
+                # end_index = end_index + size
+                # However this is not the clean solution. Better if search interval is widened
+                # with range_min and range_max
+                if end_index - start_index > longest_end - longest_start:
+                    # last found sequence is the longest found yet
+                    longest_start = start_index
+                    longest_end = end_index
+                start_index = i
+            prev_val = num_bkpts
+        if show_plots:
+            plt.xlabel("Penalty")
+            plt.ylabel("Number of Changepoints")
+            plt.plot(pen_val, fitted_bkps_val)
+            plt.vlines(
+                longest_start + knee[0], 0, max(fitted_bkps_val), linestyles="dashed"
+            )
+            plt.vlines(
+                longest_end + knee[0], 0, max(fitted_bkps_val), linestyles="dashed"
+            )
+            plt.show()
+        # choosing pen from plateau
+        mid_of_plat = longest_start + (longest_end - longest_start) // 2
+        knee = (mid_of_plat + knee[0], fitted_bkps_val[mid_of_plat + knee[0]])
+
+        # modify knee according to options. Defaults to 1 * knee
+        knee = (knee[0] * pen_modifier, knee[1])
+
+    else:
+        # range_min == range_max. has the same effect as pen_override
+        knee = (range_min, None)
+    print_info(str(knee[0]) + " has been selected as penalty.")
+    if knee[0] is not None:
+        return knee
+
+    print_error(
+        "With the current thresh-hold S="
+        + str(S)
+        + " it is not possible to select a penalty value."
+    )
+    sys.exit(-1)
+
+
+# calculates the raw_states for measurement measurement. num_measurement is used to identify the
+# return value
+# penalty, model and jump are directly passed to pelt
+def calc_raw_states_func(num_measurement, measurement, penalty, model, jump):
+    # extract signal
+    signal = np.array(measurement["uW"])
+    # norm signal to remove dependency on absolute values
+    normed_signal = norm_signal(signal)
+    # calculate the breakpoints
+    bkpts = calc_pelt(normed_signal, penalty, model=model, jump=jump)
+    calced_states = list()
+    start_time = 0
+    end_time = 0
+    # calc metrics for all states
+    for bkpt in bkpts:
+        # start_time of state is end_time of previous one
+        # (Transitions are instantaneous)
+        start_time = end_time
+        end_time = bkpt
+        power_vals = signal[start_time:end_time]
+        mean_power = np.mean(power_vals)
+        std_dev = np.std(power_vals)
+        calced_state = (start_time, end_time, mean_power, std_dev)
+        calced_states.append(calced_state)
+    num = 0
+    new_avg_std = 0
+    # calc avg std for all states from this measurement
+    for s in calced_states:
+        # print_info("State " + str(num) + " starts at t=" + str(s[0])
+        #            + " and ends at t=" + str(s[1])
+        #            + " while using " + str(s[2])
+        #            + "uW with  sigma=" + str(s[3]))
+        num = num + 1
+        new_avg_std = new_avg_std + s[3]
+    # check case if no state has been found to avoid crashing
+    if len(calced_states) != 0:
+        new_avg_std = new_avg_std / len(calced_states)
+    else:
+        new_avg_std = 0
+    change_avg_std = measurement["uW_std"] - new_avg_std
+    # print_info("The average standard deviation for the newly found states is "
+    #            + str(new_avg_std))
+    # print_info("That is a reduction of " + str(change_avg_std))
+    return num_measurement, calced_states, new_avg_std, change_avg_std
+
+
+# parallelize calc over all measurements
+def calc_raw_states(arg_list, num_processes=8):
+    m = Manager()
+    with Pool(processes=min(num_processes, len(arg_list))) as p:
+        # collect results from pool
+        result = p.starmap(calc_raw_states_func, arg_list)
+    return result
+
+
+# Very short benchmark yielded approx. 3 times the speed of solution not using sort
+# checks the percentiles if refinement is necessary
+def needs_refinement(signal, thresh):
+    sorted_signal = sorted(signal)
+    length_of_signal = len(signal)
+    percentile_size = int()
+    percentile_size = length_of_signal // 100
+    lower_percentile = sorted_signal[0:percentile_size]
+    upper_percentile = sorted_signal[
+        length_of_signal - percentile_size : length_of_signal
+    ]
+    lower_percentile_mean = np.mean(lower_percentile)
+    upper_percentile_mean = np.mean(upper_percentile)
+    median = np.median(sorted_signal)
+    dist = median - lower_percentile_mean
+    if dist > thresh:
+        return True
+    dist = upper_percentile_mean - median
+    if dist > thresh:
+        return True
+    return False
+
+
+# helper functions for user output
+# TODO: maybe switch with python logging feature
+def print_info(str_to_prt):
+    str_lst = str_to_prt.split(sep="\n")
+    for str_prt in str_lst:
+        print("[INFO]" + str_prt)
+
+
+def print_warning(str_to_prt):
+    str_lst = str_to_prt.split(sep="\n")
+    for str_prt in str_lst:
+        print("[WARNING]" + str_prt)
+
+
+def print_error(str_to_prt):
+    str_lst = str_to_prt.split(sep="\n")
+    for str_prt in str_lst:
+        print("[ERROR]" + str_prt, file=sys.stderr)
+
+
+# norms the signal and apply scaler to all values as a factor
+def norm_signal(signal, scaler=25):
+    max_val = max(signal)
+    normed_signal = np.zeros(shape=len(signal))
+    for i, signal_i in enumerate(signal):
+        normed_signal[i] = signal_i / max_val
+        normed_signal[i] = normed_signal[i] * scaler
+    return normed_signal
+
+
+# norms the values to prepare them for clustering
+def norm_values_to_cluster(values_to_cluster):
+    new_vals = np.array(values_to_cluster)
+    num_samples = len(values_to_cluster)
+    num_params = len(values_to_cluster[0])
+    for i in range(num_params):
+        param_vals = []
+        for sample in new_vals:
+            param_vals.append(sample[i])
+        max_val = np.max(np.abs(param_vals))
+        for num_sample, sample in enumerate(new_vals):
+            values_to_cluster[num_sample][i] = sample[i] / max_val
+    return new_vals
+
+
+# finds state_num using state name
+def get_state_num(state_name, distinct_states):
+    for state_num, states in enumerate(distinct_states):
+        if state_name in states:
+            return state_num
+    return -1
+
+
+if __name__ == "__main__":
+    # OPTION RECOGNITION
+    opt = dict()
+
+    optspec = (
+        "filename= "
+        "v "
+        "model= "
+        "jump= "
+        "min_dist= "
+        "range_min= "
+        "range_max= "
+        "num_processes= "
+        "refresh_delay= "
+        "refresh_thresh= "
+        "S= "
+        "pen_override= "
+        "pen_modifier= "
+        "plotting= "
+        "refinement_thresh= "
+        "cache_dicts "
+        "cache_loc= "
+    )
+    opt_filename = None
+    opt_verbose = False
+    opt_model = None
+    opt_jump = None
+    opt_min_dist = None
+    opt_range_min = None
+    opt_range_max = None
+    opt_num_processes = cpu_count()
+    opt_refresh_delay = None
+    opt_refresh_thresh = None
+    opt_S = None
+    opt_pen_override = None
+    opt_pen_modifier = None
+    opt_plotting = False
+    opt_refinement_thresh = None
+    opt_cache_loc = None
+    try:
+        raw_opts, args = getopt.getopt(sys.argv[1:], "", optspec.split(" "))
+
+        for option, parameter in raw_opts:
+            optname = re.sub(r"^--", "", option)
+            opt[optname] = parameter
+
+        if "filename" not in opt:
+            print_error("No file specified!")
+            sys.exit(-1)
+        else:
+            opt_filename = opt["filename"]
+        if "v" in opt:
+            opt_verbose = True
+            opt_plotting = True
+        if "model" in opt:
+            opt_model = opt["model"]
+        if "jump" in opt:
+            try:
+                opt_jump = int(opt["jump"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "min_dist" in opt:
+            try:
+                opt_min_dist = int(opt["min_dist"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "range_min" in opt:
+            try:
+                opt_range_min = int(opt["range_min"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "range_max" in opt:
+            try:
+                opt_range_max = int(opt["range_max"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "num_processes" in opt:
+            try:
+                opt_num_processes = int(opt["num_processes"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "refresh_delay" in opt:
+            try:
+                opt_refresh_delay = int(opt["refresh_delay"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "refresh_thresh" in opt:
+            try:
+                opt_refresh_thresh = int(opt["refresh_thresh"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "S" in opt:
+            try:
+                opt_S = float(opt["S"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "pen_override" in opt:
+            try:
+                opt_pen_override = int(opt["pen_override"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "pen_modifier" in opt:
+            try:
+                opt_pen_modifier = float(opt["pen_modifier"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "refinement_thresh" in opt:
+            try:
+                opt_refinement_thresh = int(opt["refinement_thresh"])
+            except ValueError as verr:
+                print(verr, file=sys.stderr)
+                sys.exit(-1)
+        if "cache_dicts" in opt:
+            if "cache_loc" in opt:
+                opt_cache_loc = opt["cache_loc"]
+            else:
+                print_error('If "cache_dicts" is set, "cache_loc" must be provided.')
+                sys.exit(-1)
+    except getopt.GetoptError as err:
+        print(err, file=sys.stderr)
+        sys.exit(-1)
+    filepath = os.path.dirname(opt_filename)
+    # OPENING DATA
+    if ".json" in opt_filename:
+        # open file with trace data from json
+        print_info(
+            "Will only refine the state which is present in "
+            + opt_filename
+            + " if necessary."
+        )
+        with open(opt_filename, "r") as f:
+            configurations = json.load(f)
+
+        # for i in range(0, 7):
+        #     signal = np.array(configurations[i]['offline'][0]['uW'])
+        #     plt.plot(signal)
+        # plt.xlabel('Time [us]')
+        # plt.ylabel('Power [mW]')
+        # plt.show()
+        # sys.exit()
+
+        # resulting_sequence_list = []
+        # search for param_names, by_param and by_name files
+        # cachingopts
+        by_param_file = None
+        by_name_file = None
+        param_names_file = None
+        from_cache = False
+        not_accurate = False
+        if opt_cache_loc is not None:
+            flag = False
+            by_name_loc = os.path.join(opt_cache_loc, "by_name.txt")
+            by_param_loc = os.path.join(opt_cache_loc, "by_param.txt")
+            param_names_loc = os.path.join(opt_cache_loc, "param_names.txt")
+            if os.path.isfile(by_name_loc) and os.path.getsize(by_name_loc) > 0:
+                by_name_file = open(by_name_loc, "r")
+            else:
+                print_error("In " + opt_cache_loc + " is no by_name.txt.")
+                flag = True
+            if os.path.isfile(by_param_loc) and os.path.getsize(by_param_loc) > 0:
+                by_param_file = open(by_param_loc, "r")
+            else:
+                print_error("In " + opt_cache_loc + " is no by_param.txt.")
+                flag = True
+            if os.path.isfile(param_names_loc) and os.path.getsize(param_names_loc) > 0:
+                param_names_file = open(param_names_loc, "r")
+            else:
+                print_error("In " + opt_cache_loc + " is no param_names.txt.")
+                flag = True
+            if flag:
+                print_info("The cache will be build.")
+            else:
+                print_warning(
+                    'THE OPTION "cache_dicts" IS FOR DEBUGGING PURPOSES ONLY! '
+                    "\nDO NOT USE FOR REGULAR APPLICATIONS!"
+                    "\nThis will possibly not be maintained in further development."
+                )
+                from_cache = True
+        big_state_name = configurations[0]["name"]
+        if None in (by_param_file, by_name_file, param_names_file):
+            state_durations_by_config = []
+            state_consumptions_by_config = []
+            # loop through all traces check if refinement is necessary and if necessary refine it.
+            for num_config, measurements_by_config in enumerate(configurations):
+                # loop through all occurrences of the looked at state
+                print_info(
+                    "Looking at state '"
+                    + measurements_by_config["name"]
+                    + "' with params: "
+                    + str(measurements_by_config["parameter"])
+                    + "("
+                    + str(num_config + 1)
+                    + "/"
+                    + str(len(configurations))
+                    + ")"
+                )
+                num_needs_refine = 0
+                print_info("Checking if refinement is necessary...")
+                for measurement in measurements_by_config["offline"]:
+                    # loop through measurements of particular state
+                    # an check if state needs refinement
+                    signal = measurement["uW"]
+                    # mean = measurement['uW_mean']
+                    if needs_refinement(signal, opt_refinement_thresh):
+                        num_needs_refine = num_needs_refine + 1
+                if num_needs_refine == 0:
+                    print_info(
+                        "No refinement necessary for state '"
+                        + measurements_by_config["name"]
+                        + "' with params: "
+                        + str(measurements_by_config["parameter"])
+                    )
+                elif num_needs_refine < len(measurements_by_config["offline"]) / 2:
+                    print_info(
+                        "No refinement necessary for state '"
+                        + measurements_by_config["name"]
+                        + "' with params: "
+                        + str(measurements_by_config["parameter"])
+                    )
+                    print_warning(
+                        "However this decision was not unanimously. This could hint a poor"
+                        "measurement quality."
+                    )
+                else:
+                    if num_needs_refine != len(measurements_by_config["parameter"]):
+                        print_warning(
+                            "However this decision was not unanimously. This could hint a poor"
+                            "measurement quality."
+                        )
+                    # assume that all measurements of the same param configuration are fundamentally
+                    # similar -> calculate penalty for first measurement, use it for all
+                    if opt_pen_override is None:
+                        signal = np.array(measurements_by_config["offline"][0]["uW"])
+                        normed_signal = norm_signal(signal)
+                        penalty = calculate_penalty_value(
+                            normed_signal,
+                            model=opt_model,
+                            range_min=opt_range_min,
+                            range_max=opt_range_max,
+                            num_processes=opt_num_processes,
+                            jump=opt_jump,
+                            S=opt_S,
+                            pen_modifier=opt_pen_modifier,
+                        )
+                        penalty = penalty[0]
+                    else:
+                        penalty = opt_pen_override
+                    # build arguments for parallel excecution
+                    print_info("Starting raw_states calculation.")
+                    ### YOU ARE HERE
+                    raw_states_calc_args = []
+                    for num_measurement, measurement in enumerate(
+                        measurements_by_config["offline"]
+                    ):
+                        raw_states_calc_args.append(
+                            (num_measurement, measurement, penalty, opt_model, opt_jump)
+                        )
+
+                    raw_states_list = [None] * len(measurements_by_config["offline"])
+                    raw_states_res = calc_raw_states(
+                        raw_states_calc_args, opt_num_processes
+                    )
+                    # extracting result and putting it in correct order -> index of raw_states_list
+                    # entry still corresponds with index of measurement in measurements_by_states
+                    # -> If measurements are discarded the used ones are easily recognized
+                    for ret_val in raw_states_res:
+                        num_measurement = ret_val[0]
+                        raw_states = ret_val[1]
+                        avg_std = ret_val[2]
+                        change_avg_std = ret_val[3]
+                        # FIXME: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch
+                        #   int sein oder nicht? Es scheint auch vernünftig zu klappen...
+                        raw_states_list[num_measurement] = raw_states
+                        print_info(
+                            "The average standard deviation for the newly found states in "
+                            + "measurement No. "
+                            + str(num_measurement)
+                            + " is "
+                            + str(avg_std)
+                        )
+                        print_info("That is a reduction of " + str(change_avg_std))
+                        # l_signal = measurements_by_config['offline'][num_measurement]['uW']
+                        # l_bkpts = [s[1] for s in raw_states]
+                        # fig, ax = rpt.display(np.array(l_signal), l_bkpts)
+                        # plt.show()
+                    print_info("Finished raw_states calculation.")
+                    num_states_array = [int()] * len(raw_states_list)
+                    i = 0
+                    for i, x in enumerate(raw_states_list):
+                        num_states_array[i] = len(x)
+                    avg_num_states = np.mean(num_states_array)
+                    num_states_dev = np.std(num_states_array)
+                    print_info(
+                        "On average "
+                        + str(avg_num_states)
+                        + " States have been found. The standard deviation"
+                        + " is "
+                        + str(num_states_dev)
+                    )
+                    # TODO: MAGIC NUMBER
+                    if num_states_dev > 1:
+                        print_warning(
+                            "The number of states varies strongly across measurements."
+                            " Consider choosing a larger range for penalty detection."
+                            " It is also possible, that the processed data is not accurate"
+                            " enough to produce proper results."
+                        )
+                        time.sleep(5)
+                    # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist?
+                    #   Einfach Durchschnitt nehmen?
+                    #   Preliminary decision: Further on only use the traces, which have the most
+                    #   frequent state count
+                    counts = np.bincount(num_states_array)
+                    num_raw_states = np.argmax(counts)
+                    print_info(
+                        "Choose " + str(num_raw_states) + " as number of raw_states."
+                    )
+                    if num_raw_states == 1:
+                        print_info(
+                            "Upon further inspection it is clear that no refinement is necessary."
+                            " The macromodel is usable for this configuration."
+                        )
+                        continue
+                    # iterate through all found breakpoints and determine start and end points as well
+                    # as power consumption
+                    num_measurements = len(raw_states_list)
+                    states_duration_list = [list()] * num_raw_states
+                    states_consumption_list = [list()] * num_raw_states
+                    for num_elem, _ in enumerate(states_duration_list):
+                        states_duration_list[num_elem] = [0] * num_measurements
+                        states_consumption_list[num_elem] = [0] * num_measurements
+                    num_used_measurements = 0
+                    for num_measurement, raw_states in enumerate(raw_states_list):
+                        if len(raw_states) == num_raw_states:
+                            num_used_measurements = num_used_measurements + 1
+                            for num_state, s in enumerate(raw_states):
+                                states_duration_list[num_state][num_measurement] = (
+                                    s[1] - s[0]
+                                )
+                                states_consumption_list[num_state][num_measurement] = s[
+                                    2
+                                ]
+                            # calced_state = (start_time, end_time, mean_power, std_dev)
+                            # for num_state, s in enumerate(raw_states):
+                            #     state_duration = s[1] - s[0]
+                            #     state_consumption = s[2]
+                            #     states_duration_list[num_state] = \
+                            #         states_duration_list[num_state] + state_duration
+                            #     states_consumption_list[num_state] = \
+                            #         states_consumption_list[num_state] + state_consumption
+                        else:
+                            print_info(
+                                "Discarding measurement No. "
+                                + str(num_measurement)
+                                + " because it did not recognize the number of "
+                                "raw_states correctly."
+                            )
+                        # l_signal = measurements_by_config['offline'][num_measurement]['uW']
+                        # l_bkpts = [s[1] for s in raw_states]
+                        # fig, ax = rpt.display(np.array(l_signal), l_bkpts)
+                        # plt.show()
+                    # for i, x in enumerate(states_duration_list):
+                    #     states_duration_list[i] = x / num_used_measurements
+                    # for i, x in enumerate(states_consumption_list):
+                    #     states_consumption_list[i] = x / num_used_measurements
+                    if num_used_measurements != len(raw_states_list):
+                        if num_used_measurements / len(raw_states_list) <= 0.5:
+                            print_warning(
+                                "Only used "
+                                + str(num_used_measurements)
+                                + "/"
+                                + str(len(raw_states_list))
+                                + " Measurements for refinement. "
+                                + "Others did not recognize number of states correctly."
+                                + "\nYou should verify the integrity of the measurements."
+                            )
+                        else:
+                            print_info(
+                                "Used "
+                                + str(num_used_measurements)
+                                + "/"
+                                + str(len(raw_states_list))
+                                + " Measurements for refinement."
+                                + " Others did not recognize number of states correctly."
+                            )
+                        num_used_measurements = i
+                    else:
+                        print_info("Used all available measurements.")
+
+                    state_durations_by_config.append((num_config, states_duration_list))
+                    state_consumptions_by_config.append(
+                        (num_config, states_consumption_list)
+                    )
+
+            # combine all state durations and consumptions to parametrized model
+            if len(state_durations_by_config) == 0:
+                print(
+                    "No refinement necessary for this state. The macromodel is usable."
+                )
+                sys.exit(1)
+            if len(state_durations_by_config) / len(configurations) > 1 / 2 and len(
+                state_durations_by_config
+            ) != len(configurations):
+                print_warning(
+                    "Some measurements(>50%) need to be refined, however that is not true for"
+                    " all measurements. This hints a correlation between the structure of"
+                    " the underlying automaton and parameters. Only the ones which need to"
+                    " be refined will be refined. THE RESULT WILL NOT ACCURATELY DEPICT "
+                    " THE REAL WORLD."
+                )
+                not_accurate = True
+            if len(state_durations_by_config) / len(configurations) < 1 / 2:
+                print_warning(
+                    "Some measurements(<50%) need to be refined, however that is not true for"
+                    " all measurements. This hints a correlation between the structure of"
+                    " the underlying automaton and parameters. Or a poor quality of measurements."
+                    " No Refinement will be done."
+                )
+                sys.exit(-1)
+            # this is only necessary because at this state only linear automatons can be modeled.
+            num_states_array = [int()] * len(state_consumptions_by_config)
+            for i, (_, states_consumption_list) in enumerate(
+                state_consumptions_by_config
+            ):
+                num_states_array[i] = len(states_consumption_list)
+            counts = np.bincount(num_states_array)
+            num_raw_states = np.argmax(counts)
+            usable_configs = len(state_consumptions_by_config)
+            # param_list identical for each raw_state
+            param_list = []
+            param_names = configurations[0]["offline_aggregates"]["paramkeys"][0]
+            print_info("param_names: " + str(param_names))
+            for num_config, states_consumption_list in state_consumptions_by_config:
+                if len(states_consumption_list) != num_raw_states:
+                    print_warning(
+                        "Config No."
+                        + str(num_config)
+                        + " not usable yet due to different "
+                        + "number of states. This hints a correlation between parameters and "
+                        + "the structure of the resulting automaton. This will be possibly"
+                        + " supported in a future version of this tool. HOWEVER AT THE MOMENT"
+                        " THIS WILL LEAD TO INACCURATE RESULTS!"
+                    )
+                    not_accurate = True
+                    usable_configs = usable_configs - 1
+                else:
+                    param_list.extend(
+                        configurations[num_config]["offline_aggregates"]["param"]
+                    )
+            print_info("param_list: " + str(param_list))
+
+            if usable_configs == len(state_consumptions_by_config):
+                print_info("All configs usable.")
+            else:
+                print_info("Using only " + str(usable_configs) + " Configs.")
+            # build by_name
+            by_name = {}
+            usable_configs_2 = len(state_consumptions_by_config)
+            for i in range(num_raw_states):
+                consumptions_for_state = []
+                durations_for_state = []
+                for j, (_, states_consumption_list) in enumerate(
+                    state_consumptions_by_config
+                ):
+                    if len(states_consumption_list) == num_raw_states:
+                        consumptions_for_state.extend(states_consumption_list[i])
+                        durations_for_state.extend(state_durations_by_config[j][1][i])
+                    else:
+                        not_accurate = True
+                        usable_configs_2 = usable_configs_2 - 1
+                if usable_configs_2 != usable_configs:
+                    print_error(
+                        "an zwei unterschiedlichen Stellen wurden unterschiedlich viele "
+                        "Messungen rausgeworfen. Bei Janis beschweren."
+                    )
+                state_name = "state_" + str(i)
+                state_dict = {
+                    "param": param_list,
+                    "power": consumptions_for_state,
+                    "duration": durations_for_state,
+                    "attributes": ["power", "duration"],
+                    # Da kein "richtiger" Automat generiert wird, gibt es auch keine Transitionen
+                    "isa": "state",
+                }
+                by_name[state_name] = state_dict
+            by_param = by_name_to_by_param(by_name)
+            if opt_cache_loc is not None:
+                by_name_loc = os.path.join(opt_cache_loc, "by_name.txt")
+                by_param_loc = os.path.join(opt_cache_loc, "by_param.txt")
+                param_names_loc = os.path.join(opt_cache_loc, "param_names.txt")
+                f = open(by_name_loc, "w")
+                f.write(str(by_name))
+                f.close()
+                f = open(by_param_loc, "w")
+                f.write(str(by_param))
+                f.close()
+                f = open(param_names_loc, "w")
+                f.write(str(param_names))
+                f.close()
+        else:
+            by_name_text = str(by_name_file.read())
+            by_name = eval(by_name_text)
+            by_param_text = str(by_param_file.read())
+            by_param = eval(by_param_text)
+            param_names_text = str(param_names_file.read())
+            param_names = eval(param_names_text)
+
+        # t = 0
+        # last_pow = 0
+        # for key in by_name.keys():
+        #     end_t = t + np.mean(by_name[key]["duration"])
+        #     power = np.mean(by_name[key]["power"])
+        #     plt.vlines(t, min(last_pow, power), max(last_pow, power))
+        #     plt.hlines(power, t, end_t)
+        #     t = end_t
+        #     last_pow = power
+        # plt.show()
+        stats = parameters.ParamStats(by_name, by_param, param_names, dict())
+        paramfit = ParallelParamFit(by_param)
+        for state_name in by_name.keys():
+            for num_param, param_name in enumerate(param_names):
+                if stats.depends_on_param(state_name, "power", param_name):
+                    paramfit.enqueue(state_name, "power", num_param, param_name)
+                if stats.depends_on_param(state_name, "duration", param_name):
+                    paramfit.enqueue(state_name, "duration", num_param, param_name)
+                print_info(
+                    "State "
+                    + state_name
+                    + "s power depends on param "
+                    + param_name
+                    + ":"
+                    + str(stats.depends_on_param(state_name, "power", param_name))
+                )
+                print_info(
+                    "State "
+                    + state_name
+                    + "s duration depends on param "
+                    + param_name
+                    + ":"
+                    + str(stats.depends_on_param(state_name, "duration", param_name))
+                )
+        paramfit.fit()
+        fit_res_dur_dict = {}
+        fit_res_pow_dict = {}
+        # fit functions and check if successful
+        for state_name in by_name.keys():
+            fit_power = paramfit.get_result(state_name, "power")
+            fit_duration = paramfit.get_result(state_name, "duration")
+            combined_fit_power = analytic.function_powerset(fit_power, param_names, 0)
+            combined_fit_duration = analytic.function_powerset(
+                fit_duration, param_names, 0
+            )
+            combined_fit_power.fit(by_param, state_name, "power")
+            if not combined_fit_power.fit_success:
+                print_warning(
+                    "Fitting(power) for state " + state_name + " was not succesful!"
+                )
+            combined_fit_duration.fit(by_param, state_name, "duration")
+            if not combined_fit_duration.fit_success:
+                print_warning(
+                    "Fitting(duration) for state " + state_name + " was not succesful!"
+                )
+            fit_res_pow_dict[state_name] = combined_fit_power
+            fit_res_dur_dict[state_name] = combined_fit_duration
+        # only raw_states with the same number of function parameters can be similar
+        num_param_pow_dict = {}
+        num_param_dur_dict = {}
+        # print found substate_results
+        for state_name in by_name.keys():
+            model_function = str(fit_res_pow_dict[state_name].model_function)
+            model_args = fit_res_pow_dict[state_name].model_args
+            num_param_pow_dict[state_name] = len(model_args)
+            for num_arg, arg in enumerate(model_args):
+                replace_string = "regression_arg(" + str(num_arg) + ")"
+                model_function = model_function.replace(replace_string, str(arg))
+            print_info("Power-Function for state " + state_name + ": " + model_function)
+        for state_name in by_name.keys():
+            model_function = str(fit_res_dur_dict[state_name].model_function)
+            model_args = fit_res_dur_dict[state_name].model_args
+            num_param_dur_dict[state_name] = len(model_args)
+            for num_arg, arg in enumerate(model_args):
+                replace_string = "regression_arg(" + str(num_arg) + ")"
+                model_function = model_function.replace(replace_string, str(arg))
+            print_info(
+                "Duration-Function for state " + state_name + ": " + model_function
+            )
+        # sort states in buckets for clustering
+        similar_raw_state_buckets = {}
+        for state_name in by_name.keys():
+            pow_model_function = str(fit_res_pow_dict[state_name].model_function)
+            dur_model_function = str(fit_res_dur_dict[state_name].model_function)
+            key_tuple = (pow_model_function, dur_model_function)
+            if key_tuple not in similar_raw_state_buckets:
+                similar_raw_state_buckets[key_tuple] = []
+            similar_raw_state_buckets[key_tuple].append(state_name)
+
+        # cluster for each Key-Tuple using the function parameters
+        distinct_states = []
+        for key_tuple in similar_raw_state_buckets.keys():
+            print_info(
+                "Key-Tuple "
+                + str(key_tuple)
+                + ": "
+                + str(similar_raw_state_buckets[key_tuple])
+            )
+            similar_states = similar_raw_state_buckets[key_tuple]
+            if len(similar_states) > 1:
+                # only necessary to cluster if more than one raw_state has the same function
+                # configuration
+                # functions are identical -> num_params and used params are identical
+                num_params = (
+                    num_param_dur_dict[similar_states[0]]
+                    + num_param_pow_dict[similar_states[0]]
+                )
+                values_to_cluster = np.zeros((len(similar_states), num_params))
+                for num_state, state_name in enumerate(similar_states):
+                    dur_params = fit_res_dur_dict[state_name].model_args
+                    pow_params = fit_res_pow_dict[state_name].model_args
+                    j = 0
+                    for param in pow_params:
+                        values_to_cluster[num_state][j] = param
+                        j = j + 1
+                    for param in dur_params:
+                        values_to_cluster[num_state][j] = param
+                        j = j + 1
+                normed_vals_to_cluster = norm_values_to_cluster(values_to_cluster)
+                cluster = AgglomerativeClustering(
+                    n_clusters=None,
+                    compute_full_tree=True,
+                    affinity="euclidean",
+                    linkage="ward",
+                    # TODO: Magic Number. Beim Evaluieren finetunen
+                    distance_threshold=1,
+                )
+                cluster.fit_predict(values_to_cluster)
+                cluster_labels = cluster.labels_
+                print_info("Cluster labels:\n" + str(cluster_labels))
+                if cluster.n_clusters_ > 1:
+                    # more than one distinct state found -> seperation of raw_states necessary
+                    distinct_state_dict = {}
+                    for num_state, label in enumerate(cluster_labels):
+                        if label not in distinct_state_dict.keys():
+                            distinct_state_dict[label] = []
+                        distinct_state_dict[label].append(similar_states[num_state])
+                    for distinct_state_key in distinct_state_dict.keys():
+                        distinct_states.append(distinct_state_dict[distinct_state_key])
+                else:
+                    # all raw_states make up this state
+                    distinct_states.append(similar_states)
+            else:
+                distinct_states.append(similar_states)
+        for num_state, distinct_state in enumerate(distinct_states):
+            print("State " + str(num_state) + ": " + str(distinct_state))
+        num_raw_states = len(by_name.keys())
+        resulting_sequence = [int] * num_raw_states
+        for i in range(num_raw_states):
+            # apply the projection from raw_states to states
+            state_name = "state_" + str(i)
+            state_num = get_state_num(state_name, distinct_states)
+            if state_num == -1:
+                print_error(
+                    "Critical Error when creating the resulting sequence. raw_state state_"
+                    + str(i)
+                    + " could not be mapped to a state."
+                )
+                sys.exit(-1)
+            resulting_sequence[i] = state_num
+        print("Resulting sequence is: " + str(resulting_sequence))
+        # if from_cache:
+        #     print_warning(
+        #         "YOU USED THE OPTION \"cache_dicts\". THIS IS FOR DEBUGGING PURPOSES ONLY!"
+        #         "\nTHE SCRIPT WILL NOW STOP PREMATURELY,"
+        #         "SINCE DATA FOR FURTHER COMPUTATION IS MISSING!")
+        #     sys.exit(0)
+        # parameterize all new states
+        new_by_name = {}
+        for num_state, distinct_state in enumerate(distinct_states):
+            state_name = "State_" + str(num_state)
+            consumptions_for_state = []
+            durations_for_state = []
+            param_list = []
+            for raw_state in distinct_state:
+                original_state_dict = by_name[raw_state]
+                param_list.extend(original_state_dict["param"])
+                consumptions_for_state.extend(original_state_dict["power"])
+                durations_for_state.extend(original_state_dict["duration"])
+            new_state_dict = {
+                "param": param_list,
+                "power": consumptions_for_state,
+                "duration": durations_for_state,
+                "attributes": ["power", "duration"],
+                # Da kein richtiger Automat generiert wird, gibt es auch keine Transitionen
+                "isa": "state",
+            }
+            new_by_name[state_name] = new_state_dict
+        new_by_param = by_name_to_by_param(new_by_name)
+        new_stats = parameters.ParamStats(
+            new_by_name, new_by_param, param_names, dict()
+        )
+        new_paramfit = ParallelParamFit(new_by_param)
+        for state_name in new_by_name.keys():
+            for num_param, param_name in enumerate(param_names):
+                if new_stats.depends_on_param(state_name, "power", param_name):
+                    new_paramfit.enqueue(state_name, "power", num_param, param_name)
+                if new_stats.depends_on_param(state_name, "duration", param_name):
+                    new_paramfit.enqueue(state_name, "duration", num_param, param_name)
+                print_info(
+                    "State "
+                    + state_name
+                    + "s power depends on param "
+                    + param_name
+                    + ":"
+                    + str(new_stats.depends_on_param(state_name, "power", param_name))
+                )
+                print_info(
+                    "State "
+                    + state_name
+                    + "s duration depends on param "
+                    + param_name
+                    + ":"
+                    + str(
+                        new_stats.depends_on_param(state_name, "duration", param_name)
+                    )
+                )
+        new_paramfit.fit()
+        new_fit_res_dur_dict = {}
+        new_fit_res_pow_dict = {}
+        for state_name in new_by_name.keys():
+            fit_power = new_paramfit.get_result(state_name, "power")
+            fit_duration = new_paramfit.get_result(state_name, "duration")
+            combined_fit_power = analytic.function_powerset(fit_power, param_names, 0)
+            combined_fit_duration = analytic.function_powerset(
+                fit_duration, param_names, 0
+            )
+            combined_fit_power.fit(new_by_param, state_name, "power")
+            if not combined_fit_power.fit_success:
+                print_warning(
+                    "Fitting(power) for state " + state_name + " was not succesful!"
+                )
+            combined_fit_duration.fit(new_by_param, state_name, "duration")
+            if not combined_fit_duration.fit_success:
+                print_warning(
+                    "Fitting(duration) for state " + state_name + " was not succesful!"
+                )
+            new_fit_res_pow_dict[state_name] = combined_fit_power
+            new_fit_res_dur_dict[state_name] = combined_fit_duration
+        # output results
+        result_loc = os.path.join(filepath, "result" + big_state_name + ".txt")
+        with open(result_loc, "w") as f:
+            f.write("Resulting Sequence: " + str(resulting_sequence))
+            f.write("\n\n")
+            for state_name in new_by_name.keys():
+                model_function = str(new_fit_res_pow_dict[state_name].model_function)
+                model_args = new_fit_res_pow_dict[state_name].model_args
+                for num_arg, arg in enumerate(model_args):
+                    replace_string = "regression_arg(" + str(num_arg) + ")"
+                    model_function = model_function.replace(replace_string, str(arg))
+                print("Power-Function for state " + state_name + ": " + model_function)
+                f.write(
+                    "Power-Function for state "
+                    + state_name
+                    + ": "
+                    + model_function
+                    + "\n"
+                )
+            f.write("\n\n")
+            for state_name in new_by_name.keys():
+                model_function = str(new_fit_res_dur_dict[state_name].model_function)
+                model_args = new_fit_res_dur_dict[state_name].model_args
+                for num_arg, arg in enumerate(model_args):
+                    replace_string = "regression_arg(" + str(num_arg) + ")"
+                    model_function = model_function.replace(replace_string, str(arg))
+                print(
+                    "Duration-Function for state " + state_name + ": " + model_function
+                )
+                f.write(
+                    "Duration-Function for state "
+                    + state_name
+                    + ": "
+                    + model_function
+                    + "\n"
+                )
+            if not_accurate:
+                print_warning(
+                    "THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING"
+                    " WHY."
+                )
+                f.write(
+                    "THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING"
+                    " WHY."
+                )
+
+    #         Removed clustering at this point, since it provided too much difficulties
+    #           at the current state. Clustering is still used, but at another point of execution.
+    #           Now parametrization is done first. raw_states are grouped by their using a dict
+    #           where the key is [power_function, duration_dunction]. Then all raw_states from
+    #           each bucket are clustered by their parameters
+    #         i = 0
+    #         cluster_labels_list = []
+    #         num_cluster_list = []
+    #         for num_trace, raw_states in enumerate(raw_states_list):
+    #             # iterate through raw states from measurements
+    #             if len(raw_states) == num_raw_states:
+    #                 # build array with power values to cluster these
+    #                 value_to_cluster = np.zeros((num_raw_states, 2))
+    #                 j = 0
+    #                 for s in raw_states:
+    #                     value_to_cluster[j][0] = s[2]
+    #                     value_to_cluster[j][1] = 0
+    #                     j = j + 1
+    #                 # linked = linkage(value_to_cluster, 'single')
+    #                 #
+    #                 # labelList = range(1, 11)
+    #                 #
+    #                 # plt.figure(figsize=(10, 7))
+    #                 # dendrogram(linked,
+    #                 #            orientation='top',
+    #                 #            distance_sort='descending',
+    #                 #            show_leaf_counts=True)
+    #                 # plt.show()
+    #                 # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER
+    #                 #   im distance_threshold
+    #                 cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True,
+    #                                                   affinity='euclidean',
+    #                                                   linkage='ward',
+    #                                                   distance_threshold=opt_refinement_thresh * 100)
+    #                 # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean',
+    #                 #                                   linkage='ward')
+    #                 cluster.fit_predict(value_to_cluster)
+    #                 # print_info("Cluster labels:\n" + str(cluster.labels_))
+    #                 # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow')
+    #                 # plt.show()
+    #                 cluster_labels_list.append((num_trace, cluster.labels_))
+    #                 num_cluster_list.append((num_trace, cluster.n_clusters_))
+    #                 i = i + 1
+    #             else:
+    #                 print_info("Discarding measurement No. " + str(num_trace) + " because it "
+    #                            + "did not recognize the number of raw_states correctly.")
+    #         num_used_measurements = len(raw_states_list)
+    #         if i != len(raw_states_list):
+    #             if i / len(raw_states_list) <= 0.5:
+    #                 print_warning("Only used " + str(i) + "/" + str(len(raw_states_list))
+    #                               + " Measurements for refinement. "
+    #                                 "Others did not recognize number of states correctly."
+    #                                 "\nYou should verify the integrity of the measurements.")
+    #             else:
+    #                 print_info("Used " + str(i) + "/" + str(len(raw_states_list))
+    #                            + " Measurements for refinement. "
+    #                              "Others did not recognize number of states correctly.")
+    #             num_used_measurements = i
+    #             # TODO: DEBUG Kram
+    #             sys.exit(0)
+    #         else:
+    #             print_info("Used all available measurements.")
+    #
+    #         num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list]))
+    #         avg_per_state_list = [None] * len(cluster_labels_list)
+    #         used_clusters = 0
+    #         for number, (num_trace, labels) in enumerate(cluster_labels_list):
+    #             if num_cluster_list[number][1] == num_states:
+    #                 avg_per_state = [0] * num_states
+    #                 count_per_state = [0] * num_states
+    #                 raw_states = raw_states_list[num_trace]
+    #                 for num_label, label in enumerate(labels):
+    #                     count_per_state[label] = count_per_state[label] + 1
+    #                     avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2]
+    #                 for i, _ in enumerate(avg_per_state):
+    #                     avg_per_state[i] = avg_per_state[i] / count_per_state[i]
+    #                 avg_per_state_list[number] = avg_per_state
+    #                 used_clusters = used_clusters + 1
+    #             else:
+    #                 # hopefully this does not happen regularly
+    #                 print_info("Discarding measurement " + str(number)
+    #                            + " because the clustering yielded not matching results.")
+    #                 num_used_measurements = num_used_measurements - 1
+    #         if num_used_measurements == 0:
+    #             print_error("Something went terribly wrong. Discarded all measurements.")
+    #             # continue
+    #             sys.exit(-1)
+    #         # flattend version for clustering:
+    #         values_to_cluster = np.zeros((num_states * used_clusters, 2))
+    #         index = 0
+    #         for avg_per_state in avg_per_state_list:
+    #             if avg_per_state is not None:
+    #                 for avg in avg_per_state:
+    #                     values_to_cluster[index][0] = avg
+    #                     values_to_cluster[index][1] = 0
+    #                     index = index + 1
+    #         # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1])
+    #         # plt.show()
+    #         cluster = AgglomerativeClustering(n_clusters=num_states)
+    #         cluster.fit_predict(values_to_cluster)
+    #         # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die
+    #         # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels.
+    #         # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting
+    #         # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist.
+    #         new_labels_list = []
+    #         new_labels = []
+    #         i = 0
+    #         for label in cluster.labels_:
+    #             new_labels.append(label)
+    #             i = i + 1
+    #             if i == num_states:
+    #                 new_labels_list.append(new_labels)
+    #                 new_labels = []
+    #                 i = 0
+    #         # only the selected measurements are present in new_labels.
+    #         # new_labels_index should not be incremented, if not selected_measurement is skipped
+    #         new_labels_index = 0
+    #         # cluster_labels_list contains all measurements -> if measurement is skipped
+    #         # still increment the index
+    #         index = 0
+    #         for elem in avg_per_state_list:
+    #             if elem is not None:
+    #                 for number, label in enumerate(cluster_labels_list[index][1]):
+    #                     cluster_labels_list[index][1][number] = \
+    #                         new_labels_list[new_labels_index][label]
+    #                 new_labels_index = new_labels_index + 1
+    #             else:
+    #                 # override not selected measurement labels to avoid choosing the wrong ones.
+    #                 for number, label in enumerate(cluster_labels_list[index][1]):
+    #                     cluster_labels_list[index][1][number] = -1
+    #             index = index + 1
+    #         resulting_sequence = [None] * num_raw_states
+    #         i = 0
+    #         confidence = 0
+    #         for x in resulting_sequence:
+    #             j = 0
+    #             test_list = []
+    #             for arr in [elem[1] for elem in cluster_labels_list]:
+    #                 if num_cluster_list[j][1] != num_states:
+    #                     j = j + 1
+    #                 else:
+    #                     if -1 in arr:
+    #                         print_error("Bei Janis beschweren! Fehler beim Umbenennen der"
+    #                                     " Zustände wahrscheinlich.")
+    #                         sys.exit(-1)
+    #                     test_list.append(arr[i])
+    #                     j = j + 1
+    #             bincount = np.bincount(test_list)
+    #             resulting_sequence[i] = np.argmax(bincount)
+    #             confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount)
+    #             i = i + 1
+    #         confidence = confidence / len(resulting_sequence)
+    #         print_info("Confidence of resulting sequence is " + str(confidence)
+    #                    + " while using " + str(num_used_measurements) + "/"
+    #                    + str(len(raw_states_list)) + " measurements.")
+    #         #print(resulting_sequence)
+    #         resulting_sequence_list.append((num_config, resulting_sequence))
+    # # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat
+    # #   erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die
+    # #   Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen
+    # #   auftreten.
+    # # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw.
+    # #   hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem:
+    # #   wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche
+    # #   Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme,
+    # #   2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines
+    # #   Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen...
+    # for num_config, sequence in resulting_sequence_list:
+    #     print_info("NO. config:" + str(num_config))
+    #     print_info(sequence)
+    #
+    #
+    #
+    #
+
+    elif ".tar" in opt_filename:
+        # open with dfatool
+        raw_data_args = list()
+        raw_data_args.append(opt_filename)
+        raw_data = RawData(raw_data_args, with_traces=True)
+        print_info(
+            "Preprocessing file. Depending on its size, this could take a while."
+        )
+        preprocessed_data = raw_data.get_preprocessed_data()
+        print_info("File fully preprocessed")
+        # TODO: Mal schauen, wie ich das mache. Erstmal nur mit json. Ist erstmal raus. Wird nicht
+        #   umgesetzt.
+        print_error(
+            "Not implemented yet. Please generate .json files first with dfatool and use"
+            " those."
+        )
+    else:
+        print_error("Unknown dataformat")
+        sys.exit(-1)