From 96eb50d12ede52aebb4ef4c116c72cc9280111d8 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 13:20:07 +0200 Subject: analyse-archive: fixed typo; Also added symlink for windows --- bin/analyze-archive.py | 4 +++- bin/dfatool | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/analyze-archive.py b/bin/analyze-archive.py index cfb832f..4531d86 100755 --- a/bin/analyze-archive.py +++ b/bin/analyze-archive.py @@ -357,7 +357,9 @@ if __name__ == "__main__": if raw_data.version <= 1: data_source = "MIMOSA" elif raw_data.version == 2: - data_sourec = "MSP430 EnergyTrace" + data_source = "MSP430 EnergyTrace" + else: + data_source = "UNKNOWN" print(f" Data source ID: {raw_data.version} ({data_source})") preprocessed_data = raw_data.get_preprocessed_data() diff --git a/bin/dfatool b/bin/dfatool index dc598c5..3995af5 120000 --- a/bin/dfatool +++ b/bin/dfatool @@ -1 +1 @@ -../lib \ No newline at end of file +/mnt/c/Users/Janis/Documents/JANIS/UNI/BSc/Bachelorarbeit/aemr/dfatool/lib \ No newline at end of file -- cgit v1.2.3 From 1d2cf70216e3faf7b82d3b96df4bc3ad7cbca291 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 13:21:07 +0200 Subject: added .idea to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 25b1be5..91b6250 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ *.pyc /htmlcov/ /.coverage* +.idea/ -- cgit v1.2.3 From 8aedd0a2ec227b3bc0233ac136d46ff55c8e6af7 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 13:21:44 +0200 Subject: Initial commit ProofofConcept-WIP tool --- bin/Proof_Of_Concept_PELT.py | 297 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 bin/Proof_Of_Concept_PELT.py diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py new file mode 100644 index 0000000..643a368 --- /dev/null +++ b/bin/Proof_Of_Concept_PELT.py @@ -0,0 +1,297 @@ +def plot_data_from_json(filename, trace_num, xaxis, yaxis): + import matplotlib.pyplot as plt + import json + with open(filename, 'r') as f: + tx_data = json.load(f) + print(tx_data[trace_num]['parameter']) + plt.plot(tx_data[trace_num]['offline'][0]['uW']) + plt.xlabel(xaxis) + plt.ylabel(yaxis) + plt.show() + + +def plot_data_vs_mean(signal, xaxis, yaxis): + import matplotlib.pyplot as plt + from statistics import mean + plt.plot(signal) + average = mean(signal) + plt.hlines(average, 0, len(signal)) + plt.xlabel(xaxis) + plt.ylabel(yaxis) + plt.show() + + +def plot_data_vs_data_vs_means(signal1, signal2, xaxis, yaxis): + import matplotlib.pyplot as plt + from statistics import mean + plt.plot(signal1) + lens = max(len(signal1), len(signal2)) + average = mean(signal1) + plt.hlines(average, 0, lens, color='red') + plt.vlines(len(signal1), 0, 100000, color='red', linestyles='dashed') + plt.plot(signal2) + average = mean(signal2) + plt.hlines(average, 0, lens, color='green') + plt.vlines(len(signal2), 0, 100000, color='green', linestyles='dashed') + plt.xlabel(xaxis) + plt.ylabel(yaxis) + plt.show() + + +def get_bkps(algo, pen, q): + res = pen, len(algo.predict(pen=pen)) + q.put(pen) + return res + + +def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing', plotting=False): + from kneed import KneeLocator + kneedle = KneeLocator(data_x, data_y, S=S, curve=curve, direction=direction) + if plotting: + kneedle.plot_knee() + kneepoint = (kneedle.knee, kneedle.knee_y) + return kneepoint + + +def calc_PELT(signal, model='l1', jump=5, 
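
The find_knee_point helper above is a thin wrapper around kneed's KneeLocator, which implements the elbow/knee heuristic later used to choose the PELT penalty: as the penalty grows, the number of reported changepoints first drops steeply and then levels off, and the bend of that curve is taken as the penalty. A minimal, self-contained sketch on a synthetic curve (all values, including the knee position near 10, are invented for illustration):

    from kneed import KneeLocator

    pen_values = list(range(0, 51))
    # steep decrease up to penalty 10, then an almost flat tail -- the shape the
    # penalty sweep in calc_PELT produces
    num_changepoints = [200 - 19 * p for p in range(11)] + [10] * 40
    kneedle = KneeLocator(pen_values, num_changepoints,
                          S=1.0, curve='convex', direction='decreasing')
    print(kneedle.knee, kneedle.knee_y)  # expected to land near the bend at penalty 10
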
min_dist=2, range_min=1, range_max=50, num_processes=8, refresh_delay=1, + refresh_thresh=5, S=1.0, pen_override=None, plotting=False): + import ruptures as rpt + import time + import matplotlib.pylab as plt + from multiprocessing import Pool, Manager + + # default params in Function + if model is None: + model = 'l1' + if jump is None: + jump = 5 + if min_dist is None: + min_dist = 2 + if range_min is None: + range_min = 1 + if range_max is None: + range_max = 50 + if num_processes is None: + num_processes = 8 + if refresh_delay is None: + refresh_delay = 1 + if refresh_thresh is None: + refresh_thresh = 5 + if S is None: + S = 1.0 + if plotting is None: + plotting = False + + # change point detection. best fit seemingly with l1. rbf prods. RuntimeErr for pen > 30 + # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html + # model = "l1" #"l1" # "l2", "rbf" + algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal) + + ### CALC BKPS WITH DIFF PENALTYS + if pen_override is None: + # building args array for parallelizing + args = [] + # for displaying progression + m = Manager() + q = m.Queue() + + for i in range(range_min, range_max): + args.append((algo, i, q)) + + print('starting kneepoint calculation') + # init Pool with num_proesses + with Pool(num_processes) as p: + # collect results from pool + result = p.starmap_async(get_bkps, args) + # monitor loop + last_percentage = -1 + percentage = -100 # Force display of 0% + i = 0 + while True: + if result.ready(): + break + else: + size = q.qsize() + last_percentage = percentage + percentage = round(size / (range_max - range_min) * 100, 2) + if percentage >= last_percentage + 2 or i >= refresh_thresh: + print('Current progress: ' + str(percentage) + '%') + i = 0 + else: + i += 1 + time.sleep(refresh_delay) + res = result.get() + + # DECIDE WHICH PENALTY VALUE TO CHOOSE ACCORDING TO ELBOW/KNEE APPROACH + # split x and y coords to pass to kneedle + pen_val = [x[0] for x in res] + fittet_bkps_val = [x[1] for x in res] + # # plot to look at res + + knee = find_knee_point(pen_val, fittet_bkps_val, S=S, plotting=plotting) + plt.xlabel('Penalty') + plt.ylabel('Number of Changepoints') + plt.plot(pen_val, fittet_bkps_val) + plt.vlines(knee[0], 0, max(fittet_bkps_val), linestyles='dashed') + print("knee: " + str(knee[0])) + plt.show() + else: + # use forced pen value for plotting + knee = (pen_override, None) + + + #plt.plot(pen_val, fittet_bkps_val) + if knee[0] is not None: + bkps = algo.predict(pen=knee[0]) + if plotting: + fig, ax = rpt.display(signal, bkps) + plt.show() + return bkps + else: + print('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') + + +if __name__ == '__main__': + import numpy as np + import json + import ruptures as rpt + import matplotlib.pylab as plt + import sys + import getopt + import re + from dfatool.dfatool import RawData + opt = dict() + + optspec = ( + "filename= " + "v " + "model= " + "jump= " + "min_dist= " + "range_min= " + "range_max= " + "num_processes= " + "refresh_delay= " + "refresh_thresh= " + "S= " + "pen_override= " + "plotting= " + ) + opt_filename = None + opt_verbose = False + opt_model = None + opt_jump = None + opt_min_dist = None + opt_range_min = None + opt_range_max = None + opt_num_processes = None + opt_refresh_delay = None + opt_refresh_thresh = None + opt_S = None + opt_pen_override = None + opt_plotting = False + try: + raw_opts, args = getopt.getopt(sys.argv[1:], "", optspec.split(" ")) + + for option, 
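
calc_PELT delegates the actual changepoint search to ruptures' Pelt estimator and only adds the parallel penalty sweep and knee selection around it. A minimal sketch of the underlying ruptures call on a synthetic two-level signal (signal shape, noise level and the penalty of 10 are made up; ruptures reports the end index of every segment, so the last breakpoint equals the signal length):

    import numpy as np
    import ruptures as rpt

    rng = np.random.default_rng(0)
    # 300 noisy samples around 10, followed by 300 noisy samples around 35
    signal = np.concatenate([rng.normal(10, 1, 300), rng.normal(35, 1, 300)])

    algo = rpt.Pelt(model='l1', jump=5, min_size=2).fit(signal)
    bkps = algo.predict(pen=10)
    print(bkps)  # something like [300, 600] -- one changepoint plus the end of the signal
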
parameter in raw_opts: + optname = re.sub(r"^--", "", option) + opt[optname] = parameter + + if 'filename' not in opt: + print("No file specified!", file=sys.stderr) + sys.exit(2) + else: + opt_filename = opt['filename'] + if 'v' in opt: + opt_verbose = True + opt_plotting = True + if 'model' in opt: + opt_model = opt['model'] + if 'jump' in opt: + try: + opt_jump = int(opt['jump']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'min_dist' in opt: + try: + opt_min_dist = int(opt['min_dist']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'range_min' in opt: + try: + opt_range_min = int(opt['range_min']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'range_max' in opt: + try: + opt_range_max = int(opt['range_max']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'num_processes' in opt: + try: + opt_num_processes = int(opt['num_processes']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'refresh_delay' in opt: + try: + opt_refresh_delay = int(opt['refresh_delay']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'refresh_thresh' in opt: + try: + opt_refresh_thresh = int(opt['refresh_thresh']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'S' in opt: + try: + opt_S = float(opt['S']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + if 'pen_override' in opt: + try: + opt_pen_override = int(opt['pen_override']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) + except getopt.GetoptError as err: + print(err, file=sys.stderr) + sys.exit(2) + + if ".json" in opt_filename: + # open file with trace data from json + with open(opt['filename'], 'r') as f: + tx_data = json.load(f) + elif ".tar" in opt_filename: + # open with dfatool + raw_data_args = list() + raw_data_args.append(opt_filename) + raw_data = RawData( + raw_data_args, with_traces=True + ) + print("Preprocessing file. Depending on its size, this could take a while.") + preprocessed_data = raw_data.get_preprocessed_data() + print("File fully preprocessed") + + else: + print("Unknown dataformat", file=sys.stderr) + sys.exit(2) + + print(tx_data[1]['parameter']) + # parse json to array for PELT + signal = np.array(tx_data[1]['offline'][0]['uW']) + + for i in range(0, len(signal)): + signal[i] = signal[i]/1000 + bkps = calc_PELT(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) + fig, ax = rpt.display(signal, bkps) + plt.xlabel('Time [us]') + plt.ylabel('Power [mW]') + plt.show() -- cgit v1.2.3 From 2c50b0996563ae2eb313b3d74f762e50c8ca9f6a Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 17:48:24 +0200 Subject: Proof_Of_Concept_Pelt - Implementation of decision whether to refine state or skip it --- bin/Proof_Of_Concept_PELT.py | 137 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 124 insertions(+), 13 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 643a368..2ed7675 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -151,6 +151,89 @@ def calc_PELT(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, print('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') +# very short benchmark yielded approx. 
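
For the JSON input path, the script assumes a list of state entries, each with a 'parameter' dict and a list of 'offline' measurements whose 'uW' field holds the sampled power trace; this layout is inferred from the field accesses in the code. A small self-contained sketch of that assumed layout (the state name, parameter names and all numbers are invented):

    import json

    example = [
        {
            'name': 'TX',
            'parameter': {'datarate': 250, 'txpower': 0},
            'offline': [
                {'uW': [980.0, 1012.5, 20400.0, 20385.2, 1001.3],
                 'uW_mean': 8955.8, 'uW_std': 9541.2},
            ],
        }
    ]
    with open('example_trace.json', 'w') as f:
        json.dump(example, f)

    with open('example_trace.json', 'r') as f:
        tx_data = json.load(f)
    print(tx_data[0]['parameter'], tx_data[0]['offline'][0]['uW'])
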
1/3 of speed compared to solution with sorting +def needs_refinement_no_sort(signal, mean, thresh): + # linear search for the top 10%/ bottom 10% + # should be sufficient + length_of_signal = len(signal) + percentile_size = int() + percentile_size = length_of_signal // 100 + upper_percentile = [None] * percentile_size + lower_percentile = [None] * percentile_size + fill_index_upper = percentile_size - 1 + fill_index_lower = percentile_size - 1 + index_smallest_val = fill_index_upper + index_largest_val = fill_index_lower + + for x in signal: + if x > mean: + # will be in upper percentile + if fill_index_upper >= 0: + upper_percentile[fill_index_upper] = x + if x < upper_percentile[index_smallest_val]: + index_smallest_val = fill_index_upper + fill_index_upper = fill_index_upper - 1 + continue + + if x > upper_percentile[index_smallest_val]: + # replace smallest val. Find next smallest val + upper_percentile[index_smallest_val] = x + index_smallest_val = 0 + i = 0 + for y in upper_percentile: + if upper_percentile[i] < upper_percentile[index_smallest_val]: + index_smallest_val = i + i = i + 1 + + else: + if fill_index_lower >= 0: + lower_percentile[fill_index_lower] = x + if x > lower_percentile[index_largest_val]: + index_largest_val = fill_index_upper + fill_index_lower = fill_index_lower - 1 + continue + if x < lower_percentile[index_largest_val]: + # replace smallest val. Find next smallest val + lower_percentile[index_largest_val] = x + index_largest_val = 0 + i = 0 + for y in lower_percentile: + if lower_percentile[i] > lower_percentile[index_largest_val]: + index_largest_val = i + i = i + 1 + + # should have the percentiles + lower_percentile_mean = np.mean(lower_percentile) + upper_percentile_mean = np.mean(upper_percentile) + dist = mean - lower_percentile_mean + if dist > thresh: + return True + dist = upper_percentile_mean - mean + if dist > thresh: + return True + return False + + +# Very short benchmark yielded approx. 
3 times the speed of solution not using sort +def needs_refinement_sort(signal, thresh): + sorted_signal = sorted(signal) + length_of_signal = len(signal) + percentile_size = int() + percentile_size = length_of_signal // 100 + lower_percentile = sorted_signal[0:percentile_size] + upper_percentile = sorted_signal[length_of_signal - percentile_size : length_of_signal] + lower_percentile_mean = np.mean(lower_percentile) + upper_percentile_mean = np.mean(upper_percentile) + median = np.median(sorted_signal) + dist = median - lower_percentile_mean + if dist > thresh: + return True + dist = upper_percentile_mean - median + if dist > thresh: + return True + return False + + if __name__ == '__main__': import numpy as np import json @@ -160,6 +243,7 @@ if __name__ == '__main__': import getopt import re from dfatool.dfatool import RawData + # OPTION RECOGNITION opt = dict() optspec = ( @@ -176,6 +260,7 @@ if __name__ == '__main__': "S= " "pen_override= " "plotting= " + "refinement_thresh= " ) opt_filename = None opt_verbose = False @@ -190,6 +275,7 @@ if __name__ == '__main__': opt_S = None opt_pen_override = None opt_plotting = False + opt_refinement_thresh = None try: raw_opts, args = getopt.getopt(sys.argv[1:], "", optspec.split(" ")) @@ -261,14 +347,38 @@ if __name__ == '__main__': except ValueError as verr: print(verr, file=sys.stderr) sys.exit(2) + if 'refinement_thresh' in opt: + try: + opt_refinement_thresh = int(opt['refinement_thresh']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) except getopt.GetoptError as err: print(err, file=sys.stderr) sys.exit(2) + #OPENING DATA + import time if ".json" in opt_filename: # open file with trace data from json - with open(opt['filename'], 'r') as f: - tx_data = json.load(f) + print("[INFO] Will only refine the state which is present in " + opt_filename + " if necessary.") + with open(opt_filename, 'r') as f: + states = json.load(f) + # loop through all traces check if refinement is necessary + print("Checking if refinement is necessary...") + res = False + for measurements_by_state in states: + # loop through all occurrences of the looked at state + print("Looking at state '" + measurements_by_state['name'] + "'") + for measurement in measurements_by_state['offline']: + # loop through measurements of particular state + # an check if state needs refinement + signal = measurement['uW'] + # mean = measurement['uW_mean'] + # TODO: Decide if median is really the better baseline than mean + if needs_refinement_sort(signal, opt_refinement_thresh): + print("Refinement is necessary!") + break elif ".tar" in opt_filename: # open with dfatool raw_data_args = list() @@ -280,18 +390,19 @@ if __name__ == '__main__': preprocessed_data = raw_data.get_preprocessed_data() print("File fully preprocessed") + # TODO: Mal schauen, wie ich das mache. 
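
Both refinement checks above make the same decision: take the lowest and the highest length_of_signal // 100 samples (i.e. 1% tails, despite the "top 10%/ bottom 10%" wording in the comment), average each tail, and request refinement if either tail mean lies more than thresh away from the baseline (mean or median). A compact numpy variant of that test, shown only as an illustration (the function name, the constructed signal and the threshold are made up):

    import numpy as np

    def needs_refinement_np(signal, thresh):
        signal = np.sort(np.asarray(signal))
        k = max(1, len(signal) // 100)      # 1% tails as above (guarded for very short signals)
        lower_mean = signal[:k].mean()      # mean of the lowest 1% of samples
        upper_mean = signal[-k:].mean()     # mean of the highest 1% of samples
        median = np.median(signal)
        return (median - lower_mean) > thresh or (upper_mean - median) > thresh

    # a state whose power alternates between two plateaus should trigger refinement
    example = np.concatenate([np.full(500, 1000.0), np.full(500, 2000.0)])
    print(needs_refinement_np(example, thresh=100))  # True for this constructed signal
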
Erstmal nur mit json else: print("Unknown dataformat", file=sys.stderr) sys.exit(2) - print(tx_data[1]['parameter']) - # parse json to array for PELT - signal = np.array(tx_data[1]['offline'][0]['uW']) - - for i in range(0, len(signal)): - signal[i] = signal[i]/1000 - bkps = calc_PELT(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) - fig, ax = rpt.display(signal, bkps) - plt.xlabel('Time [us]') - plt.ylabel('Power [mW]') - plt.show() + # print(tx_data[1]['parameter']) + # # parse json to array for PELT + # signal = np.array(tx_data[1]['offline'][0]['uW']) + # + # for i in range(0, len(signal)): + # signal[i] = signal[i]/1000 + # bkps = calc_PELT(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) + # fig, ax = rpt.display(signal, bkps) + # plt.xlabel('Time [us]') + # plt.ylabel('Power [mW]') + # plt.show() -- cgit v1.2.3 From e790c0ff3372b153c582b4adfc7f06a5ba86b5f6 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 17:51:23 +0200 Subject: Proof_Of_Concept_PELT - renamed decision function --- bin/Proof_Of_Concept_PELT.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 2ed7675..6912b02 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -215,7 +215,7 @@ def needs_refinement_no_sort(signal, mean, thresh): # Very short benchmark yielded approx. 3 times the speed of solution not using sort -def needs_refinement_sort(signal, thresh): +def needs_refinement(signal, thresh): sorted_signal = sorted(signal) length_of_signal = len(signal) percentile_size = int() @@ -376,7 +376,7 @@ if __name__ == '__main__': signal = measurement['uW'] # mean = measurement['uW_mean'] # TODO: Decide if median is really the better baseline than mean - if needs_refinement_sort(signal, opt_refinement_thresh): + if needs_refinement(signal, opt_refinement_thresh): print("Refinement is necessary!") break elif ".tar" in opt_filename: -- cgit v1.2.3 From 9075b8ffdbf15425e290747603450438513bca0c Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 2 Jul 2020 18:09:20 +0200 Subject: Proof_Of_Concept_PELT - Code aufgeräumt / Imports am Modulanfang / Typos fixed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 77 +++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 6912b02..452ff3f 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -1,40 +1,47 @@ -def plot_data_from_json(filename, trace_num, xaxis, yaxis): - import matplotlib.pyplot as plt - import json +import matplotlib.pyplot as plt +import json +from kneed import KneeLocator +import ruptures as rpt +import time +from multiprocessing import Pool, Manager +import numpy as np +import sys +import getopt +import re +from dfatool.dfatool import RawData + + +def plot_data_from_json(filename, trace_num, x_axis, y_axis): with open(filename, 'r') as f: tx_data = json.load(f) print(tx_data[trace_num]['parameter']) plt.plot(tx_data[trace_num]['offline'][0]['uW']) - plt.xlabel(xaxis) - plt.ylabel(yaxis) + plt.xlabel(x_axis) + plt.ylabel(y_axis) plt.show() -def plot_data_vs_mean(signal, xaxis, yaxis): - import matplotlib.pyplot as plt - from statistics import mean +def plot_data_vs_mean(signal, x_axis, y_axis): 
plt.plot(signal) - average = mean(signal) + average = np.mean(signal) plt.hlines(average, 0, len(signal)) - plt.xlabel(xaxis) - plt.ylabel(yaxis) + plt.xlabel(x_axis) + plt.ylabel(y_axis) plt.show() -def plot_data_vs_data_vs_means(signal1, signal2, xaxis, yaxis): - import matplotlib.pyplot as plt - from statistics import mean +def plot_data_vs_data_vs_means(signal1, signal2, x_axis, y_axis): plt.plot(signal1) lens = max(len(signal1), len(signal2)) - average = mean(signal1) + average = np.mean(signal1) plt.hlines(average, 0, lens, color='red') plt.vlines(len(signal1), 0, 100000, color='red', linestyles='dashed') plt.plot(signal2) - average = mean(signal2) + average = np.mean(signal2) plt.hlines(average, 0, lens, color='green') plt.vlines(len(signal2), 0, 100000, color='green', linestyles='dashed') - plt.xlabel(xaxis) - plt.ylabel(yaxis) + plt.xlabel(x_axis) + plt.ylabel(y_axis) plt.show() @@ -45,7 +52,6 @@ def get_bkps(algo, pen, q): def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing', plotting=False): - from kneed import KneeLocator kneedle = KneeLocator(data_x, data_y, S=S, curve=curve, direction=direction) if plotting: kneedle.plot_knee() @@ -53,13 +59,8 @@ def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing return kneepoint -def calc_PELT(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, num_processes=8, refresh_delay=1, +def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, pen_override=None, plotting=False): - import ruptures as rpt - import time - import matplotlib.pylab as plt - from multiprocessing import Pool, Manager - # default params in Function if model is None: model = 'l1' @@ -104,7 +105,6 @@ def calc_PELT(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, # collect results from pool result = p.starmap_async(get_bkps, args) # monitor loop - last_percentage = -1 percentage = -100 # Force display of 0% i = 0 while True: @@ -125,22 +125,21 @@ def calc_PELT(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, # DECIDE WHICH PENALTY VALUE TO CHOOSE ACCORDING TO ELBOW/KNEE APPROACH # split x and y coords to pass to kneedle pen_val = [x[0] for x in res] - fittet_bkps_val = [x[1] for x in res] + fitted_bkps_val = [x[1] for x in res] # # plot to look at res - knee = find_knee_point(pen_val, fittet_bkps_val, S=S, plotting=plotting) + knee = find_knee_point(pen_val, fitted_bkps_val, S=S, plotting=plotting) plt.xlabel('Penalty') plt.ylabel('Number of Changepoints') - plt.plot(pen_val, fittet_bkps_val) - plt.vlines(knee[0], 0, max(fittet_bkps_val), linestyles='dashed') + plt.plot(pen_val, fitted_bkps_val) + plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') print("knee: " + str(knee[0])) plt.show() else: # use forced pen value for plotting knee = (pen_override, None) - - #plt.plot(pen_val, fittet_bkps_val) + # plt.plot(pen_val, fittet_bkps_val) if knee[0] is not None: bkps = algo.predict(pen=knee[0]) if plotting: @@ -215,6 +214,7 @@ def needs_refinement_no_sort(signal, mean, thresh): # Very short benchmark yielded approx. 
3 times the speed of solution not using sort +# TODO: Decide whether median is really the better baseline than mean def needs_refinement(signal, thresh): sorted_signal = sorted(signal) length_of_signal = len(signal) @@ -235,14 +235,6 @@ def needs_refinement(signal, thresh): if __name__ == '__main__': - import numpy as np - import json - import ruptures as rpt - import matplotlib.pylab as plt - import sys - import getopt - import re - from dfatool.dfatool import RawData # OPTION RECOGNITION opt = dict() @@ -357,8 +349,7 @@ if __name__ == '__main__': print(err, file=sys.stderr) sys.exit(2) - #OPENING DATA - import time + # OPENING DATA if ".json" in opt_filename: # open file with trace data from json print("[INFO] Will only refine the state which is present in " + opt_filename + " if necessary.") @@ -401,7 +392,7 @@ if __name__ == '__main__': # # for i in range(0, len(signal)): # signal[i] = signal[i]/1000 - # bkps = calc_PELT(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) + # bkps = calc_pelt(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) # fig, ax = rpt.display(signal, bkps) # plt.xlabel('Time [us]') # plt.ylabel('Power [mW]') -- cgit v1.2.3 From 23a07bf5da14980aeadf7c0e12b422117b3680bc Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sun, 5 Jul 2020 17:29:31 +0200 Subject: bin/Proof_of_Concept_PELT: States are now calculated per Measurement per State-config. Some statistics are calculated for that. Clustering pending --- bin/Proof_Of_Concept_PELT.py | 130 +++++++++++++++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 29 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 452ff3f..d4878c1 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -59,7 +59,7 @@ def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing return kneepoint -def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, num_processes=8, refresh_delay=1, +def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, pen_override=None, plotting=False): # default params in Function if model is None: @@ -69,7 +69,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, if min_dist is None: min_dist = 2 if range_min is None: - range_min = 1 + range_min = 0 if range_max is None: range_max = 50 if num_processes is None: @@ -82,24 +82,23 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, S = 1.0 if plotting is None: plotting = False - # change point detection. best fit seemingly with l1. rbf prods. 
RuntimeErr for pen > 30 # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html # model = "l1" #"l1" # "l2", "rbf" algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal) ### CALC BKPS WITH DIFF PENALTYS - if pen_override is None: + if pen_override is None and range_max != range_min: # building args array for parallelizing args = [] # for displaying progression m = Manager() q = m.Queue() - for i in range(range_min, range_max): + for i in range(range_min, range_max + 1): args.append((algo, i, q)) - print('starting kneepoint calculation') + print('[INFO]starting kneepoint calculation.') # init Pool with num_proesses with Pool(num_processes) as p: # collect results from pool @@ -115,30 +114,32 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, last_percentage = percentage percentage = round(size / (range_max - range_min) * 100, 2) if percentage >= last_percentage + 2 or i >= refresh_thresh: - print('Current progress: ' + str(percentage) + '%') + print('[INFO]Current progress: ' + str(percentage) + '%') i = 0 else: i += 1 time.sleep(refresh_delay) res = result.get() - + print_info("Finished kneepoint calculation.") # DECIDE WHICH PENALTY VALUE TO CHOOSE ACCORDING TO ELBOW/KNEE APPROACH # split x and y coords to pass to kneedle pen_val = [x[0] for x in res] fitted_bkps_val = [x[1] for x in res] # # plot to look at res - knee = find_knee_point(pen_val, fitted_bkps_val, S=S, plotting=plotting) - plt.xlabel('Penalty') - plt.ylabel('Number of Changepoints') - plt.plot(pen_val, fitted_bkps_val) - plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') - print("knee: " + str(knee[0])) - plt.show() + # plt.xlabel('Penalty') + # plt.ylabel('Number of Changepoints') + # plt.plot(pen_val, fitted_bkps_val) + # plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') + # print("knee: " + str(knee[0])) + # plt.show() else: - # use forced pen value for plotting - knee = (pen_override, None) - + # use forced pen value for plotting if specified. Else use only pen in range + if pen_override is not None: + knee = (pen_override, None) + else: + knee = (range_min, None) + print_info("" + str(knee[0]) + " has been selected as kneepoint.") # plt.plot(pen_val, fittet_bkps_val) if knee[0] is not None: bkps = algo.predict(pen=knee[0]) @@ -147,7 +148,8 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=1, range_max=50, plt.show() return bkps else: - print('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') + print_error('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') + exit() # very short benchmark yielded approx. 
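
The penalty sweep above fans algo.predict(pen=i) out over a multiprocessing Pool and uses the Manager queue only as a progress counter: every finished worker puts one item, and the monitor loop merely reads qsize(). The same pattern reduced to a dummy task, as a self-contained sketch (task, sleep times and worker count are arbitrary):

    import time
    from multiprocessing import Pool, Manager

    def slow_task(x, q):
        time.sleep(0.1)      # stand-in for algo.predict(pen=x)
        q.put(x)             # progress ticket; the value itself is never read
        return x, x * x

    if __name__ == '__main__':
        with Manager() as manager:
            queue = manager.Queue()
            jobs = [(i, queue) for i in range(40)]
            with Pool(4) as pool:
                result = pool.starmap_async(slow_task, jobs)
                while not result.ready():
                    print('progress: ' + str(queue.qsize()) + '/' + str(len(jobs)))
                    time.sleep(0.2)
                print(result.get()[:3])
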
1/3 of speed compared to solution with sorting @@ -221,7 +223,7 @@ def needs_refinement(signal, thresh): percentile_size = int() percentile_size = length_of_signal // 100 lower_percentile = sorted_signal[0:percentile_size] - upper_percentile = sorted_signal[length_of_signal - percentile_size : length_of_signal] + upper_percentile = sorted_signal[length_of_signal - percentile_size: length_of_signal] lower_percentile_mean = np.mean(lower_percentile) upper_percentile_mean = np.mean(upper_percentile) median = np.median(sorted_signal) @@ -234,6 +236,18 @@ def needs_refinement(signal, thresh): return False +def print_info(str): + print("[INFO]" + str) + + +def print_warning(str): + print("[WARNING]" + str) + + +def print_error(str): + print("ERROR" + str, file=sys.stderr) + + if __name__ == '__main__': # OPTION RECOGNITION opt = dict() @@ -276,7 +290,7 @@ if __name__ == '__main__': opt[optname] = parameter if 'filename' not in opt: - print("No file specified!", file=sys.stderr) + print_error("No file specified!") sys.exit(2) else: opt_filename = opt['filename'] @@ -352,15 +366,16 @@ if __name__ == '__main__': # OPENING DATA if ".json" in opt_filename: # open file with trace data from json - print("[INFO] Will only refine the state which is present in " + opt_filename + " if necessary.") + print_info(" Will only refine the state which is present in " + opt_filename + " if necessary.") with open(opt_filename, 'r') as f: states = json.load(f) # loop through all traces check if refinement is necessary - print("Checking if refinement is necessary...") - res = False + print_info("Checking if refinement is necessary...") for measurements_by_state in states: # loop through all occurrences of the looked at state - print("Looking at state '" + measurements_by_state['name'] + "'") + print_info("Looking at state '" + measurements_by_state['name'] + "' with params: " + + str(measurements_by_state['parameter'])) + refine = False for measurement in measurements_by_state['offline']: # loop through measurements of particular state # an check if state needs refinement @@ -368,8 +383,65 @@ if __name__ == '__main__': # mean = measurement['uW_mean'] # TODO: Decide if median is really the better baseline than mean if needs_refinement(signal, opt_refinement_thresh): - print("Refinement is necessary!") + print_info("Refinement is necessary!") + refine = True break + if not refine: + print_info("No refinement necessary for state '" + measurements_by_state['name'] + "'") + else: + # calc and save all bkpts for the given state and param config + state_list = list() + for measurement in measurements_by_state['offline']: + signal = np.array(measurement['uW']) + normed_signal = np.zeros(shape=len(signal)) + for i in range(0, len(signal)): + normed_signal[i] = signal[i] / 1000 + bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, range_max=opt_range_max, + num_processes=opt_num_processes, jump=opt_jump, S=opt_S, + pen_override=opt_pen_override) + calced_states = list() + start_time = 0 + end_time = 0 + for bkpt in bkpts: + # start_time of state is end_time of previous one(Transitions are instantaneous) + start_time = end_time + end_time = bkpt + power_vals = signal[start_time: end_time] + mean_power = np.mean(power_vals) + std_dev = np.std(power_vals) + calced_state = (start_time, end_time, mean_power, std_dev) + calced_states.append(calced_state) + num = 0 + new_avg_std = 0 + for s in calced_states: + print_info("State " + str(num) + " starts at t=" + str(s[0]) + " and ends at t=" + str(s[1]) + + " 
while using " + str(s[2]) + "uW with sigma=" + str(s[3])) + num = num + 1 + new_avg_std = new_avg_std + s[3] + new_avg_std = new_avg_std / len(calced_states) + change_avg_std = measurement['uW_std'] - new_avg_std + print_info("The average standard deviation for the newly found states is " + str(new_avg_std) + + ".\n[INFO]That is a reduction of " + str(change_avg_std)) + state_list.append(calced_states) + num_states_array = np.zeros(shape=len(measurements_by_state['offline'])) + i = 0 + for x in state_list: + num_states_array[i] = len(x) + i = i + 1 + avg_num_states = np.mean(num_states_array) + num_states_dev = np.std(num_states_array) + print_info("On average " + str(avg_num_states) + " States have been found. The standard deviation" + + " is " + str(num_states_dev)) + # TODO: MAGIC NUMBER + if num_states_dev > 1: + print_warning("The number of states varies strongly across measurements. Consider choosing a " + "larger value for S.") + time.sleep(5) + # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? + # Einfach Durchschnitt nehmen? + # TODO: TESTING PURPOSES + exit() + elif ".tar" in opt_filename: # open with dfatool raw_data_args = list() @@ -377,13 +449,13 @@ if __name__ == '__main__': raw_data = RawData( raw_data_args, with_traces=True ) - print("Preprocessing file. Depending on its size, this could take a while.") + print_info("Preprocessing file. Depending on its size, this could take a while.") preprocessed_data = raw_data.get_preprocessed_data() - print("File fully preprocessed") + print_info("File fully preprocessed") # TODO: Mal schauen, wie ich das mache. Erstmal nur mit json else: - print("Unknown dataformat", file=sys.stderr) + print_error("Unknown dataformat") sys.exit(2) # print(tx_data[1]['parameter']) -- cgit v1.2.3 From bd1b0c578ab7049f2826c653831d700caa59f7ac Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sun, 5 Jul 2020 17:35:06 +0200 Subject: save unixsymlink --- bin/dfatool(UNIX) | 1 + 1 file changed, 1 insertion(+) create mode 120000 bin/dfatool(UNIX) diff --git a/bin/dfatool(UNIX) b/bin/dfatool(UNIX) new file mode 120000 index 0000000..dc598c5 --- /dev/null +++ b/bin/dfatool(UNIX) @@ -0,0 +1 @@ +../lib \ No newline at end of file -- cgit v1.2.3 From dd5533aca7cd8d13e23c49e3dd81141347a51dfb Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sun, 5 Jul 2020 17:47:05 +0200 Subject: bin/: Auf einmal funktioniert auch der UNIX Symlink für Windows... Nehm ich wohl. Aber wieso nicht gleich so? MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/dfatool | 2 +- bin/dfatool(UNIX) | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 120000 bin/dfatool(UNIX) diff --git a/bin/dfatool b/bin/dfatool index 3995af5..dc598c5 120000 --- a/bin/dfatool +++ b/bin/dfatool @@ -1 +1 @@ -/mnt/c/Users/Janis/Documents/JANIS/UNI/BSc/Bachelorarbeit/aemr/dfatool/lib \ No newline at end of file +../lib \ No newline at end of file diff --git a/bin/dfatool(UNIX) b/bin/dfatool(UNIX) deleted file mode 120000 index dc598c5..0000000 --- a/bin/dfatool(UNIX) +++ /dev/null @@ -1 +0,0 @@ -../lib \ No newline at end of file -- cgit v1.2.3 From bb19cc60ffad666afb7970fc36de2093be445166 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Mon, 6 Jul 2020 18:51:33 +0200 Subject: bin/Proof_Of_Concept_PELT: Added initial clustering via sklearn AgglomerativeClustering with affinity euclidean and ward linkage. added pen_modifier option, with which the penalty can be manipulated. e.g. 
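
The per-state bookkeeping introduced above turns each breakpoint list into (start_time, end_time, mean_power, std_dev) tuples and compares the averaged per-segment standard deviation with the original per-state uW_std. The same bookkeeping in isolation, as a sketch with invented numbers (the breakpoint list follows the ruptures convention of naming each segment's end index):

    import numpy as np

    signal = np.concatenate([np.full(300, 10.0), np.full(200, 35.0), np.full(100, 12.0)])
    bkps = [300, 500, 600]   # end index of each segment; the last entry equals len(signal)

    segments = []
    start = 0
    for end in bkps:
        chunk = signal[start:end]
        segments.append((start, end, float(np.mean(chunk)), float(np.std(chunk))))
        start = end          # transitions are treated as instantaneous

    for seg in segments:
        print(seg)
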
Doubled, halved or otherwise modified --- bin/Proof_Of_Concept_PELT.py | 111 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 14 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index d4878c1..80f7c04 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -10,6 +10,11 @@ import getopt import re from dfatool.dfatool import RawData +from sklearn.cluster import AgglomerativeClustering +from scipy.cluster.hierarchy import dendrogram, linkage + +# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=10 --refinement_thresh=100 + def plot_data_from_json(filename, trace_num, x_axis, y_axis): with open(filename, 'r') as f: @@ -60,7 +65,7 @@ def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, - refresh_thresh=5, S=1.0, pen_override=None, plotting=False): + refresh_thresh=5, S=1.0, pen_override=None, pen_modifier=None, plotting=False): # default params in Function if model is None: model = 'l1' @@ -82,6 +87,8 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, S = 1.0 if plotting is None: plotting = False + if pen_modifier is None: + pen_modifier = 1 # change point detection. best fit seemingly with l1. rbf prods. RuntimeErr for pen > 30 # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html # model = "l1" #"l1" # "l2", "rbf" @@ -98,7 +105,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, for i in range(range_min, range_max + 1): args.append((algo, i, q)) - print('[INFO]starting kneepoint calculation.') + print_info('starting kneepoint calculation.') # init Pool with num_proesses with Pool(num_processes) as p: # collect results from pool @@ -114,7 +121,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, last_percentage = percentage percentage = round(size / (range_max - range_min) * 100, 2) if percentage >= last_percentage + 2 or i >= refresh_thresh: - print('[INFO]Current progress: ' + str(percentage) + '%') + print_info('Current progress: ' + str(percentage) + '%') i = 0 else: i += 1 @@ -133,6 +140,8 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, # plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') # print("knee: " + str(knee[0])) # plt.show() + # modify knee according to options. Defaults to 1 * knee + knee = (knee[0] * pen_modifier, knee[1]) else: # use forced pen value for plotting if specified. 
Else use only pen in range if pen_override is not None: @@ -237,15 +246,21 @@ def needs_refinement(signal, thresh): def print_info(str): - print("[INFO]" + str) + str_lst = str.split(sep='\n') + for str in str_lst: + print("[INFO]" + str) def print_warning(str): - print("[WARNING]" + str) + str_lst = str.split(sep='\n') + for str in str_lst: + print("[WARNING]" + str) def print_error(str): - print("ERROR" + str, file=sys.stderr) + str_lst = str.split(sep='\n') + for str in str_lst: + print("[ERROR]" + str, file=sys.stderr) if __name__ == '__main__': @@ -265,6 +280,7 @@ if __name__ == '__main__': "refresh_thresh= " "S= " "pen_override= " + "pen_modifier= " "plotting= " "refinement_thresh= " ) @@ -280,6 +296,7 @@ if __name__ == '__main__': opt_refresh_thresh = None opt_S = None opt_pen_override = None + opt_pen_modifier = None opt_plotting = False opt_refinement_thresh = None try: @@ -353,6 +370,12 @@ if __name__ == '__main__': except ValueError as verr: print(verr, file=sys.stderr) sys.exit(2) + if 'pen_modifier' in opt: + try: + opt_pen_modifier = float(opt['pen_modifier']) + except ValueError as verr: + print(verr, file=sys.stderr) + sys.exit(2) if 'refinement_thresh' in opt: try: opt_refinement_thresh = int(opt['refinement_thresh']) @@ -390,7 +413,7 @@ if __name__ == '__main__': print_info("No refinement necessary for state '" + measurements_by_state['name'] + "'") else: # calc and save all bkpts for the given state and param config - state_list = list() + raw_states_list = list() for measurement in measurements_by_state['offline']: signal = np.array(measurement['uW']) normed_signal = np.zeros(shape=len(signal)) @@ -398,7 +421,7 @@ if __name__ == '__main__': normed_signal[i] = signal[i] / 1000 bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S, - pen_override=opt_pen_override) + pen_override=opt_pen_override, pen_modifier=opt_pen_modifier) calced_states = list() start_time = 0 end_time = 0 @@ -420,12 +443,12 @@ if __name__ == '__main__': new_avg_std = new_avg_std + s[3] new_avg_std = new_avg_std / len(calced_states) change_avg_std = measurement['uW_std'] - new_avg_std - print_info("The average standard deviation for the newly found states is " + str(new_avg_std) - + ".\n[INFO]That is a reduction of " + str(change_avg_std)) - state_list.append(calced_states) - num_states_array = np.zeros(shape=len(measurements_by_state['offline'])) + print_info("The average standard deviation for the newly found states is " + str(new_avg_std)) + print_info("That is a reduction of " + str(change_avg_std)) + raw_states_list.append(calced_states) + num_states_array = [int()] * len(raw_states_list) i = 0 - for x in state_list: + for x in raw_states_list: num_states_array[i] = len(x) i = i + 1 avg_num_states = np.mean(num_states_array) @@ -435,10 +458,70 @@ if __name__ == '__main__': # TODO: MAGIC NUMBER if num_states_dev > 1: print_warning("The number of states varies strongly across measurements. Consider choosing a " - "larger value for S.") + "larger value for S or using the pen_modifier option.") time.sleep(5) # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? # Einfach Durchschnitt nehmen? 
+ # Preliminary decision: Further on only use the traces, which have the most frequent state count + counts = np.bincount(num_states_array) + num_raw_states = np.argmax(counts) + print_info("Choose " + str(num_raw_states) + " as number of raw_states.") + i = 0 + cluster_labels_list = [] + num_cluster_list = [] + for raw_states in raw_states_list: + # iterate through raw states from measurements + if len(raw_states) == num_raw_states: + # build array with power values to cluster these + value_to_cluster = np.zeros((num_raw_states, 2)) + j = 0 + for s in raw_states: + value_to_cluster[j][0] = s[2] + value_to_cluster[j][1] = 0 + j = j + 1 + # linked = linkage(value_to_cluster, 'single') + # + # labelList = range(1, 11) + # + # plt.figure(figsize=(10, 7)) + # dendrogram(linked, + # orientation='top', + # distance_sort='descending', + # show_leaf_counts=True) + # plt.show() + # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER + # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', + # linkage='ward', distance_threshold=opt_refinement_thresh) + cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward') + cluster.fit_predict(value_to_cluster) + print_info("Cluster labels:\n" + str(cluster.labels_)) + # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') + # plt.show() + # TODO: Problem: Der Algorithmus nummeriert die Zustände nicht immer gleich... also bspw.: + # mal ist das tatsächliche Transmit mit 1 belabelt und mal mit 3 + cluster_labels_list.append(cluster.labels_) + num_cluster_list.append(cluster.n_clusters_) + i = i + 1 + if i != len(raw_states_list): + print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + + " Measurements for state clustering. " + "Others did not recognize number of states correctly.") + num_states = np.argmax(np.bincount(num_cluster_list)) + resulting_sequence = [None] * num_raw_states + i = 0 + for x in resulting_sequence: + j = 0 + test_list = [] + for arr in cluster_labels_list: + if num_cluster_list[j] != num_states: + j = j + 1 + else: + test_list.append(arr[i]) + j = j + 1 + resulting_sequence[i] = np.argmax(np.bincount(test_list)) + i = i + 1 + print(resulting_sequence) + # TODO: TESTING PURPOSES exit() -- cgit v1.2.3 From 7dc6363cd7f17cf5a09f678da612e15a0e6bfbac Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Wed, 8 Jul 2020 14:08:13 +0200 Subject: bin/Proof_Of_Concept_PELT: Small bit of refactoring. 
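
The clustering step above assigns a label to every raw state by grouping similar mean power values, so that recurring states can be matched across measurements. A minimal sketch of the same idea; the n_clusters=None / distance_threshold variant corresponds to the commented-out call in the diff (a possible answer to the "automatic detection of number of clusters" TODO), and all power values and the threshold are invented. Ward linkage implies Euclidean distances, so the affinity argument is omitted here:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    mean_powers = np.array([1010.0, 998.0, 5020.0, 4990.0, 1005.0, 9900.0])
    # second, all-zero column mirrors the 2-D value_to_cluster array built above
    values_to_cluster = np.column_stack([mean_powers, np.zeros_like(mean_powers)])

    cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=500.0,
                                      linkage='ward')
    labels = cluster.fit_predict(values_to_cluster)
    print(labels, cluster.n_clusters_)   # states with similar mean power share a label
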
Fixed some pylint violations --- bin/Proof_Of_Concept_PELT.py | 222 +++++++++++++++++++++++-------------------- 1 file changed, 117 insertions(+), 105 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 80f7c04..dbcc7c1 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -11,7 +11,9 @@ import re from dfatool.dfatool import RawData from sklearn.cluster import AgglomerativeClustering -from scipy.cluster.hierarchy import dendrogram, linkage + + +# from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display # py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=10 --refinement_thresh=100 @@ -56,16 +58,15 @@ def get_bkps(algo, pen, q): return res -def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing', plotting=False): +def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing'): kneedle = KneeLocator(data_x, data_y, S=S, curve=curve, direction=direction) - if plotting: - kneedle.plot_knee() kneepoint = (kneedle.knee, kneedle.knee_y) return kneepoint -def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, - refresh_thresh=5, S=1.0, pen_override=None, pen_modifier=None, plotting=False): +def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, + refresh_delay=1, refresh_thresh=5, S=1.0, pen_override=None, pen_modifier=None, + plotting=False): # default params in Function if model is None: model = 'l1' @@ -116,16 +117,16 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, while True: if result.ready(): break + + size = q.qsize() + last_percentage = percentage + percentage = round(size / (range_max - range_min) * 100, 2) + if percentage >= last_percentage + 2 or i >= refresh_thresh: + print_info('Current progress: ' + str(percentage) + '%') + i = 0 else: - size = q.qsize() - last_percentage = percentage - percentage = round(size / (range_max - range_min) * 100, 2) - if percentage >= last_percentage + 2 or i >= refresh_thresh: - print_info('Current progress: ' + str(percentage) + '%') - i = 0 - else: - i += 1 - time.sleep(refresh_delay) + i += 1 + time.sleep(refresh_delay) res = result.get() print_info("Finished kneepoint calculation.") # DECIDE WHICH PENALTY VALUE TO CHOOSE ACCORDING TO ELBOW/KNEE APPROACH @@ -133,7 +134,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, pen_val = [x[0] for x in res] fitted_bkps_val = [x[1] for x in res] # # plot to look at res - knee = find_knee_point(pen_val, fitted_bkps_val, S=S, plotting=plotting) + knee = find_knee_point(pen_val, fitted_bkps_val, S=S) # plt.xlabel('Penalty') # plt.ylabel('Number of Changepoints') # plt.plot(pen_val, fitted_bkps_val) @@ -156,72 +157,73 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, fig, ax = rpt.display(signal, bkps) plt.show() return bkps - else: - print_error('With the current thresh-hold S=' + str(S) + ' it is not possible to select a penalty value.') - exit() - -# very short benchmark yielded approx. 
1/3 of speed compared to solution with sorting -def needs_refinement_no_sort(signal, mean, thresh): - # linear search for the top 10%/ bottom 10% - # should be sufficient - length_of_signal = len(signal) - percentile_size = int() - percentile_size = length_of_signal // 100 - upper_percentile = [None] * percentile_size - lower_percentile = [None] * percentile_size - fill_index_upper = percentile_size - 1 - fill_index_lower = percentile_size - 1 - index_smallest_val = fill_index_upper - index_largest_val = fill_index_lower - - for x in signal: - if x > mean: - # will be in upper percentile - if fill_index_upper >= 0: - upper_percentile[fill_index_upper] = x - if x < upper_percentile[index_smallest_val]: - index_smallest_val = fill_index_upper - fill_index_upper = fill_index_upper - 1 - continue - - if x > upper_percentile[index_smallest_val]: - # replace smallest val. Find next smallest val - upper_percentile[index_smallest_val] = x - index_smallest_val = 0 - i = 0 - for y in upper_percentile: - if upper_percentile[i] < upper_percentile[index_smallest_val]: - index_smallest_val = i - i = i + 1 + print_error('With the current thresh-hold S=' + str(S) + + ' it is not possible to select a penalty value.') + sys.exit() - else: - if fill_index_lower >= 0: - lower_percentile[fill_index_lower] = x - if x > lower_percentile[index_largest_val]: - index_largest_val = fill_index_upper - fill_index_lower = fill_index_lower - 1 - continue - if x < lower_percentile[index_largest_val]: - # replace smallest val. Find next smallest val - lower_percentile[index_largest_val] = x - index_largest_val = 0 - i = 0 - for y in lower_percentile: - if lower_percentile[i] > lower_percentile[index_largest_val]: - index_largest_val = i - i = i + 1 - # should have the percentiles - lower_percentile_mean = np.mean(lower_percentile) - upper_percentile_mean = np.mean(upper_percentile) - dist = mean - lower_percentile_mean - if dist > thresh: - return True - dist = upper_percentile_mean - mean - if dist > thresh: - return True - return False +# very short benchmark yielded approx. 1/3 of speed compared to solution with sorting +# def needs_refinement_no_sort(signal, mean, thresh): +# # linear search for the top 10%/ bottom 10% +# # should be sufficient +# length_of_signal = len(signal) +# percentile_size = int() +# percentile_size = length_of_signal // 100 +# upper_percentile = [None] * percentile_size +# lower_percentile = [None] * percentile_size +# fill_index_upper = percentile_size - 1 +# fill_index_lower = percentile_size - 1 +# index_smallest_val = fill_index_upper +# index_largest_val = fill_index_lower +# +# for x in signal: +# if x > mean: +# # will be in upper percentile +# if fill_index_upper >= 0: +# upper_percentile[fill_index_upper] = x +# if x < upper_percentile[index_smallest_val]: +# index_smallest_val = fill_index_upper +# fill_index_upper = fill_index_upper - 1 +# continue +# +# if x > upper_percentile[index_smallest_val]: +# # replace smallest val. Find next smallest val +# upper_percentile[index_smallest_val] = x +# index_smallest_val = 0 +# i = 0 +# for y in upper_percentile: +# if upper_percentile[i] < upper_percentile[index_smallest_val]: +# index_smallest_val = i +# i = i + 1 +# +# else: +# if fill_index_lower >= 0: +# lower_percentile[fill_index_lower] = x +# if x > lower_percentile[index_largest_val]: +# index_largest_val = fill_index_upper +# fill_index_lower = fill_index_lower - 1 +# continue +# if x < lower_percentile[index_largest_val]: +# # replace smallest val. 
Find next smallest val +# lower_percentile[index_largest_val] = x +# index_largest_val = 0 +# i = 0 +# for y in lower_percentile: +# if lower_percentile[i] > lower_percentile[index_largest_val]: +# index_largest_val = i +# i = i + 1 +# +# # should have the percentiles +# lower_percentile_mean = np.mean(lower_percentile) +# upper_percentile_mean = np.mean(upper_percentile) +# dist = mean - lower_percentile_mean +# if dist > thresh: +# return True +# dist = upper_percentile_mean - mean +# if dist > thresh: +# return True +# return False # Very short benchmark yielded approx. 3 times the speed of solution not using sort @@ -245,22 +247,22 @@ def needs_refinement(signal, thresh): return False -def print_info(str): - str_lst = str.split(sep='\n') - for str in str_lst: - print("[INFO]" + str) +def print_info(str_to_prt): + str_lst = str_to_prt.split(sep='\n') + for str_prt in str_lst: + print("[INFO]" + str_prt) -def print_warning(str): - str_lst = str.split(sep='\n') - for str in str_lst: - print("[WARNING]" + str) +def print_warning(str_to_prt): + str_lst = str_to_prt.split(sep='\n') + for str_prt in str_lst: + print("[WARNING]" + str_prt) -def print_error(str): - str_lst = str.split(sep='\n') - for str in str_lst: - print("[ERROR]" + str, file=sys.stderr) +def print_error(str_to_prt): + str_lst = str_to_prt.split(sep='\n') + for str_prt in str_lst: + print("[ERROR]" + str_prt, file=sys.stderr) if __name__ == '__main__': @@ -389,7 +391,8 @@ if __name__ == '__main__': # OPENING DATA if ".json" in opt_filename: # open file with trace data from json - print_info(" Will only refine the state which is present in " + opt_filename + " if necessary.") + print_info( + " Will only refine the state which is present in " + opt_filename + " if necessary.") with open(opt_filename, 'r') as f: states = json.load(f) # loop through all traces check if refinement is necessary @@ -410,7 +413,8 @@ if __name__ == '__main__': refine = True break if not refine: - print_info("No refinement necessary for state '" + measurements_by_state['name'] + "'") + print_info("No refinement necessary for state '" + measurements_by_state['name'] + + "'") else: # calc and save all bkpts for the given state and param config raw_states_list = list() @@ -419,14 +423,16 @@ if __name__ == '__main__': normed_signal = np.zeros(shape=len(signal)) for i in range(0, len(signal)): normed_signal[i] = signal[i] / 1000 - bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, range_max=opt_range_max, - num_processes=opt_num_processes, jump=opt_jump, S=opt_S, - pen_override=opt_pen_override, pen_modifier=opt_pen_modifier) + bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, + range_max=opt_range_max, num_processes=opt_num_processes, + jump=opt_jump, S=opt_S, pen_override=opt_pen_override, + pen_modifier=opt_pen_modifier) calced_states = list() start_time = 0 end_time = 0 for bkpt in bkpts: - # start_time of state is end_time of previous one(Transitions are instantaneous) + # start_time of state is end_time of previous one + # (Transitions are instantaneous) start_time = end_time end_time = bkpt power_vals = signal[start_time: end_time] @@ -437,13 +443,16 @@ if __name__ == '__main__': num = 0 new_avg_std = 0 for s in calced_states: - print_info("State " + str(num) + " starts at t=" + str(s[0]) + " and ends at t=" + str(s[1]) - + " while using " + str(s[2]) + "uW with sigma=" + str(s[3])) + print_info("State " + str(num) + " starts at t=" + str(s[0]) + + " and ends at t=" + str(s[1]) + + " while using " + 
str(s[2]) + + "uW with sigma=" + str(s[3])) num = num + 1 new_avg_std = new_avg_std + s[3] new_avg_std = new_avg_std / len(calced_states) change_avg_std = measurement['uW_std'] - new_avg_std - print_info("The average standard deviation for the newly found states is " + str(new_avg_std)) + print_info("The average standard deviation for the newly found states is " + + str(new_avg_std)) print_info("That is a reduction of " + str(change_avg_std)) raw_states_list.append(calced_states) num_states_array = [int()] * len(raw_states_list) @@ -453,12 +462,14 @@ if __name__ == '__main__': i = i + 1 avg_num_states = np.mean(num_states_array) num_states_dev = np.std(num_states_array) - print_info("On average " + str(avg_num_states) + " States have been found. The standard deviation" + print_info("On average " + str(avg_num_states) + + " States have been found. The standard deviation" + " is " + str(num_states_dev)) # TODO: MAGIC NUMBER if num_states_dev > 1: - print_warning("The number of states varies strongly across measurements. Consider choosing a " - "larger value for S or using the pen_modifier option.") + print_warning("The number of states varies strongly across measurements." + " Consider choosing a larger value for S or using the pen_modifier" + " option.") time.sleep(5) # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? # Einfach Durchschnitt nehmen? @@ -492,7 +503,8 @@ if __name__ == '__main__': # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', # linkage='ward', distance_threshold=opt_refinement_thresh) - cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward') + cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', + linkage='ward') cluster.fit_predict(value_to_cluster) print_info("Cluster labels:\n" + str(cluster.labels_)) # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') @@ -523,7 +535,7 @@ if __name__ == '__main__': print(resulting_sequence) # TODO: TESTING PURPOSES - exit() + sys.exit() elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From 56c0cd63af5e34fc2e3da64018155f715825a343 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Wed, 8 Jul 2020 17:29:56 +0200 Subject: bin/Proof_Of_Concept_PELT: Trennen von Penalty-Berechnung und PELT -> Pro Messreihe/Paramkonig. nur noch einmaliges bestimmen der Penalty. Bestimmen der Penalty über KNEEDLE Ab Kneepoint dann suche nach Plateau -> Wahl der Mitte des Plateaus. Automatisches Clustern funktioniert jetzt auch scheinbar für alle Messreihen aus TX.json. Mit anderen nicht getestet. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 193 +++++++++++++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 54 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index dbcc7c1..0e63b78 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -1,16 +1,16 @@ -import matplotlib.pyplot as plt import json -from kneed import KneeLocator -import ruptures as rpt import time -from multiprocessing import Pool, Manager -import numpy as np import sys import getopt import re -from dfatool.dfatool import RawData - +from multiprocessing import Pool, Manager +from kneed import KneeLocator from sklearn.cluster import AgglomerativeClustering +from scipy.signal import find_peaks +import matplotlib.pyplot as plt +import ruptures as rpt +import numpy as np +from dfatool.dfatool import RawData # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display @@ -19,8 +19,8 @@ from sklearn.cluster import AgglomerativeClustering def plot_data_from_json(filename, trace_num, x_axis, y_axis): - with open(filename, 'r') as f: - tx_data = json.load(f) + with open(filename, 'r') as file: + tx_data = json.load(file) print(tx_data[trace_num]['parameter']) plt.plot(tx_data[trace_num]['offline'][0]['uW']) plt.xlabel(x_axis) @@ -64,12 +64,38 @@ def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing return kneepoint -def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, - refresh_delay=1, refresh_thresh=5, S=1.0, pen_override=None, pen_modifier=None, - plotting=False): +def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): + # default params in Function + if model is None: + model = "l1" + if jump is None: + jump = 5 + if min_dist is None: + min_dist = 2 + if plotting is None: + plotting = False + # change point detection. best fit seemingly with l1. rbf prods. RuntimeErr for pen > 30 + # https://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/costs/index.html + # model = "l1" #"l1" # "l2", "rbf" + algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal) + + if penalty is not None: + bkps = algo.predict(pen=penalty) + if plotting: + fig, ax = rpt.display(signal, bkps) + plt.show() + return bkps + + print_error("No Penalty specified.") + sys.exit() + + +def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, range_max=50, + num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, + pen_modifier=None): # default params in Function if model is None: - model = 'l1' + model = "l1" if jump is None: jump = 5 if min_dist is None: @@ -86,8 +112,6 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, refresh_thresh = 5 if S is None: S = 1.0 - if plotting is None: - plotting = False if pen_modifier is None: pen_modifier = 1 # change point detection. best fit seemingly with l1. rbf prods. 
RuntimeErr for pen > 30 @@ -96,7 +120,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, algo = rpt.Pelt(model=model, jump=jump, min_size=min_dist).fit(signal) ### CALC BKPS WITH DIFF PENALTYS - if pen_override is None and range_max != range_min: + if range_max != range_min: # building args array for parallelizing args = [] # for displaying progression @@ -106,7 +130,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, for i in range(range_min, range_max + 1): args.append((algo, i, q)) - print_info('starting kneepoint calculation.') + print_info("starting kneepoint calculation.") # init Pool with num_proesses with Pool(num_processes) as p: # collect results from pool @@ -122,7 +146,7 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, last_percentage = percentage percentage = round(size / (range_max - range_min) * 100, 2) if percentage >= last_percentage + 2 or i >= refresh_thresh: - print_info('Current progress: ' + str(percentage) + '%') + print_info("Current progress: " + str(percentage) + "%") i = 0 else: i += 1 @@ -135,31 +159,68 @@ def calc_pelt(signal, model='l1', jump=5, min_dist=2, range_min=0, range_max=50, fitted_bkps_val = [x[1] for x in res] # # plot to look at res knee = find_knee_point(pen_val, fitted_bkps_val, S=S) + + # TODO: Find plateau on pen_val vs fitted_bkps_val + # scipy.find_peaks() does not find plateaus if they extend through the end of the data. + # to counter that, add one extremely large value to the right side of the data + # after negating it is extremely small -> Almost certainly smaller than the + # found plateau therefore the plateau does not extend through the border -> scipy.find_peaks + # finds it. Choose value from within that plateau. + # fitted_bkps_val.append(100000000) + # TODO: Approaching over find_peaks might not work if the initial decrease step to the + # "correct" number of changepoints and additional decrease steps e.g. underfitting + # take place within the given penalty interval. find_peak only finds plateaus + # of peaks. If the number of chpts decreases after the wanted plateau the condition + # for local peaks is not satisfied anymore. Therefore this approach will only work + # if the plateau extends over the right border of the penalty interval. + # peaks, peak_plateaus = find_peaks(- np.array(fitted_bkps_val), plateau_size=1) + # Since the data is monotonously decreasing only one plateau can be found. 
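The find_peaks() based variant that this TODO block discusses (and leaves commented out) can be illustrated as follows; this is only a sketch of the stated idea, assuming a monotonically decreasing list of change-point counts, and plateau_midpoint() together with the sentinel value are illustrative placeholders rather than code from this patch.

    import numpy as np
    from scipy.signal import find_peaks

    def plateau_midpoint(fitted_bkps_val):
        # Append one extremely large value so that a plateau which extends
        # through the right border of the penalty range still has a falling
        # edge after negation and is therefore reported by find_peaks().
        padded = np.array(list(fitted_bkps_val) + [100000000])
        # After negation the lowest change-point counts form a flat local
        # maximum; plateau_size=1 makes find_peaks() report its edges.
        peaks, props = find_peaks(-padded, plateau_size=1)
        if len(peaks) == 0:
            return None
        # With monotonically decreasing data only one plateau is found.
        left, right = props["left_edges"][-1], props["right_edges"][-1]
        return left + (right - left) // 2

    # plateau_midpoint([50, 40, 12, 4, 4, 4, 4]) returns index 4,
    # the middle of the plateau of fours.

The patch itself instead scans fitted_bkps_val from the knee point onwards for the longest constant run and takes its midpoint.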
+ + # assuming the plateau is constant + start_index = -1 + end_index = -1 + longest_start = -1 + longest_end = -1 + prev_val = -1 + for i, num_bkpts in enumerate(fitted_bkps_val[knee[0]:]): + if num_bkpts != prev_val: + end_index = i - 1 + if end_index - start_index > longest_end - longest_start: + # currently found sequence is the longest found yet + longest_start = start_index + longest_end = end_index + start_index = i + if i == len(fitted_bkps_val[knee[0]:]) - 1: + # end sequence with last value + end_index = i + if end_index - start_index > longest_end - longest_start: + # last found sequence is the longest found yet + longest_start = start_index + longest_end = end_index + start_index = i + prev_val = num_bkpts # plt.xlabel('Penalty') # plt.ylabel('Number of Changepoints') # plt.plot(pen_val, fitted_bkps_val) - # plt.vlines(knee[0], 0, max(fitted_bkps_val), linestyles='dashed') - # print("knee: " + str(knee[0])) + # plt.vlines(longest_start + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') + # plt.vlines(longest_end + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') # plt.show() + # choosing pen from plateau + mid_of_plat = longest_start + (longest_end - longest_start) // 2 + knee = (mid_of_plat + knee[0], fitted_bkps_val[mid_of_plat + knee[0]]) + # modify knee according to options. Defaults to 1 * knee knee = (knee[0] * pen_modifier, knee[1]) + else: - # use forced pen value for plotting if specified. Else use only pen in range - if pen_override is not None: - knee = (pen_override, None) - else: - knee = (range_min, None) - print_info("" + str(knee[0]) + " has been selected as kneepoint.") - # plt.plot(pen_val, fittet_bkps_val) + # range_min == range_max. has the same effect as pen_override + knee = (range_min, None) + print_info(str(knee[0]) + " has been selected as kneepoint.") if knee[0] is not None: - bkps = algo.predict(pen=knee[0]) - if plotting: - fig, ax = rpt.display(signal, bkps) - plt.show() - return bkps + return knee - print_error('With the current thresh-hold S=' + str(S) - + ' it is not possible to select a penalty value.') + print_error("With the current thresh-hold S=" + str(S) + + " it is not possible to select a penalty value.") sys.exit() @@ -265,6 +326,14 @@ def print_error(str_to_prt): print("[ERROR]" + str_prt, file=sys.stderr) +def norm_signal(signal): + # TODO: maybe refine normalisation of signal + normed_signal = np.zeros(shape=len(signal)) + for i, signal_i in enumerate(signal): + normed_signal[i] = signal_i / 1000 + return normed_signal + + if __name__ == '__main__': # OPTION RECOGNITION opt = dict() @@ -414,19 +483,28 @@ if __name__ == '__main__': break if not refine: print_info("No refinement necessary for state '" + measurements_by_state['name'] - + "'") + + "' with params: " + str(measurements_by_state['parameter'])) else: + # assume that all measurements of the same param configuration are fundamentally + # similar -> calculate penalty for first measurement, use it for all + if opt_pen_override is None: + signal = np.array(measurements_by_state['offline'][0]['uW']) + normed_signal = norm_signal(signal) + penalty = calculate_penalty_value(normed_signal, model=opt_model, + range_min=opt_range_min, + range_max=opt_range_max, + num_processes=opt_num_processes, + jump=opt_jump, S=opt_S, + pen_modifier=opt_pen_modifier) + penalty = penalty[0] + else: + penalty = opt_pen_override # calc and save all bkpts for the given state and param config raw_states_list = list() for measurement in measurements_by_state['offline']: signal = 
np.array(measurement['uW']) - normed_signal = np.zeros(shape=len(signal)) - for i in range(0, len(signal)): - normed_signal[i] = signal[i] / 1000 - bkpts = calc_pelt(normed_signal, model=opt_model, range_min=opt_range_min, - range_max=opt_range_max, num_processes=opt_num_processes, - jump=opt_jump, S=opt_S, pen_override=opt_pen_override, - pen_modifier=opt_pen_modifier) + normed_signal = norm_signal(signal) + bkpts = calc_pelt(normed_signal, penalty, model=opt_model, jump=opt_jump) calced_states = list() start_time = 0 end_time = 0 @@ -468,8 +546,8 @@ if __name__ == '__main__': # TODO: MAGIC NUMBER if num_states_dev > 1: print_warning("The number of states varies strongly across measurements." - " Consider choosing a larger value for S or using the pen_modifier" - " option.") + " Consider choosing a larger value for S or using the " + "pen_modifier option.") time.sleep(5) # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? # Einfach Durchschnitt nehmen? @@ -501,10 +579,10 @@ if __name__ == '__main__': # show_leaf_counts=True) # plt.show() # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER - # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', - # linkage='ward', distance_threshold=opt_refinement_thresh) - cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', - linkage='ward') + cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', + linkage='ward', distance_threshold=opt_refinement_thresh*100) + # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', + # linkage='ward') cluster.fit_predict(value_to_cluster) print_info("Cluster labels:\n" + str(cluster.labels_)) # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') @@ -515,9 +593,19 @@ if __name__ == '__main__': num_cluster_list.append(cluster.n_clusters_) i = i + 1 if i != len(raw_states_list): - print_info("Used " + str(i) + "/" + str(len(raw_states_list)) - + " Measurements for state clustering. " - "Others did not recognize number of states correctly.") + if i / len(raw_states_list) <= 0.5: + print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) + + " Measurements for refinement. " + "Others did not recognize number of states correctly." + "\nYou should verify the integrity of the measurements.") + else: + print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + + " Measurements for refinement. " + "Others did not recognize number of states correctly.") + sys.exit() + else: + print_info("Used all available measurements.") + num_states = np.argmax(np.bincount(num_cluster_list)) resulting_sequence = [None] * num_raw_states i = 0 @@ -534,9 +622,6 @@ if __name__ == '__main__': i = i + 1 print(resulting_sequence) - # TODO: TESTING PURPOSES - sys.exit() - elif ".tar" in opt_filename: # open with dfatool raw_data_args = list() -- cgit v1.2.3 From 71b981c13c007d33f4042823703f98e41ff56770 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 10 Jul 2020 14:19:44 +0200 Subject: Proof_Of-Concept_PELT: Fixed Typo --- bin/Proof_Of_Concept_PELT.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 0e63b78..92d09fa 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -579,6 +579,7 @@ if __name__ == '__main__': # show_leaf_counts=True) # plt.show() # TODO: Automatic detection of number of clusters. 
Aktuell noch MAGIC NUMBER + # im distance_threshold cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', linkage='ward', distance_threshold=opt_refinement_thresh*100) # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', -- cgit v1.2.3 From 98a7873ec1ce265e6d229af4fa8416b3a9ef018a Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 10 Jul 2020 16:19:08 +0200 Subject: bin/Proof_Of_Concept_PELT.py: Calculation of raw_states is now parallelized. --- bin/Proof_Of_Concept_PELT.py | 121 ++++++++++++++++++++++++++++--------------- 1 file changed, 79 insertions(+), 42 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 92d09fa..bcbd53e 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -6,7 +6,6 @@ import re from multiprocessing import Pool, Manager from kneed import KneeLocator from sklearn.cluster import AgglomerativeClustering -from scipy.signal import find_peaks import matplotlib.pyplot as plt import ruptures as rpt import numpy as np @@ -287,6 +286,50 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, # return False +# raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model +# , opt_jump)) +def calc_raw_states_func(num_trace, measurement, penalty, model, jump): + signal = np.array(measurement['uW']) + normed_signal = norm_signal(signal) + bkpts = calc_pelt(normed_signal, penalty, model=model, jump=jump) + calced_states = list() + start_time = 0 + end_time = 0 + for bkpt in bkpts: + # start_time of state is end_time of previous one + # (Transitions are instantaneous) + start_time = end_time + end_time = bkpt + power_vals = signal[start_time: end_time] + mean_power = np.mean(power_vals) + std_dev = np.std(power_vals) + calced_state = (start_time, end_time, mean_power, std_dev) + calced_states.append(calced_state) + num = 0 + new_avg_std = 0 + for s in calced_states: + # print_info("State " + str(num) + " starts at t=" + str(s[0]) + # + " and ends at t=" + str(s[1]) + # + " while using " + str(s[2]) + # + "uW with sigma=" + str(s[3])) + num = num + 1 + new_avg_std = new_avg_std + s[3] + new_avg_std = new_avg_std / len(calced_states) + change_avg_std = measurement['uW_std'] - new_avg_std + # print_info("The average standard deviation for the newly found states is " + # + str(new_avg_std)) + # print_info("That is a reduction of " + str(change_avg_std)) + return num_trace, calced_states, new_avg_std, change_avg_std + + +def calc_raw_states(arg_list, num_processes=8): + m = Manager() + with Pool(processes=num_processes) as p: + # collect results from pool + result = p.starmap(calc_raw_states_func, arg_list) + return result + + # Very short benchmark yielded approx. 
3 times the speed of solution not using sort # TODO: Decide whether median is really the better baseline than mean def needs_refinement(signal, thresh): @@ -477,10 +520,9 @@ if __name__ == '__main__': signal = measurement['uW'] # mean = measurement['uW_mean'] # TODO: Decide if median is really the better baseline than mean - if needs_refinement(signal, opt_refinement_thresh): + if needs_refinement(signal, opt_refinement_thresh) and not refine: print_info("Refinement is necessary!") refine = True - break if not refine: print_info("No refinement necessary for state '" + measurements_by_state['name'] + "' with params: " + str(measurements_by_state['parameter'])) @@ -499,45 +541,34 @@ if __name__ == '__main__': penalty = penalty[0] else: penalty = opt_pen_override - # calc and save all bkpts for the given state and param config - raw_states_list = list() - for measurement in measurements_by_state['offline']: - signal = np.array(measurement['uW']) - normed_signal = norm_signal(signal) - bkpts = calc_pelt(normed_signal, penalty, model=opt_model, jump=opt_jump) - calced_states = list() - start_time = 0 - end_time = 0 - for bkpt in bkpts: - # start_time of state is end_time of previous one - # (Transitions are instantaneous) - start_time = end_time - end_time = bkpt - power_vals = signal[start_time: end_time] - mean_power = np.mean(power_vals) - std_dev = np.std(power_vals) - calced_state = (start_time, end_time, mean_power, std_dev) - calced_states.append(calced_state) - num = 0 - new_avg_std = 0 - for s in calced_states: - print_info("State " + str(num) + " starts at t=" + str(s[0]) - + " and ends at t=" + str(s[1]) - + " while using " + str(s[2]) - + "uW with sigma=" + str(s[3])) - num = num + 1 - new_avg_std = new_avg_std + s[3] - new_avg_std = new_avg_std / len(calced_states) - change_avg_std = measurement['uW_std'] - new_avg_std - print_info("The average standard deviation for the newly found states is " - + str(new_avg_std)) + # build arguments for parallel excecution + print_info("Starting raw_states calculation.") + raw_states_calc_args = [] + for num_measurement, measurement in enumerate(measurements_by_state['offline']): + raw_states_calc_args.append((num_measurement, measurement, penalty, + opt_model, opt_jump)) + + raw_states_list = [None] * len(measurements_by_state['offline']) + raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) + # extracting result and putting it in correct order -> index of raw_states_list + # entry still corresponds with index of measurement in measurements_by_states + # -> If measurements are discarded the correct ones are easily recognized + for ret_val in raw_states_res: + num_trace = ret_val[0] + raw_states = ret_val[1] + avg_std = ret_val[2] + change_avg_std = ret_val[3] + # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch + # int sein oder nicht? Es scheint auch vernünftig zu klappen... + raw_states_list[num_trace] = raw_states + print_info("The average standard deviation for the newly found states in " + + "measurement No. 
" + str(num_trace) + " is " + str(avg_std)) print_info("That is a reduction of " + str(change_avg_std)) - raw_states_list.append(calced_states) + print_info("Finished raw_states calculation.") num_states_array = [int()] * len(raw_states_list) i = 0 - for x in raw_states_list: + for i, x in enumerate(raw_states_list): num_states_array[i] = len(x) - i = i + 1 avg_num_states = np.mean(num_states_array) num_states_dev = np.std(num_states_array) print_info("On average " + str(avg_num_states) @@ -558,7 +589,7 @@ if __name__ == '__main__': i = 0 cluster_labels_list = [] num_cluster_list = [] - for raw_states in raw_states_list: + for num_trace, raw_states in enumerate(raw_states_list): # iterate through raw states from measurements if len(raw_states) == num_raw_states: # build array with power values to cluster these @@ -580,12 +611,14 @@ if __name__ == '__main__': # plt.show() # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER # im distance_threshold - cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='euclidean', - linkage='ward', distance_threshold=opt_refinement_thresh*100) + cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, + affinity='euclidean', + linkage='ward', + distance_threshold=opt_refinement_thresh * 100) # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', # linkage='ward') cluster.fit_predict(value_to_cluster) - print_info("Cluster labels:\n" + str(cluster.labels_)) + # print_info("Cluster labels:\n" + str(cluster.labels_)) # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') # plt.show() # TODO: Problem: Der Algorithmus nummeriert die Zustände nicht immer gleich... also bspw.: @@ -593,6 +626,9 @@ if __name__ == '__main__': cluster_labels_list.append(cluster.labels_) num_cluster_list.append(cluster.n_clusters_) i = i + 1 + else: + print_info("Discarding measurement No. " + str(num_trace) + " because it " + + "did not recognize the number of raw_states correctly.") if i != len(raw_states_list): if i / len(raw_states_list) <= 0.5: print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) @@ -603,6 +639,7 @@ if __name__ == '__main__': print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + " Measurements for refinement. " "Others did not recognize number of states correctly.") + # TODO: DEBUG Kram sys.exit() else: print_info("Used all available measurements.") -- cgit v1.2.3 From 4dc7c23ada35fc2b64685f1eb9df18a5104aaa2c Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 10 Jul 2020 20:02:07 +0200 Subject: bin/Proof_Of_Concept_PELT: exits haben jetzt errorcodes. 
Anfang für Vereinheitlichung der Labels ist gemacht MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 89 +++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 25 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index bcbd53e..dde99d8 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -3,7 +3,7 @@ import time import sys import getopt import re -from multiprocessing import Pool, Manager +from multiprocessing import Pool, Manager, cpu_count from kneed import KneeLocator from sklearn.cluster import AgglomerativeClustering import matplotlib.pyplot as plt @@ -86,7 +86,7 @@ def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): return bkps print_error("No Penalty specified.") - sys.exit() + sys.exit(-1) def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, range_max=50, @@ -220,7 +220,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, print_error("With the current thresh-hold S=" + str(S) + " it is not possible to select a penalty value.") - sys.exit() + sys.exit(-1) # very short benchmark yielded approx. 1/3 of speed compared to solution with sorting @@ -405,7 +405,7 @@ if __name__ == '__main__': opt_min_dist = None opt_range_min = None opt_range_max = None - opt_num_processes = None + opt_num_processes = cpu_count() opt_refresh_delay = None opt_refresh_thresh = None opt_S = None @@ -422,7 +422,7 @@ if __name__ == '__main__': if 'filename' not in opt: print_error("No file specified!") - sys.exit(2) + sys.exit(-1) else: opt_filename = opt['filename'] if 'v' in opt: @@ -435,70 +435,70 @@ if __name__ == '__main__': opt_jump = int(opt['jump']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'min_dist' in opt: try: opt_min_dist = int(opt['min_dist']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'range_min' in opt: try: opt_range_min = int(opt['range_min']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'range_max' in opt: try: opt_range_max = int(opt['range_max']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'num_processes' in opt: try: opt_num_processes = int(opt['num_processes']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'refresh_delay' in opt: try: opt_refresh_delay = int(opt['refresh_delay']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'refresh_thresh' in opt: try: opt_refresh_thresh = int(opt['refresh_thresh']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'S' in opt: try: opt_S = float(opt['S']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'pen_override' in opt: try: opt_pen_override = int(opt['pen_override']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'pen_modifier' in opt: try: opt_pen_modifier = float(opt['pen_modifier']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) if 'refinement_thresh' in opt: try: opt_refinement_thresh = int(opt['refinement_thresh']) except ValueError as verr: print(verr, file=sys.stderr) - sys.exit(2) + sys.exit(-1) except getopt.GetoptError as err: print(err, file=sys.stderr) - sys.exit(2) + 
sys.exit(-1) # OPENING DATA if ".json" in opt_filename: @@ -623,8 +623,8 @@ if __name__ == '__main__': # plt.show() # TODO: Problem: Der Algorithmus nummeriert die Zustände nicht immer gleich... also bspw.: # mal ist das tatsächliche Transmit mit 1 belabelt und mal mit 3 - cluster_labels_list.append(cluster.labels_) - num_cluster_list.append(cluster.n_clusters_) + cluster_labels_list.append((num_trace, cluster.labels_)) + num_cluster_list.append((num_trace, cluster.n_clusters_)) i = i + 1 else: print_info("Discarding measurement No. " + str(num_trace) + " because it " @@ -640,18 +640,55 @@ if __name__ == '__main__': + " Measurements for refinement. " "Others did not recognize number of states correctly.") # TODO: DEBUG Kram - sys.exit() + sys.exit(0) else: print_info("Used all available measurements.") - num_states = np.argmax(np.bincount(num_cluster_list)) + num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) + avg_per_state_list = [None] * len(cluster_labels_list) + used_clusters = 0 + for number, (num_trace, labels) in enumerate(cluster_labels_list): + if num_cluster_list[number][1] == num_states: + avg_per_state = [0] * num_states + count_per_state = [0] * num_states + raw_states = raw_states_list[num_trace] + for num_label, label in enumerate(labels): + count_per_state[label] = count_per_state[label] + 1 + avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] + for i,_ in enumerate(avg_per_state): + avg_per_state[i] = avg_per_state[i] / count_per_state[i] + avg_per_state_list[number] = avg_per_state + used_clusters = used_clusters + 1 + + # flattend version for clustering: + values_to_cluster = np.zeros((num_states * used_clusters, 2)) + index = 0 + for avg_per_state in avg_per_state_list: + if None not in avg_per_state: + for avg in avg_per_state: + values_to_cluster[index][0] = avg + values_to_cluster[index][1] = 0 + index = index + 1 + # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) + # plt.show() + cluster = AgglomerativeClustering(n_clusters=num_states) + cluster.fit_predict(values_to_cluster) + # HIER WEITER: + # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die + # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. + # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting + # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. + resulting_sequence = [None] * num_raw_states i = 0 for x in resulting_sequence: j = 0 test_list = [] - for arr in cluster_labels_list: - if num_cluster_list[j] != num_states: + for arr in [elem[1] for elem in cluster_labels_list]: + if num_cluster_list[j][1] != num_states: + # hopefully this does not happen regularly + print_info("Discarding measurement " + str(j) + + " because the clustering yielded not matching results.") j = j + 1 else: test_list.append(arr[i]) @@ -659,6 +696,7 @@ if __name__ == '__main__': resulting_sequence[i] = np.argmax(np.bincount(test_list)) i = i + 1 print(resulting_sequence) + sys.exit() elif ".tar" in opt_filename: # open with dfatool @@ -670,11 +708,12 @@ if __name__ == '__main__': print_info("Preprocessing file. Depending on its size, this could take a while.") preprocessed_data = raw_data.get_preprocessed_data() print_info("File fully preprocessed") - # TODO: Mal schauen, wie ich das mache. Erstmal nur mit json + print_error("Not implemented yet. 
Please generate .json files first with dfatool and use" + " those.") else: print_error("Unknown dataformat") - sys.exit(2) + sys.exit(-1) # print(tx_data[1]['parameter']) # # parse json to array for PELT -- cgit v1.2.3 From 5b5fb3103d8305eed9d6828858509013bfe60e97 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sat, 11 Jul 2020 17:23:03 +0200 Subject: Removed black from gitlab-ci for the moment --- .gitlab-ci.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 52d6e1c..f397fcd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,13 +3,6 @@ image: debian:bullseye stages: - test -lint_python: - stage: test - script: - - apt-get update -qy - - apt-get install -y black - - black --check --diff bin - run_tests: stage: test script: -- cgit v1.2.3 From e1f0618fb04e42b7d3e49055af83f58a803b28b8 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sat, 11 Jul 2020 17:24:09 +0200 Subject: bin/Proof_of_Concept_PELT.py: Vereinheitlichen der Zustandsbezeichner für eine Paramkonfig funktioniert jetzt sehr gut. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 98 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 22 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index dde99d8..0d5be54 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -504,17 +504,18 @@ if __name__ == '__main__': if ".json" in opt_filename: # open file with trace data from json print_info( - " Will only refine the state which is present in " + opt_filename + " if necessary.") + "Will only refine the state which is present in " + opt_filename + " if necessary.") with open(opt_filename, 'r') as f: - states = json.load(f) + configurations = json.load(f) # loop through all traces check if refinement is necessary - print_info("Checking if refinement is necessary...") - for measurements_by_state in states: + resulting_sequence_list = [] + for num_config, measurements_by_configuration in enumerate(configurations): # loop through all occurrences of the looked at state - print_info("Looking at state '" + measurements_by_state['name'] + "' with params: " - + str(measurements_by_state['parameter'])) + print_info("Looking at state '" + measurements_by_configuration['name'] + "' with params: " + + str(measurements_by_configuration['parameter'])) refine = False - for measurement in measurements_by_state['offline']: + print_info("Checking if refinement is necessary...") + for measurement in measurements_by_configuration['offline']: # loop through measurements of particular state # an check if state needs refinement signal = measurement['uW'] @@ -524,13 +525,13 @@ if __name__ == '__main__': print_info("Refinement is necessary!") refine = True if not refine: - print_info("No refinement necessary for state '" + measurements_by_state['name'] - + "' with params: " + str(measurements_by_state['parameter'])) + print_info("No refinement necessary for state '" + measurements_by_configuration['name'] + + "' with params: " + str(measurements_by_configuration['parameter'])) else: # assume that all measurements of the same param configuration are fundamentally # similar -> calculate penalty for first measurement, use it for all if opt_pen_override is None: - signal = np.array(measurements_by_state['offline'][0]['uW']) + signal = np.array(measurements_by_configuration['offline'][0]['uW']) normed_signal = norm_signal(signal) penalty = 
calculate_penalty_value(normed_signal, model=opt_model, range_min=opt_range_min, @@ -544,11 +545,11 @@ if __name__ == '__main__': # build arguments for parallel excecution print_info("Starting raw_states calculation.") raw_states_calc_args = [] - for num_measurement, measurement in enumerate(measurements_by_state['offline']): + for num_measurement, measurement in enumerate(measurements_by_configuration['offline']): raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model, opt_jump)) - raw_states_list = [None] * len(measurements_by_state['offline']) + raw_states_list = [None] * len(measurements_by_configuration['offline']) raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) # extracting result and putting it in correct order -> index of raw_states_list # entry still corresponds with index of measurement in measurements_by_states @@ -629,6 +630,7 @@ if __name__ == '__main__': else: print_info("Discarding measurement No. " + str(num_trace) + " because it " + "did not recognize the number of raw_states correctly.") + num_used_measurements = len(raw_states_list) if i != len(raw_states_list): if i / len(raw_states_list) <= 0.5: print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) @@ -639,6 +641,7 @@ if __name__ == '__main__': print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + " Measurements for refinement. " "Others did not recognize number of states correctly.") + num_used_measurements = i # TODO: DEBUG Kram sys.exit(0) else: @@ -655,16 +658,24 @@ if __name__ == '__main__': for num_label, label in enumerate(labels): count_per_state[label] = count_per_state[label] + 1 avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] - for i,_ in enumerate(avg_per_state): + for i, _ in enumerate(avg_per_state): avg_per_state[i] = avg_per_state[i] / count_per_state[i] avg_per_state_list[number] = avg_per_state used_clusters = used_clusters + 1 - + else: + # hopefully this does not happen regularly + print_info("Discarding measurement " + str(number) + + " because the clustering yielded not matching results.") + num_used_measurements = num_used_measurements - 1 + if num_used_measurements == 0: + print_error("Something went terribly wrong. Discarded all measurements.") + # continue + sys.exit(-1) # flattend version for clustering: values_to_cluster = np.zeros((num_states * used_clusters, 2)) index = 0 for avg_per_state in avg_per_state_list: - if None not in avg_per_state: + if avg_per_state is not None: for avg in avg_per_state: values_to_cluster[index][0] = avg values_to_cluster[index][1] = 0 @@ -673,30 +684,73 @@ if __name__ == '__main__': # plt.show() cluster = AgglomerativeClustering(n_clusters=num_states) cluster.fit_predict(values_to_cluster) - # HIER WEITER: # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. - + new_labels_list = [] + new_labels = [] + i = 0 + for label in cluster.labels_: + new_labels.append(label) + i = i + 1 + if i == num_states: + new_labels_list.append(new_labels) + new_labels = [] + i = 0 + # only the selected measurements are present in new_labels. 
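The relabeling step here can be summarised as: cluster the per-state mean powers of all usable measurements in a single pass, so that physically identical states end up with identical labels across measurements. A self-contained sketch of that idea, where unify_state_labels() is a hypothetical helper and avg_per_state_list is assumed to contain only measurements whose own clustering yielded num_states clusters (as in the surrounding code):

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    def unify_state_labels(avg_per_state_list, num_states):
        # One row per (measurement, state type): mean power plus a dummy
        # second coordinate, mirroring the values_to_cluster array above.
        flat = np.array([[avg, 0.0]
                         for avgs in avg_per_state_list for avg in avgs])
        joint_labels = AgglomerativeClustering(n_clusters=num_states).fit_predict(flat)
        # Split the flat label array back into one label list per measurement.
        return [list(joint_labels[i:i + num_states])
                for i in range(0, len(joint_labels), num_states)]

    # Two measurements with num_states=2 state types each:
    # unify_state_labels([[10.1, 55.0], [9.9, 54.8]], 2)
    # -> both low-power entries share one label, both high-power entries the other.

In the patch the joint labels are then written back into cluster_labels_list, so that the later majority vote over the state sequence compares like with like.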
+ # new_labels_index should not be incremented, if not selected_measurement is skipped + new_labels_index = 0 + # cluster_labels_list contains all measurements -> if measurement is skipped + # still increment the index + index = 0 + for elem in avg_per_state_list: + if elem is not None: + for number, label in enumerate(cluster_labels_list[index][1]): + cluster_labels_list[index][1][number] = \ + new_labels_list[new_labels_index][label] + new_labels_index = new_labels_index + 1 + else: + # override not selected measurement labels to avoid choosing the wrong ones. + for number, label in enumerate(cluster_labels_list[index][1]): + cluster_labels_list[index][1][number] = -1 + index = index + 1 resulting_sequence = [None] * num_raw_states i = 0 + confidence = 0 for x in resulting_sequence: j = 0 test_list = [] for arr in [elem[1] for elem in cluster_labels_list]: if num_cluster_list[j][1] != num_states: - # hopefully this does not happen regularly - print_info("Discarding measurement " + str(j) - + " because the clustering yielded not matching results.") j = j + 1 else: + if -1 in arr: + print_error("Bei Janis beschweren! Fehler beim Umbenennen der" + " Zustände wahrscheinlich.") + sys.exit(-1) test_list.append(arr[i]) j = j + 1 - resulting_sequence[i] = np.argmax(np.bincount(test_list)) + bincount = np.bincount(test_list) + resulting_sequence[i] = np.argmax(bincount) + confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) i = i + 1 + confidence = confidence / len(resulting_sequence) + print_info("Confidence of resulting sequence is " + str(confidence) + + " while using " + str(num_used_measurements) + "/" + + str(len(raw_states_list)) + " measurements.") print(resulting_sequence) - sys.exit() + resulting_sequence_list.append((num_config, resulting_sequence)) + # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat + # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die + # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen + # auftreten. + # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. + # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: + # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche + # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, + # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines + # Zustands ja nicht mehr ändern. elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From a00ffc0e32ddc72a8faceec4344432cdbf3b90c7 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 16 Jul 2020 16:34:20 +0200 Subject: bin/Proof_Of_Concept_PELT: kleine kosmetische Änderungen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 0d5be54..7726f53 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -160,11 +160,11 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, knee = find_knee_point(pen_val, fitted_bkps_val, S=S) # TODO: Find plateau on pen_val vs fitted_bkps_val - # scipy.find_peaks() does not find plateaus if they extend through the end of the data. 
- # to counter that, add one extremely large value to the right side of the data - # after negating it is extremely small -> Almost certainly smaller than the - # found plateau therefore the plateau does not extend through the border -> scipy.find_peaks - # finds it. Choose value from within that plateau. + # scipy.find_peaks() does not find plateaus if they extend through the end of the data. + # to counter that, add one extremely large value to the right side of the data + # after negating it is extremely small -> Almost certainly smaller than the + # found plateau therefore the plateau does not extend through the border + # -> scipy.find_peaks finds it. Choose value from within that plateau. # fitted_bkps_val.append(100000000) # TODO: Approaching over find_peaks might not work if the initial decrease step to the # "correct" number of changepoints and additional decrease steps e.g. underfitting @@ -331,7 +331,6 @@ def calc_raw_states(arg_list, num_processes=8): # Very short benchmark yielded approx. 3 times the speed of solution not using sort -# TODO: Decide whether median is really the better baseline than mean def needs_refinement(signal, thresh): sorted_signal = sorted(signal) length_of_signal = len(signal) @@ -509,29 +508,28 @@ if __name__ == '__main__': configurations = json.load(f) # loop through all traces check if refinement is necessary resulting_sequence_list = [] - for num_config, measurements_by_configuration in enumerate(configurations): + for num_config, measurements_by_config in enumerate(configurations): # loop through all occurrences of the looked at state - print_info("Looking at state '" + measurements_by_configuration['name'] + "' with params: " - + str(measurements_by_configuration['parameter'])) + print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " + + str(measurements_by_config['parameter'])) refine = False print_info("Checking if refinement is necessary...") - for measurement in measurements_by_configuration['offline']: + for measurement in measurements_by_config['offline']: # loop through measurements of particular state # an check if state needs refinement signal = measurement['uW'] # mean = measurement['uW_mean'] - # TODO: Decide if median is really the better baseline than mean if needs_refinement(signal, opt_refinement_thresh) and not refine: print_info("Refinement is necessary!") refine = True if not refine: - print_info("No refinement necessary for state '" + measurements_by_configuration['name'] - + "' with params: " + str(measurements_by_configuration['parameter'])) + print_info("No refinement necessary for state '" + measurements_by_config['name'] + + "' with params: " + str(measurements_by_config['parameter'])) else: # assume that all measurements of the same param configuration are fundamentally # similar -> calculate penalty for first measurement, use it for all if opt_pen_override is None: - signal = np.array(measurements_by_configuration['offline'][0]['uW']) + signal = np.array(measurements_by_config['offline'][0]['uW']) normed_signal = norm_signal(signal) penalty = calculate_penalty_value(normed_signal, model=opt_model, range_min=opt_range_min, @@ -545,11 +543,11 @@ if __name__ == '__main__': # build arguments for parallel excecution print_info("Starting raw_states calculation.") raw_states_calc_args = [] - for num_measurement, measurement in enumerate(measurements_by_configuration['offline']): + for num_measurement, measurement in enumerate(measurements_by_config['offline']): 
raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model, opt_jump)) - raw_states_list = [None] * len(measurements_by_configuration['offline']) + raw_states_list = [None] * len(measurements_by_config['offline']) raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) # extracting result and putting it in correct order -> index of raw_states_list # entry still corresponds with index of measurement in measurements_by_states @@ -622,8 +620,6 @@ if __name__ == '__main__': # print_info("Cluster labels:\n" + str(cluster.labels_)) # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') # plt.show() - # TODO: Problem: Der Algorithmus nummeriert die Zustände nicht immer gleich... also bspw.: - # mal ist das tatsächliche Transmit mit 1 belabelt und mal mit 3 cluster_labels_list.append((num_trace, cluster.labels_)) num_cluster_list.append((num_trace, cluster.n_clusters_)) i = i + 1 @@ -739,7 +735,7 @@ if __name__ == '__main__': print_info("Confidence of resulting sequence is " + str(confidence) + " while using " + str(num_used_measurements) + "/" + str(len(raw_states_list)) + " measurements.") - print(resulting_sequence) + #print(resulting_sequence) resulting_sequence_list.append((num_config, resulting_sequence)) # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die @@ -750,7 +746,10 @@ if __name__ == '__main__': # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines - # Zustands ja nicht mehr ändern. + # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... + for num_config, sequence in resulting_sequence_list: + print_info("NO. config:" + str(num_config)) + print_info(sequence) elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From e15ac967c7e9b1b9f781ee9478f3b1e723d6177a Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Thu, 16 Jul 2020 16:41:19 +0200 Subject: Proof_Of_Concept_PELT: Fixed imports after merge --- bin/Proof_Of_Concept_PELT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 7726f53..de47d4a 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -9,7 +9,7 @@ from sklearn.cluster import AgglomerativeClustering import matplotlib.pyplot as plt import ruptures as rpt import numpy as np -from dfatool.dfatool import RawData +from dfatool.loader import RawData # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display -- cgit v1.2.3 From bf49cf3ccee8c6d3c91c6a2ac81d7923a35b198e Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Mon, 20 Jul 2020 23:48:21 +0200 Subject: bin/Proof_Of_Concept_PELT: Parametrisierung von raw_states sollte eigentlich vernünftig klappen. Für mindestens TX klappt das aber nicht. 
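In English, this commit parametrises the raw states: per-state mean power and duration are aggregated per parameter configuration and handed to dfatool for parameter fitting; per the subject this should work in principle, but does not yet for the TX data. The aggregate built in the patch below has roughly the following shape, shown with made-up parameter names and values for two raw states and three configurations; the dfatool calls mirror the ones used in the patch:

    from dfatool.utils import by_name_to_by_param
    from dfatool.model import ParallelParamFit
    from dfatool import parameters

    # Illustrative aggregate: one entry per raw state, one power/duration value
    # per usable parameter configuration (all names and numbers are made up).
    param_names = ["datarate", "txbytes", "txpower"]
    param_list = [[100, 16, 2], [100, 32, 2], [250, 16, 10]]
    by_name = {
        "state_0": {
            "param": param_list,
            "power": [9800.0, 9750.0, 10100.0],  # mean power per config, uW
            "duration": [1200, 1180, 1210],      # mean duration per config, trace samples
            "attributes": ["power", "duration"],
        },
        "state_1": {
            "param": param_list,
            "power": [54100.0, 60200.0, 78900.0],
            "duration": [2500, 4900, 2450],
            "attributes": ["power", "duration"],
        },
    }
    by_param = by_name_to_by_param(by_name)
    stats = parameters.ParamStats(by_name, by_param, param_names, dict())
    paramfit = ParallelParamFit(by_param)

Configurations whose number of raw states deviates from the majority are skipped, which is why the patch warns about a possible correlation between parameters and automaton structure.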
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 694 ++++++++++++++++++++++++++++--------------- 1 file changed, 456 insertions(+), 238 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index de47d4a..75cdce6 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -1,4 +1,5 @@ import json +import os import time import sys import getopt @@ -9,7 +10,12 @@ from sklearn.cluster import AgglomerativeClustering import matplotlib.pyplot as plt import ruptures as rpt import numpy as np + +from dfatool.functions import analytic from dfatool.loader import RawData +from dfatool import parameters +from dfatool.model import ParallelParamFit +from dfatool.utils import by_name_to_by_param # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display @@ -396,6 +402,8 @@ if __name__ == '__main__': "pen_modifier= " "plotting= " "refinement_thresh= " + "cache_dicts " + "cache_loc= " ) opt_filename = None opt_verbose = False @@ -412,6 +420,7 @@ if __name__ == '__main__': opt_pen_modifier = None opt_plotting = False opt_refinement_thresh = None + opt_cache_loc = None try: raw_opts, args = getopt.getopt(sys.argv[1:], "", optspec.split(" ")) @@ -495,6 +504,12 @@ if __name__ == '__main__': except ValueError as verr: print(verr, file=sys.stderr) sys.exit(-1) + if 'cache_dicts' in opt: + if 'cache_loc' in opt: + opt_cache_loc = opt['cache_loc'] + else: + print_error("If \"cache_dicts\" is set, \"cache_loc\" must be provided.") + sys.exit(-1) except getopt.GetoptError as err: print(err, file=sys.stderr) sys.exit(-1) @@ -506,250 +521,453 @@ if __name__ == '__main__': "Will only refine the state which is present in " + opt_filename + " if necessary.") with open(opt_filename, 'r') as f: configurations = json.load(f) + + # for i in range(0, 7): + # signal = np.array(configurations[i]['offline'][0]['uW']) + # plt.plot(signal) + # plt.xlabel('Time [us]') + # plt.ylabel('Power [mW]') + # plt.show() + # sys.exit() + # loop through all traces check if refinement is necessary - resulting_sequence_list = [] - for num_config, measurements_by_config in enumerate(configurations): - # loop through all occurrences of the looked at state - print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " - + str(measurements_by_config['parameter'])) - refine = False - print_info("Checking if refinement is necessary...") - for measurement in measurements_by_config['offline']: - # loop through measurements of particular state - # an check if state needs refinement - signal = measurement['uW'] - # mean = measurement['uW_mean'] - if needs_refinement(signal, opt_refinement_thresh) and not refine: - print_info("Refinement is necessary!") - refine = True - if not refine: - print_info("No refinement necessary for state '" + measurements_by_config['name'] - + "' with params: " + str(measurements_by_config['parameter'])) + # resulting_sequence_list = [] + # search for param_names, by_param and by_name files + by_param_file = None + by_name_file = None + param_names_file = None + if opt_cache_loc is not None: + flag = False + by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") + by_param_loc = os.path.join(opt_cache_loc, "by_param.txt") + param_names_loc = os.path.join(opt_cache_loc, "param_names.txt") + if os.path.isfile(by_name_loc) and os.path.getsize(by_name_loc) > 0: + by_name_file = open(by_name_loc, "r") + else: + print_error("In " + opt_cache_loc + " is no 
by_name.txt.") + flag = True + if os.path.isfile(by_param_loc) and os.path.getsize(by_param_loc) > 0: + by_param_file = open(by_param_loc, "r") + else: + print_error("In " + opt_cache_loc + " is no by_param.txt.") + flag = True + if os.path.isfile(param_names_loc) and os.path.getsize(param_names_loc) > 0: + param_names_file = open(param_names_loc, "r") else: - # assume that all measurements of the same param configuration are fundamentally - # similar -> calculate penalty for first measurement, use it for all - if opt_pen_override is None: - signal = np.array(measurements_by_config['offline'][0]['uW']) - normed_signal = norm_signal(signal) - penalty = calculate_penalty_value(normed_signal, model=opt_model, - range_min=opt_range_min, - range_max=opt_range_max, - num_processes=opt_num_processes, - jump=opt_jump, S=opt_S, - pen_modifier=opt_pen_modifier) - penalty = penalty[0] + print_error("In " + opt_cache_loc + " is no param_names.txt.") + flag = True + if flag: + print_info("The cache will be build.") + + if None in (by_param_file, by_name_file, param_names_file): + state_durations_by_config = [] + state_consumptions_by_config = [] + for num_config, measurements_by_config in enumerate(configurations): + # loop through all occurrences of the looked at state + print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " + + str(measurements_by_config['parameter']) + "(" + str(num_config + 1) + "/" + + str(len(configurations)) + ")") + refine = False + print_info("Checking if refinement is necessary...") + for measurement in measurements_by_config['offline']: + # loop through measurements of particular state + # an check if state needs refinement + signal = measurement['uW'] + # mean = measurement['uW_mean'] + if needs_refinement(signal, opt_refinement_thresh) and not refine: + print_info("Refinement is necessary!") + refine = True + if not refine: + print_info("No refinement necessary for state '" + measurements_by_config['name'] + + "' with params: " + str(measurements_by_config['parameter'])) else: - penalty = opt_pen_override - # build arguments for parallel excecution - print_info("Starting raw_states calculation.") - raw_states_calc_args = [] - for num_measurement, measurement in enumerate(measurements_by_config['offline']): - raw_states_calc_args.append((num_measurement, measurement, penalty, - opt_model, opt_jump)) - - raw_states_list = [None] * len(measurements_by_config['offline']) - raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) - # extracting result and putting it in correct order -> index of raw_states_list - # entry still corresponds with index of measurement in measurements_by_states - # -> If measurements are discarded the correct ones are easily recognized - for ret_val in raw_states_res: - num_trace = ret_val[0] - raw_states = ret_val[1] - avg_std = ret_val[2] - change_avg_std = ret_val[3] - # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch - # int sein oder nicht? Es scheint auch vernünftig zu klappen... - raw_states_list[num_trace] = raw_states - print_info("The average standard deviation for the newly found states in " - + "measurement No. 
" + str(num_trace) + " is " + str(avg_std)) - print_info("That is a reduction of " + str(change_avg_std)) - print_info("Finished raw_states calculation.") - num_states_array = [int()] * len(raw_states_list) - i = 0 - for i, x in enumerate(raw_states_list): - num_states_array[i] = len(x) - avg_num_states = np.mean(num_states_array) - num_states_dev = np.std(num_states_array) - print_info("On average " + str(avg_num_states) - + " States have been found. The standard deviation" - + " is " + str(num_states_dev)) - # TODO: MAGIC NUMBER - if num_states_dev > 1: - print_warning("The number of states varies strongly across measurements." - " Consider choosing a larger value for S or using the " - "pen_modifier option.") - time.sleep(5) - # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? - # Einfach Durchschnitt nehmen? - # Preliminary decision: Further on only use the traces, which have the most frequent state count - counts = np.bincount(num_states_array) - num_raw_states = np.argmax(counts) - print_info("Choose " + str(num_raw_states) + " as number of raw_states.") - i = 0 - cluster_labels_list = [] - num_cluster_list = [] - for num_trace, raw_states in enumerate(raw_states_list): - # iterate through raw states from measurements - if len(raw_states) == num_raw_states: - # build array with power values to cluster these - value_to_cluster = np.zeros((num_raw_states, 2)) - j = 0 - for s in raw_states: - value_to_cluster[j][0] = s[2] - value_to_cluster[j][1] = 0 - j = j + 1 - # linked = linkage(value_to_cluster, 'single') - # - # labelList = range(1, 11) - # - # plt.figure(figsize=(10, 7)) - # dendrogram(linked, - # orientation='top', - # distance_sort='descending', - # show_leaf_counts=True) - # plt.show() - # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER - # im distance_threshold - cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, - affinity='euclidean', - linkage='ward', - distance_threshold=opt_refinement_thresh * 100) - # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', - # linkage='ward') - cluster.fit_predict(value_to_cluster) - # print_info("Cluster labels:\n" + str(cluster.labels_)) - # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') - # plt.show() - cluster_labels_list.append((num_trace, cluster.labels_)) - num_cluster_list.append((num_trace, cluster.n_clusters_)) - i = i + 1 + # assume that all measurements of the same param configuration are fundamentally + # similar -> calculate penalty for first measurement, use it for all + if opt_pen_override is None: + signal = np.array(measurements_by_config['offline'][0]['uW']) + normed_signal = norm_signal(signal) + penalty = calculate_penalty_value(normed_signal, model=opt_model, + range_min=opt_range_min, + range_max=opt_range_max, + num_processes=opt_num_processes, + jump=opt_jump, S=opt_S, + pen_modifier=opt_pen_modifier) + penalty = penalty[0] else: - print_info("Discarding measurement No. " + str(num_trace) + " because it " - + "did not recognize the number of raw_states correctly.") - num_used_measurements = len(raw_states_list) - if i != len(raw_states_list): - if i / len(raw_states_list) <= 0.5: - print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) - + " Measurements for refinement. " - "Others did not recognize number of states correctly." 
- "\nYou should verify the integrity of the measurements.") + penalty = opt_pen_override + # build arguments for parallel excecution + print_info("Starting raw_states calculation.") + raw_states_calc_args = [] + for num_measurement, measurement in enumerate(measurements_by_config['offline']): + raw_states_calc_args.append((num_measurement, measurement, penalty, + opt_model, opt_jump)) + + raw_states_list = [None] * len(measurements_by_config['offline']) + raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) + # extracting result and putting it in correct order -> index of raw_states_list + # entry still corresponds with index of measurement in measurements_by_states + # -> If measurements are discarded the correct ones are easily recognized + for ret_val in raw_states_res: + num_trace = ret_val[0] + raw_states = ret_val[1] + avg_std = ret_val[2] + change_avg_std = ret_val[3] + # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch + # int sein oder nicht? Es scheint auch vernünftig zu klappen... + raw_states_list[num_trace] = raw_states + print_info("The average standard deviation for the newly found states in " + + "measurement No. " + str(num_trace) + " is " + str(avg_std)) + print_info("That is a reduction of " + str(change_avg_std)) + print_info("Finished raw_states calculation.") + num_states_array = [int()] * len(raw_states_list) + i = 0 + for i, x in enumerate(raw_states_list): + num_states_array[i] = len(x) + avg_num_states = np.mean(num_states_array) + num_states_dev = np.std(num_states_array) + print_info("On average " + str(avg_num_states) + + " States have been found. The standard deviation" + + " is " + str(num_states_dev)) + # TODO: MAGIC NUMBER + if num_states_dev > 1: + print_warning("The number of states varies strongly across measurements." + " Consider choosing a larger value for S or using the " + "pen_modifier option.") + time.sleep(5) + # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? + # Einfach Durchschnitt nehmen? + # Preliminary decision: Further on only use the traces, which have the most + # frequent state count + counts = np.bincount(num_states_array) + num_raw_states = np.argmax(counts) + print_info("Choose " + str(num_raw_states) + " as number of raw_states.") + # iterate through all found breakpoints and determine start and end points as well + # as power consumption + states_duration_list = [0] * num_raw_states + states_consumption_list = [0] * num_raw_states + num_used_measurements = 0 + for num_trace, raw_states in enumerate(raw_states_list): + if len(raw_states) == num_raw_states: + num_used_measurements = num_used_measurements + 1 + # calced_state = (start_time, end_time, mean_power, std_dev) + for num_state, s in enumerate(raw_states): + state_duration = s[1] - s[0] + state_consumption = s[2] + states_duration_list[num_state] = \ + states_duration_list[num_state] + state_duration + states_consumption_list[num_state] = \ + states_consumption_list[num_state] + state_consumption + else: + print_info("Discarding measurement No. 
" + str(num_trace) + " because it " + + "did not recognize the number of raw_states correctly.") + for i, x in enumerate(states_duration_list): + states_duration_list[i] = x / num_used_measurements + for i, x in enumerate(states_consumption_list): + states_consumption_list[i] = x / num_used_measurements + if num_used_measurements != len(raw_states_list): + if num_used_measurements / len(raw_states_list) <= 0.5: + print_warning("Only used " + str(num_used_measurements) + "/" + + str(len(raw_states_list)) + " Measurements for refinement. " + + "Others did not recognize number of states correctly." + + "\nYou should verify the integrity of the measurements.") + else: + print_info("Used " + str(num_used_measurements) + "/" + + str(len(raw_states_list)) + " Measurements for refinement. " + + "Others did not recognize number of states correctly.") + num_used_measurements = i + # TODO: DEBUG Kram + sys.exit(0) else: - print_info("Used " + str(i) + "/" + str(len(raw_states_list)) - + " Measurements for refinement. " - "Others did not recognize number of states correctly.") - num_used_measurements = i - # TODO: DEBUG Kram - sys.exit(0) + print_info("Used all available measurements.") + + state_durations_by_config.append((num_config, states_duration_list)) + state_consumptions_by_config.append((num_config, states_consumption_list)) + # # TODO: + # if num_config == 6: + # print("BRECHE AUS") + # break + + # combine all state durations and consumptions to parametrized model + + # this is only necessary because at this state only linear automatons can be modeled. + num_states_array = [int()] * len(state_consumptions_by_config) + for i, (_, states_consumption_list) in enumerate(state_consumptions_by_config): + num_states_array[i] = len(states_consumption_list) + counts = np.bincount(num_states_array) + num_raw_states = np.argmax(counts) + usable_configs = len(state_consumptions_by_config) + # param_list identical for each raw_state + # TODO: Kann man die echt einfach rausziehen aus der json? Ich hab sie nicht gefunden... + # Nur für jede Messung. Aber da sind die ja ohnehin identisch. + param_list = [] + param_names = configurations[0]['offline_aggregates']['paramkeys'][0] + print_info("param_names: " + str(param_names)) + for num_config, states_consumption_list in state_consumptions_by_config: + if len(states_consumption_list) != num_raw_states: + print_warning("Config No." + str(num_config) + " not usable yet due to different " + + "number of states. This hints a correlation between parameters and " + + "the structure of the resulting automaton. 
This will be possibly be" + + " supported in a future version of this tool.") + usable_configs = usable_configs - 1 else: - print_info("Used all available measurements.") - - num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) - avg_per_state_list = [None] * len(cluster_labels_list) - used_clusters = 0 - for number, (num_trace, labels) in enumerate(cluster_labels_list): - if num_cluster_list[number][1] == num_states: - avg_per_state = [0] * num_states - count_per_state = [0] * num_states - raw_states = raw_states_list[num_trace] - for num_label, label in enumerate(labels): - count_per_state[label] = count_per_state[label] + 1 - avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] - for i, _ in enumerate(avg_per_state): - avg_per_state[i] = avg_per_state[i] / count_per_state[i] - avg_per_state_list[number] = avg_per_state - used_clusters = used_clusters + 1 - else: - # hopefully this does not happen regularly - print_info("Discarding measurement " + str(number) - + " because the clustering yielded not matching results.") - num_used_measurements = num_used_measurements - 1 - if num_used_measurements == 0: - print_error("Something went terribly wrong. Discarded all measurements.") - # continue - sys.exit(-1) - # flattend version for clustering: - values_to_cluster = np.zeros((num_states * used_clusters, 2)) - index = 0 - for avg_per_state in avg_per_state_list: - if avg_per_state is not None: - for avg in avg_per_state: - values_to_cluster[index][0] = avg - values_to_cluster[index][1] = 0 - index = index + 1 - # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) - # plt.show() - cluster = AgglomerativeClustering(n_clusters=num_states) - cluster.fit_predict(values_to_cluster) - # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die - # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. - # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting - # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. - new_labels_list = [] - new_labels = [] - i = 0 - for label in cluster.labels_: - new_labels.append(label) - i = i + 1 - if i == num_states: - new_labels_list.append(new_labels) - new_labels = [] - i = 0 - # only the selected measurements are present in new_labels. - # new_labels_index should not be incremented, if not selected_measurement is skipped - new_labels_index = 0 - # cluster_labels_list contains all measurements -> if measurement is skipped - # still increment the index - index = 0 - for elem in avg_per_state_list: - if elem is not None: - for number, label in enumerate(cluster_labels_list[index][1]): - cluster_labels_list[index][1][number] = \ - new_labels_list[new_labels_index][label] - new_labels_index = new_labels_index + 1 - else: - # override not selected measurement labels to avoid choosing the wrong ones. - for number, label in enumerate(cluster_labels_list[index][1]): - cluster_labels_list[index][1][number] = -1 - index = index + 1 - resulting_sequence = [None] * num_raw_states - i = 0 - confidence = 0 - for x in resulting_sequence: - j = 0 - test_list = [] - for arr in [elem[1] for elem in cluster_labels_list]: - if num_cluster_list[j][1] != num_states: - j = j + 1 - else: - if -1 in arr: - print_error("Bei Janis beschweren! 
Fehler beim Umbenennen der" - " Zustände wahrscheinlich.") - sys.exit(-1) - test_list.append(arr[i]) - j = j + 1 - bincount = np.bincount(test_list) - resulting_sequence[i] = np.argmax(bincount) - confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) - i = i + 1 - confidence = confidence / len(resulting_sequence) - print_info("Confidence of resulting sequence is " + str(confidence) - + " while using " + str(num_used_measurements) + "/" - + str(len(raw_states_list)) + " measurements.") - #print(resulting_sequence) - resulting_sequence_list.append((num_config, resulting_sequence)) - # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat - # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die - # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen - # auftreten. - # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. - # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: - # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche - # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, - # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines - # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... - for num_config, sequence in resulting_sequence_list: - print_info("NO. config:" + str(num_config)) - print_info(sequence) + param_list.append(configurations[num_config]['offline_aggregates']['param'][0]) + print_info("param_list: " + str(param_list)) + + if usable_configs == len(state_consumptions_by_config): + print_info("All configs usable.") + else: + print_info("Using only " + str(usable_configs) + " Configs.") + by_name = {} + for i in range(num_raw_states): + consumptions_for_state = [] + durations_for_state = [] + for j, (_, states_consumption_list) in enumerate(state_consumptions_by_config): + consumptions_for_state.append(states_consumption_list[i]) + durations_for_state.append(state_durations_by_config[j][1][i]) + name = "state_" + str(i) + state_dict = { + "param": param_list, + "power": consumptions_for_state, + "duration": durations_for_state, + "attributes": ["power", "duration"] + } + by_name[name] = state_dict + by_param = by_name_to_by_param(by_name) + if opt_cache_loc is not None: + by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") + by_param_loc = os.path.join(opt_cache_loc, "by_param.txt") + param_names_loc = os.path.join(opt_cache_loc, "param_names.txt") + f = open(by_name_loc, "w") + f.write(str(by_name)) + f.close() + f = open(by_param_loc, "w") + f.write(str(by_param)) + f.close() + f = open(param_names_loc, "w") + f.write(str(param_names)) + f.close() + else: + by_name_text = str(by_name_file.read()) + by_name = eval(by_name_text) + by_param_text = str(by_param_file.read()) + by_param = eval(by_param_text) + param_names_text = str(param_names_file.read()) + param_names = eval(param_names_text) + + # t = 0 + # last_pow = 0 + # for key in by_name.keys(): + # end_t = t + np.mean(by_name[key]["duration"]) + # power = np.mean(by_name[key]["power"]) + # plt.vlines(t, min(last_pow, power), max(last_pow, power)) + # plt.hlines(power, t, end_t) + # t = end_t + # last_pow = power + # plt.show() + stats = parameters.ParamStats(by_name, by_param, param_names, dict()) + paramfit = ParallelParamFit(by_param) + for state_name in by_name.keys(): + for num_param, param_name 
in enumerate(param_names): + if stats.depends_on_param(state_name, "power", param_name): + paramfit.enqueue(state_name, "power", num_param, param_name) + if stats.depends_on_param(state_name, "duration", param_name): + paramfit.enqueue(state_name, "duration", num_param, param_name) + print_info("State " + state_name + "s power depends on param " + param_name + ":" + + str(stats.depends_on_param(state_name, "power", param_name)) + ) + print_info("State " + state_name + "s duration depends on param " + param_name + ":" + + str(stats.depends_on_param(state_name, "duration", param_name)) + ) + paramfit.fit() + fit_res_dur_list = [] + fit_res_pow_list = [] + for state_name in by_name.keys(): + fit_power = paramfit.get_result(state_name, "power") + fit_duration = paramfit.get_result(state_name, "duration") + combined_fit_power = analytic.function_powerset(fit_power, param_names, 0) + combined_fit_duration = analytic.function_powerset(fit_duration, param_names, 0) + combined_fit_power.fit(by_param, state_name, "power") + if not combined_fit_power.fit_success: + print_warning("Fitting(power) for state " + state_name + " was not succesful!") + combined_fit_duration.fit(by_param, state_name, "duration") + if not combined_fit_duration.fit_success: + print_warning("Fitting(duration) for state " + state_name + " was not succesful!") + fit_res_pow_list.append(combined_fit_power) + fit_res_dur_list.append(combined_fit_duration) + + + # TODO: removed clustering (temporarily), since it provided too much dificultys + # at the current state + # i = 0 + # cluster_labels_list = [] + # num_cluster_list = [] + # for num_trace, raw_states in enumerate(raw_states_list): + # # iterate through raw states from measurements + # if len(raw_states) == num_raw_states: + # # build array with power values to cluster these + # value_to_cluster = np.zeros((num_raw_states, 2)) + # j = 0 + # for s in raw_states: + # value_to_cluster[j][0] = s[2] + # value_to_cluster[j][1] = 0 + # j = j + 1 + # # linked = linkage(value_to_cluster, 'single') + # # + # # labelList = range(1, 11) + # # + # # plt.figure(figsize=(10, 7)) + # # dendrogram(linked, + # # orientation='top', + # # distance_sort='descending', + # # show_leaf_counts=True) + # # plt.show() + # # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER + # # im distance_threshold + # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, + # affinity='euclidean', + # linkage='ward', + # distance_threshold=opt_refinement_thresh * 100) + # # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', + # # linkage='ward') + # cluster.fit_predict(value_to_cluster) + # # print_info("Cluster labels:\n" + str(cluster.labels_)) + # # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') + # # plt.show() + # cluster_labels_list.append((num_trace, cluster.labels_)) + # num_cluster_list.append((num_trace, cluster.n_clusters_)) + # i = i + 1 + # else: + # print_info("Discarding measurement No. " + str(num_trace) + " because it " + # + "did not recognize the number of raw_states correctly.") + # num_used_measurements = len(raw_states_list) + # if i != len(raw_states_list): + # if i / len(raw_states_list) <= 0.5: + # print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) + # + " Measurements for refinement. " + # "Others did not recognize number of states correctly." 
+ # "\nYou should verify the integrity of the measurements.") + # else: + # print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + # + " Measurements for refinement. " + # "Others did not recognize number of states correctly.") + # num_used_measurements = i + # # TODO: DEBUG Kram + # sys.exit(0) + # else: + # print_info("Used all available measurements.") + # + # num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) + # avg_per_state_list = [None] * len(cluster_labels_list) + # used_clusters = 0 + # for number, (num_trace, labels) in enumerate(cluster_labels_list): + # if num_cluster_list[number][1] == num_states: + # avg_per_state = [0] * num_states + # count_per_state = [0] * num_states + # raw_states = raw_states_list[num_trace] + # for num_label, label in enumerate(labels): + # count_per_state[label] = count_per_state[label] + 1 + # avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] + # for i, _ in enumerate(avg_per_state): + # avg_per_state[i] = avg_per_state[i] / count_per_state[i] + # avg_per_state_list[number] = avg_per_state + # used_clusters = used_clusters + 1 + # else: + # # hopefully this does not happen regularly + # print_info("Discarding measurement " + str(number) + # + " because the clustering yielded not matching results.") + # num_used_measurements = num_used_measurements - 1 + # if num_used_measurements == 0: + # print_error("Something went terribly wrong. Discarded all measurements.") + # # continue + # sys.exit(-1) + # # flattend version for clustering: + # values_to_cluster = np.zeros((num_states * used_clusters, 2)) + # index = 0 + # for avg_per_state in avg_per_state_list: + # if avg_per_state is not None: + # for avg in avg_per_state: + # values_to_cluster[index][0] = avg + # values_to_cluster[index][1] = 0 + # index = index + 1 + # # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) + # # plt.show() + # cluster = AgglomerativeClustering(n_clusters=num_states) + # cluster.fit_predict(values_to_cluster) + # # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die + # # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. + # # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting + # # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. + # new_labels_list = [] + # new_labels = [] + # i = 0 + # for label in cluster.labels_: + # new_labels.append(label) + # i = i + 1 + # if i == num_states: + # new_labels_list.append(new_labels) + # new_labels = [] + # i = 0 + # # only the selected measurements are present in new_labels. + # # new_labels_index should not be incremented, if not selected_measurement is skipped + # new_labels_index = 0 + # # cluster_labels_list contains all measurements -> if measurement is skipped + # # still increment the index + # index = 0 + # for elem in avg_per_state_list: + # if elem is not None: + # for number, label in enumerate(cluster_labels_list[index][1]): + # cluster_labels_list[index][1][number] = \ + # new_labels_list[new_labels_index][label] + # new_labels_index = new_labels_index + 1 + # else: + # # override not selected measurement labels to avoid choosing the wrong ones. 
+ # for number, label in enumerate(cluster_labels_list[index][1]): + # cluster_labels_list[index][1][number] = -1 + # index = index + 1 + # resulting_sequence = [None] * num_raw_states + # i = 0 + # confidence = 0 + # for x in resulting_sequence: + # j = 0 + # test_list = [] + # for arr in [elem[1] for elem in cluster_labels_list]: + # if num_cluster_list[j][1] != num_states: + # j = j + 1 + # else: + # if -1 in arr: + # print_error("Bei Janis beschweren! Fehler beim Umbenennen der" + # " Zustände wahrscheinlich.") + # sys.exit(-1) + # test_list.append(arr[i]) + # j = j + 1 + # bincount = np.bincount(test_list) + # resulting_sequence[i] = np.argmax(bincount) + # confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) + # i = i + 1 + # confidence = confidence / len(resulting_sequence) + # print_info("Confidence of resulting sequence is " + str(confidence) + # + " while using " + str(num_used_measurements) + "/" + # + str(len(raw_states_list)) + " measurements.") + # #print(resulting_sequence) + # resulting_sequence_list.append((num_config, resulting_sequence)) + # # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat + # # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die + # # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen + # # auftreten. + # # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. + # # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: + # # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche + # # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, + # # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines + # # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... + # for num_config, sequence in resulting_sequence_list: + # print_info("NO. config:" + str(num_config)) + # print_info(sequence) + # + # + # + # + elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From 61fb6094a33c4855c763f1925e61aec90294daa3 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Sun, 9 Aug 2020 15:11:42 +0200 Subject: Parametrisierung scheint vernünftig zu klappen. Vermutlich fertig. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 637 ++++++++++++++++++++++++++++--------------- 1 file changed, 418 insertions(+), 219 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 75cdce6..40c405d 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -14,13 +14,14 @@ import numpy as np from dfatool.functions import analytic from dfatool.loader import RawData from dfatool import parameters -from dfatool.model import ParallelParamFit +from dfatool.model import ParallelParamFit, PTAModel from dfatool.utils import by_name_to_by_param # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display -# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=10 --refinement_thresh=100 +# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 +# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 --cache_dicts --cache_loc="..\data\TX_cache" def plot_data_from_json(filename, trace_num, x_axis, y_axis): @@ -294,7 +295,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, # raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model # , opt_jump)) -def calc_raw_states_func(num_trace, measurement, penalty, model, jump): +def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): signal = np.array(measurement['uW']) normed_signal = norm_signal(signal) bkpts = calc_pelt(normed_signal, penalty, model=model, jump=jump) @@ -325,7 +326,7 @@ def calc_raw_states_func(num_trace, measurement, penalty, model, jump): # print_info("The average standard deviation for the newly found states is " # + str(new_avg_std)) # print_info("That is a reduction of " + str(change_avg_std)) - return num_trace, calced_states, new_avg_std, change_avg_std + return num_measurement, calced_states, new_avg_std, change_avg_std def calc_raw_states(arg_list, num_processes=8): @@ -382,6 +383,27 @@ def norm_signal(signal): return normed_signal +def norm_values_to_cluster(values_to_cluster): + new_vals = np.array(values_to_cluster) + num_samples = len(values_to_cluster) + num_params = len(values_to_cluster[0]) + for i in range(num_params): + param_vals = [] + for sample in new_vals: + param_vals.append(sample[i]) + max_val = np.max(np.abs(param_vals)) + for num_sample, sample in enumerate(new_vals): + values_to_cluster[num_sample][i] = sample[i] / max_val + return new_vals + + +def get_state_num(state_name, distinct_states): + for state_num, states in enumerate(distinct_states): + if state_name in states: + return state_num + return -1 + + if __name__ == '__main__': # OPTION RECOGNITION opt = dict() @@ -536,6 +558,7 @@ if __name__ == '__main__': by_param_file = None by_name_file = None param_names_file = None + from_cache = False if opt_cache_loc is not None: flag = False by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") @@ -558,6 +581,12 @@ if __name__ == '__main__': flag = True if flag: print_info("The cache will be build.") + else: + print_warning("THE OPTION \"cache_dicts\" IS FOR DEBUGGING PURPOSES ONLY! " + "\nDO NOT USE FOR REGULAR APPLICATIONS!" + "\nThe script will not run to the end properly." 
+ "\nNo final parametrization will be done.") + from_cache = True if None in (by_param_file, by_name_file, param_names_file): state_durations_by_config = [] @@ -565,7 +594,8 @@ if __name__ == '__main__': for num_config, measurements_by_config in enumerate(configurations): # loop through all occurrences of the looked at state print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " - + str(measurements_by_config['parameter']) + "(" + str(num_config + 1) + "/" + + str(measurements_by_config['parameter']) + "(" + str( + num_config + 1) + "/" + str(len(configurations)) + ")") refine = False print_info("Checking if refinement is necessary...") @@ -578,8 +608,9 @@ if __name__ == '__main__': print_info("Refinement is necessary!") refine = True if not refine: - print_info("No refinement necessary for state '" + measurements_by_config['name'] - + "' with params: " + str(measurements_by_config['parameter'])) + print_info( + "No refinement necessary for state '" + measurements_by_config['name'] + + "' with params: " + str(measurements_by_config['parameter'])) else: # assume that all measurements of the same param configuration are fundamentally # similar -> calculate penalty for first measurement, use it for all @@ -598,7 +629,8 @@ if __name__ == '__main__': # build arguments for parallel excecution print_info("Starting raw_states calculation.") raw_states_calc_args = [] - for num_measurement, measurement in enumerate(measurements_by_config['offline']): + for num_measurement, measurement in enumerate( + measurements_by_config['offline']): raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model, opt_jump)) @@ -608,15 +640,16 @@ if __name__ == '__main__': # entry still corresponds with index of measurement in measurements_by_states # -> If measurements are discarded the correct ones are easily recognized for ret_val in raw_states_res: - num_trace = ret_val[0] + num_measurement = ret_val[0] raw_states = ret_val[1] avg_std = ret_val[2] change_avg_std = ret_val[3] # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch # int sein oder nicht? Es scheint auch vernünftig zu klappen... - raw_states_list[num_trace] = raw_states + raw_states_list[num_measurement] = raw_states print_info("The average standard deviation for the newly found states in " - + "measurement No. " + str(num_trace) + " is " + str(avg_std)) + + "measurement No. 
" + str(num_measurement) + " is " + str( + avg_std)) print_info("That is a reduction of " + str(change_avg_std)) print_info("Finished raw_states calculation.") num_states_array = [int()] * len(raw_states_list) @@ -643,37 +676,46 @@ if __name__ == '__main__': print_info("Choose " + str(num_raw_states) + " as number of raw_states.") # iterate through all found breakpoints and determine start and end points as well # as power consumption - states_duration_list = [0] * num_raw_states - states_consumption_list = [0] * num_raw_states + num_measurements = len(raw_states_list) + states_duration_list = [list()] * num_raw_states + states_consumption_list = [list()] * num_raw_states + for num_elem, _ in enumerate(states_duration_list): + states_duration_list[num_elem] = [0] * num_measurements + states_consumption_list[num_elem] = [0] * num_measurements num_used_measurements = 0 - for num_trace, raw_states in enumerate(raw_states_list): + for num_measurement, raw_states in enumerate(raw_states_list): if len(raw_states) == num_raw_states: num_used_measurements = num_used_measurements + 1 - # calced_state = (start_time, end_time, mean_power, std_dev) for num_state, s in enumerate(raw_states): - state_duration = s[1] - s[0] - state_consumption = s[2] - states_duration_list[num_state] = \ - states_duration_list[num_state] + state_duration - states_consumption_list[num_state] = \ - states_consumption_list[num_state] + state_consumption + states_duration_list[num_state][num_measurement] = s[1] - s[0] + states_consumption_list[num_state][num_measurement] = s[2] + # calced_state = (start_time, end_time, mean_power, std_dev) + # for num_state, s in enumerate(raw_states): + # state_duration = s[1] - s[0] + # state_consumption = s[2] + # states_duration_list[num_state] = \ + # states_duration_list[num_state] + state_duration + # states_consumption_list[num_state] = \ + # states_consumption_list[num_state] + state_consumption else: - print_info("Discarding measurement No. " + str(num_trace) + " because it " - + "did not recognize the number of raw_states correctly.") - for i, x in enumerate(states_duration_list): - states_duration_list[i] = x / num_used_measurements - for i, x in enumerate(states_consumption_list): - states_consumption_list[i] = x / num_used_measurements + print_info("Discarding measurement No. " + str(num_measurement) + + " because it did not recognize the number of " + "raw_states correctly.") + # for i, x in enumerate(states_duration_list): + # states_duration_list[i] = x / num_used_measurements + # for i, x in enumerate(states_consumption_list): + # states_consumption_list[i] = x / num_used_measurements if num_used_measurements != len(raw_states_list): if num_used_measurements / len(raw_states_list) <= 0.5: print_warning("Only used " + str(num_used_measurements) + "/" - + str(len(raw_states_list)) + " Measurements for refinement. " + + str( + len(raw_states_list)) + " Measurements for refinement. " + "Others did not recognize number of states correctly." + "\nYou should verify the integrity of the measurements.") else: print_info("Used " + str(num_used_measurements) + "/" - + str(len(raw_states_list)) + " Measurements for refinement. " - + "Others did not recognize number of states correctly.") + + str(len(raw_states_list)) + " Measurements for refinement." 
+ + " Others did not recognize number of states correctly.") num_used_measurements = i # TODO: DEBUG Kram sys.exit(0) @@ -697,20 +739,19 @@ if __name__ == '__main__': num_raw_states = np.argmax(counts) usable_configs = len(state_consumptions_by_config) # param_list identical for each raw_state - # TODO: Kann man die echt einfach rausziehen aus der json? Ich hab sie nicht gefunden... - # Nur für jede Messung. Aber da sind die ja ohnehin identisch. param_list = [] param_names = configurations[0]['offline_aggregates']['paramkeys'][0] print_info("param_names: " + str(param_names)) for num_config, states_consumption_list in state_consumptions_by_config: if len(states_consumption_list) != num_raw_states: - print_warning("Config No." + str(num_config) + " not usable yet due to different " - + "number of states. This hints a correlation between parameters and " - + "the structure of the resulting automaton. This will be possibly be" - + " supported in a future version of this tool.") + print_warning( + "Config No." + str(num_config) + " not usable yet due to different " + + "number of states. This hints a correlation between parameters and " + + "the structure of the resulting automaton. This will be possibly" + + " supported in a future version of this tool.") usable_configs = usable_configs - 1 else: - param_list.append(configurations[num_config]['offline_aggregates']['param'][0]) + param_list.extend(configurations[num_config]['offline_aggregates']['param']) print_info("param_list: " + str(param_list)) if usable_configs == len(state_consumptions_by_config): @@ -722,16 +763,16 @@ if __name__ == '__main__': consumptions_for_state = [] durations_for_state = [] for j, (_, states_consumption_list) in enumerate(state_consumptions_by_config): - consumptions_for_state.append(states_consumption_list[i]) - durations_for_state.append(state_durations_by_config[j][1][i]) - name = "state_" + str(i) + consumptions_for_state.extend(states_consumption_list[i]) + durations_for_state.extend(state_durations_by_config[j][1][i]) + state_name = "state_" + str(i) state_dict = { "param": param_list, "power": consumptions_for_state, "duration": durations_for_state, "attributes": ["power", "duration"] } - by_name[name] = state_dict + by_name[state_name] = state_dict by_param = by_name_to_by_param(by_name) if opt_cache_loc is not None: by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") @@ -779,8 +820,8 @@ if __name__ == '__main__': + str(stats.depends_on_param(state_name, "duration", param_name)) ) paramfit.fit() - fit_res_dur_list = [] - fit_res_pow_list = [] + fit_res_dur_dict = {} + fit_res_pow_dict = {} for state_name in by_name.keys(): fit_power = paramfit.get_result(state_name, "power") fit_duration = paramfit.get_result(state_name, "duration") @@ -792,182 +833,340 @@ if __name__ == '__main__': combined_fit_duration.fit(by_param, state_name, "duration") if not combined_fit_duration.fit_success: print_warning("Fitting(duration) for state " + state_name + " was not succesful!") - fit_res_pow_list.append(combined_fit_power) - fit_res_dur_list.append(combined_fit_duration) - - - # TODO: removed clustering (temporarily), since it provided too much dificultys - # at the current state - # i = 0 - # cluster_labels_list = [] - # num_cluster_list = [] - # for num_trace, raw_states in enumerate(raw_states_list): - # # iterate through raw states from measurements - # if len(raw_states) == num_raw_states: - # # build array with power values to cluster these - # value_to_cluster = np.zeros((num_raw_states, 2)) - # j = 0 - # 
for s in raw_states: - # value_to_cluster[j][0] = s[2] - # value_to_cluster[j][1] = 0 - # j = j + 1 - # # linked = linkage(value_to_cluster, 'single') - # # - # # labelList = range(1, 11) - # # - # # plt.figure(figsize=(10, 7)) - # # dendrogram(linked, - # # orientation='top', - # # distance_sort='descending', - # # show_leaf_counts=True) - # # plt.show() - # # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER - # # im distance_threshold - # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, - # affinity='euclidean', - # linkage='ward', - # distance_threshold=opt_refinement_thresh * 100) - # # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', - # # linkage='ward') - # cluster.fit_predict(value_to_cluster) - # # print_info("Cluster labels:\n" + str(cluster.labels_)) - # # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') - # # plt.show() - # cluster_labels_list.append((num_trace, cluster.labels_)) - # num_cluster_list.append((num_trace, cluster.n_clusters_)) - # i = i + 1 - # else: - # print_info("Discarding measurement No. " + str(num_trace) + " because it " - # + "did not recognize the number of raw_states correctly.") - # num_used_measurements = len(raw_states_list) - # if i != len(raw_states_list): - # if i / len(raw_states_list) <= 0.5: - # print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) - # + " Measurements for refinement. " - # "Others did not recognize number of states correctly." - # "\nYou should verify the integrity of the measurements.") - # else: - # print_info("Used " + str(i) + "/" + str(len(raw_states_list)) - # + " Measurements for refinement. " - # "Others did not recognize number of states correctly.") - # num_used_measurements = i - # # TODO: DEBUG Kram - # sys.exit(0) - # else: - # print_info("Used all available measurements.") - # - # num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) - # avg_per_state_list = [None] * len(cluster_labels_list) - # used_clusters = 0 - # for number, (num_trace, labels) in enumerate(cluster_labels_list): - # if num_cluster_list[number][1] == num_states: - # avg_per_state = [0] * num_states - # count_per_state = [0] * num_states - # raw_states = raw_states_list[num_trace] - # for num_label, label in enumerate(labels): - # count_per_state[label] = count_per_state[label] + 1 - # avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] - # for i, _ in enumerate(avg_per_state): - # avg_per_state[i] = avg_per_state[i] / count_per_state[i] - # avg_per_state_list[number] = avg_per_state - # used_clusters = used_clusters + 1 - # else: - # # hopefully this does not happen regularly - # print_info("Discarding measurement " + str(number) - # + " because the clustering yielded not matching results.") - # num_used_measurements = num_used_measurements - 1 - # if num_used_measurements == 0: - # print_error("Something went terribly wrong. 
Discarded all measurements.") - # # continue - # sys.exit(-1) - # # flattend version for clustering: - # values_to_cluster = np.zeros((num_states * used_clusters, 2)) - # index = 0 - # for avg_per_state in avg_per_state_list: - # if avg_per_state is not None: - # for avg in avg_per_state: - # values_to_cluster[index][0] = avg - # values_to_cluster[index][1] = 0 - # index = index + 1 - # # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) - # # plt.show() - # cluster = AgglomerativeClustering(n_clusters=num_states) - # cluster.fit_predict(values_to_cluster) - # # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die - # # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. - # # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting - # # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. - # new_labels_list = [] - # new_labels = [] - # i = 0 - # for label in cluster.labels_: - # new_labels.append(label) - # i = i + 1 - # if i == num_states: - # new_labels_list.append(new_labels) - # new_labels = [] - # i = 0 - # # only the selected measurements are present in new_labels. - # # new_labels_index should not be incremented, if not selected_measurement is skipped - # new_labels_index = 0 - # # cluster_labels_list contains all measurements -> if measurement is skipped - # # still increment the index - # index = 0 - # for elem in avg_per_state_list: - # if elem is not None: - # for number, label in enumerate(cluster_labels_list[index][1]): - # cluster_labels_list[index][1][number] = \ - # new_labels_list[new_labels_index][label] - # new_labels_index = new_labels_index + 1 - # else: - # # override not selected measurement labels to avoid choosing the wrong ones. - # for number, label in enumerate(cluster_labels_list[index][1]): - # cluster_labels_list[index][1][number] = -1 - # index = index + 1 - # resulting_sequence = [None] * num_raw_states - # i = 0 - # confidence = 0 - # for x in resulting_sequence: - # j = 0 - # test_list = [] - # for arr in [elem[1] for elem in cluster_labels_list]: - # if num_cluster_list[j][1] != num_states: - # j = j + 1 - # else: - # if -1 in arr: - # print_error("Bei Janis beschweren! Fehler beim Umbenennen der" - # " Zustände wahrscheinlich.") - # sys.exit(-1) - # test_list.append(arr[i]) - # j = j + 1 - # bincount = np.bincount(test_list) - # resulting_sequence[i] = np.argmax(bincount) - # confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) - # i = i + 1 - # confidence = confidence / len(resulting_sequence) - # print_info("Confidence of resulting sequence is " + str(confidence) - # + " while using " + str(num_used_measurements) + "/" - # + str(len(raw_states_list)) + " measurements.") - # #print(resulting_sequence) - # resulting_sequence_list.append((num_config, resulting_sequence)) - # # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat - # # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die - # # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen - # # auftreten. - # # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. - # # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: - # # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche - # # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 
1Dim = Leistungsaufnahme, - # # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines - # # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... - # for num_config, sequence in resulting_sequence_list: - # print_info("NO. config:" + str(num_config)) - # print_info(sequence) - # - # - # - # - + fit_res_pow_dict[state_name] = combined_fit_power + fit_res_dur_dict[state_name] = combined_fit_duration + # only raw_states with the same number of function parameters can be similar + num_param_pow_dict = {} + num_param_dur_dict = {} + for state_name in by_name.keys(): + model_function = str(fit_res_pow_dict[state_name].model_function) + model_args = fit_res_pow_dict[state_name].model_args + num_param_pow_dict[state_name] = len(model_args) + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print_info("Power-Function for state " + state_name + ": " + + model_function) + for state_name in by_name.keys(): + model_function = str(fit_res_dur_dict[state_name].model_function) + model_args = fit_res_dur_dict[state_name].model_args + num_param_dur_dict[state_name] = len(model_args) + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print_info("Duration-Function for state " + state_name + ": " + + model_function) + similar_raw_state_buckets = {} + for state_name in by_name.keys(): + pow_model_function = str(fit_res_pow_dict[state_name].model_function) + dur_model_function = str(fit_res_dur_dict[state_name].model_function) + key_tuple = (pow_model_function, dur_model_function) + if key_tuple not in similar_raw_state_buckets: + similar_raw_state_buckets[key_tuple] = [] + similar_raw_state_buckets[key_tuple].append(state_name) + + # cluster for each Key-Tuple using the function parameters + distinct_states = [] + for key_tuple in similar_raw_state_buckets.keys(): + print_info("Key-Tuple " + str(key_tuple) + ": " + + str(similar_raw_state_buckets[key_tuple])) + similar_states = similar_raw_state_buckets[key_tuple] + if len(similar_states) > 1: + # functions are identical -> num_params is identical + num_params = num_param_dur_dict[similar_states[0]] + num_param_pow_dict[ + similar_states[0]] + values_to_cluster = np.zeros((len(similar_states), num_params)) + for num_state, state_name in enumerate(similar_states): + dur_params = fit_res_dur_dict[state_name].model_args + pow_params = fit_res_pow_dict[state_name].model_args + j = 0 + for param in pow_params: + values_to_cluster[num_state][j] = param + j = j + 1 + for param in dur_params: + values_to_cluster[num_state][j] = param + j = j + 1 + normed_vals_to_cluster = norm_values_to_cluster(values_to_cluster) + cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, + affinity='euclidean', + linkage='ward', + # TODO: Magic Number. 
Beim Evaluieren finetunen + distance_threshold=1) + cluster.fit_predict(values_to_cluster) + cluster_labels = cluster.labels_ + print_info("Cluster labels:\n" + str(cluster_labels)) + if cluster.n_clusters_ > 1: + # more than one distinct state found + distinct_state_dict = {} + for num_state, label in enumerate(cluster_labels): + if label not in distinct_state_dict.keys(): + distinct_state_dict[label] = [] + distinct_state_dict[label].append(similar_states[num_state]) + for distinct_state_key in distinct_state_dict.keys(): + distinct_states.append(distinct_state_dict[distinct_state_key]) + else: + distinct_states.append(similar_states) + else: + distinct_states.append(similar_states) + for num_state, distinct_state in enumerate(distinct_states): + print("State " + str(num_state) + ": " + str(distinct_state)) + num_raw_states = len(by_name.keys()) + resulting_sequence = [int] * num_raw_states + for i in range(num_raw_states): + state_name = "state_" + str(i) + state_num = get_state_num(state_name, distinct_states) + if state_num == -1: + print_error("Critical Error when creating the resulting sequence. raw_state state_" + + str(i) + " could not be mapped to a state.") + sys.exit(-1) + resulting_sequence[i] = state_num + print("Resulting sequence is: " + str(resulting_sequence)) + # if from_cache: + # print_warning( + # "YOU USED THE OPTION \"cache_dicts\". THIS IS FOR DEBUGGING PURPOSES ONLY!" + # "\nTHE SCRIPT WILL NOW STOP PREMATURELY," + # "SINCE DATA FOR FURTHER COMPUTATION IS MISSING!") + # sys.exit(0) + + new_by_name = {} + for num_state, distinct_state in enumerate(distinct_states): + state_name = "State_" + str(num_state) + consumptions_for_state = [] + durations_for_state = [] + param_list = [] + for raw_state in distinct_state: + original_state_dict = by_name[raw_state] + param_list.extend(original_state_dict["param"]) + consumptions_for_state.extend(original_state_dict["power"]) + durations_for_state.extend(original_state_dict["duration"]) + new_state_dict = { + "param": param_list, + "power": consumptions_for_state, + "duration": durations_for_state, + "attributes": ["power", "duration"] + } + new_by_name[state_name] = new_state_dict + new_by_param = by_name_to_by_param(new_by_name) + new_stats = parameters.ParamStats(new_by_name, new_by_param, param_names, dict()) + new_paramfit = ParallelParamFit(new_by_param) + for state_name in new_by_name.keys(): + for num_param, param_name in enumerate(param_names): + if new_stats.depends_on_param(state_name, "power", param_name): + new_paramfit.enqueue(state_name, "power", num_param, param_name) + if new_stats.depends_on_param(state_name, "duration", param_name): + new_paramfit.enqueue(state_name, "duration", num_param, param_name) + print_info("State " + state_name + "s power depends on param " + param_name + ":" + + str(new_stats.depends_on_param(state_name, "power", param_name)) + ) + print_info("State " + state_name + "s duration depends on param " + param_name + ":" + + str(new_stats.depends_on_param(state_name, "duration", param_name)) + ) + new_paramfit.fit() + new_fit_res_dur_dict = {} + new_fit_res_pow_dict = {} + for state_name in new_by_name.keys(): + fit_power = new_paramfit.get_result(state_name, "power") + fit_duration = new_paramfit.get_result(state_name, "duration") + combined_fit_power = analytic.function_powerset(fit_power, param_names, 0) + combined_fit_duration = analytic.function_powerset(fit_duration, param_names, 0) + combined_fit_power.fit(new_by_param, state_name, "power") + if not combined_fit_power.fit_success: 
+ print_warning("Fitting(power) for state " + state_name + " was not succesful!") + combined_fit_duration.fit(new_by_param, state_name, "duration") + if not combined_fit_duration.fit_success: + print_warning("Fitting(duration) for state " + state_name + " was not succesful!") + new_fit_res_pow_dict[state_name] = combined_fit_power + new_fit_res_dur_dict[state_name] = combined_fit_duration + for state_name in new_by_name.keys(): + model_function = str(new_fit_res_pow_dict[state_name].model_function) + model_args = new_fit_res_pow_dict[state_name].model_args + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print("Power-Function for state " + state_name + ": " + + model_function) + for state_name in new_by_name.keys(): + model_function = str(new_fit_res_dur_dict[state_name].model_function) + model_args = new_fit_res_dur_dict[state_name].model_args + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print("Duration-Function for state " + state_name + ": " + + model_function) + model = PTAModel(by_name, param_names, dict()) + + + # TODO: removed clustering (temporarily), since it provided too much dificultys + # at the current state + # i = 0 + # cluster_labels_list = [] + # num_cluster_list = [] + # for num_trace, raw_states in enumerate(raw_states_list): + # # iterate through raw states from measurements + # if len(raw_states) == num_raw_states: + # # build array with power values to cluster these + # value_to_cluster = np.zeros((num_raw_states, 2)) + # j = 0 + # for s in raw_states: + # value_to_cluster[j][0] = s[2] + # value_to_cluster[j][1] = 0 + # j = j + 1 + # # linked = linkage(value_to_cluster, 'single') + # # + # # labelList = range(1, 11) + # # + # # plt.figure(figsize=(10, 7)) + # # dendrogram(linked, + # # orientation='top', + # # distance_sort='descending', + # # show_leaf_counts=True) + # # plt.show() + # # TODO: Automatic detection of number of clusters. Aktuell noch MAGIC NUMBER + # # im distance_threshold + # cluster = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, + # affinity='euclidean', + # linkage='ward', + # distance_threshold=opt_refinement_thresh * 100) + # # cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', + # # linkage='ward') + # cluster.fit_predict(value_to_cluster) + # # print_info("Cluster labels:\n" + str(cluster.labels_)) + # # plt.scatter(value_to_cluster[:, 0], value_to_cluster[:, 1], c=cluster.labels_, cmap='rainbow') + # # plt.show() + # cluster_labels_list.append((num_trace, cluster.labels_)) + # num_cluster_list.append((num_trace, cluster.n_clusters_)) + # i = i + 1 + # else: + # print_info("Discarding measurement No. " + str(num_trace) + " because it " + # + "did not recognize the number of raw_states correctly.") + # num_used_measurements = len(raw_states_list) + # if i != len(raw_states_list): + # if i / len(raw_states_list) <= 0.5: + # print_warning("Only used " + str(i) + "/" + str(len(raw_states_list)) + # + " Measurements for refinement. " + # "Others did not recognize number of states correctly." + # "\nYou should verify the integrity of the measurements.") + # else: + # print_info("Used " + str(i) + "/" + str(len(raw_states_list)) + # + " Measurements for refinement. 
" + # "Others did not recognize number of states correctly.") + # num_used_measurements = i + # # TODO: DEBUG Kram + # sys.exit(0) + # else: + # print_info("Used all available measurements.") + # + # num_states = np.argmax(np.bincount([elem[1] for elem in num_cluster_list])) + # avg_per_state_list = [None] * len(cluster_labels_list) + # used_clusters = 0 + # for number, (num_trace, labels) in enumerate(cluster_labels_list): + # if num_cluster_list[number][1] == num_states: + # avg_per_state = [0] * num_states + # count_per_state = [0] * num_states + # raw_states = raw_states_list[num_trace] + # for num_label, label in enumerate(labels): + # count_per_state[label] = count_per_state[label] + 1 + # avg_per_state[label] = avg_per_state[label] + raw_states[num_label][2] + # for i, _ in enumerate(avg_per_state): + # avg_per_state[i] = avg_per_state[i] / count_per_state[i] + # avg_per_state_list[number] = avg_per_state + # used_clusters = used_clusters + 1 + # else: + # # hopefully this does not happen regularly + # print_info("Discarding measurement " + str(number) + # + " because the clustering yielded not matching results.") + # num_used_measurements = num_used_measurements - 1 + # if num_used_measurements == 0: + # print_error("Something went terribly wrong. Discarded all measurements.") + # # continue + # sys.exit(-1) + # # flattend version for clustering: + # values_to_cluster = np.zeros((num_states * used_clusters, 2)) + # index = 0 + # for avg_per_state in avg_per_state_list: + # if avg_per_state is not None: + # for avg in avg_per_state: + # values_to_cluster[index][0] = avg + # values_to_cluster[index][1] = 0 + # index = index + 1 + # # plt.scatter(values_to_cluster[:, 0], values_to_cluster[:, 1]) + # # plt.show() + # cluster = AgglomerativeClustering(n_clusters=num_states) + # cluster.fit_predict(values_to_cluster) + # # Aktuell hast du hier ein plattes Array mit labels. Jetzt also das wieder auf die + # # ursprünglichen Labels abbilden, die dann verändern mit den hier gefundenen Labels. + # # Alle identischen Zustände haben identische Labels. Dann vllt bei resulting + # # sequence ausgeben, wie groß die übereinstimmung bei der Stateabfolge ist. + # new_labels_list = [] + # new_labels = [] + # i = 0 + # for label in cluster.labels_: + # new_labels.append(label) + # i = i + 1 + # if i == num_states: + # new_labels_list.append(new_labels) + # new_labels = [] + # i = 0 + # # only the selected measurements are present in new_labels. + # # new_labels_index should not be incremented, if not selected_measurement is skipped + # new_labels_index = 0 + # # cluster_labels_list contains all measurements -> if measurement is skipped + # # still increment the index + # index = 0 + # for elem in avg_per_state_list: + # if elem is not None: + # for number, label in enumerate(cluster_labels_list[index][1]): + # cluster_labels_list[index][1][number] = \ + # new_labels_list[new_labels_index][label] + # new_labels_index = new_labels_index + 1 + # else: + # # override not selected measurement labels to avoid choosing the wrong ones. + # for number, label in enumerate(cluster_labels_list[index][1]): + # cluster_labels_list[index][1][number] = -1 + # index = index + 1 + # resulting_sequence = [None] * num_raw_states + # i = 0 + # confidence = 0 + # for x in resulting_sequence: + # j = 0 + # test_list = [] + # for arr in [elem[1] for elem in cluster_labels_list]: + # if num_cluster_list[j][1] != num_states: + # j = j + 1 + # else: + # if -1 in arr: + # print_error("Bei Janis beschweren! 
Fehler beim Umbenennen der" + # " Zustände wahrscheinlich.") + # sys.exit(-1) + # test_list.append(arr[i]) + # j = j + 1 + # bincount = np.bincount(test_list) + # resulting_sequence[i] = np.argmax(bincount) + # confidence = confidence + bincount[resulting_sequence[i]] / np.sum(bincount) + # i = i + 1 + # confidence = confidence / len(resulting_sequence) + # print_info("Confidence of resulting sequence is " + str(confidence) + # + " while using " + str(num_used_measurements) + "/" + # + str(len(raw_states_list)) + " measurements.") + # #print(resulting_sequence) + # resulting_sequence_list.append((num_config, resulting_sequence)) + # # TODO: Was jetzt? Hier habe ich jetzt pro Konfiguration eine Zustandsfolge. Daraus Automat + # # erzeugen. Aber wie? Oder erst parametrisieren? Eigentlich brauche ich vorher die + # # Loops. Wie erkenne ich die? Es können beliebig viele Loops an beliebigen Stellen + # # auftreten. + # # TODO: Die Zustandsfolgen werden sich nicht einfach in isomorphe(-einzelne wegfallende bzw. + # # hinzukommende Zustände) Automaten übersetzten lassen. Basiert alles auf dem Problem: + # # wie erkenne ich, dass zwei Zustände die selben sind und nicht nur einfach eine ähnliche + # # Leistungsaufnahme haben?! Vllt Zustände 2D clustern? 1Dim = Leistungsaufnahme, + # # 2Dim=Dauer? Zumindest innerhalb einer Paramkonfiguration sollte sich die Dauer eines + # # Zustands ja nicht mehr ändern. Kann sicherlich immernoch Falschclustering erzeugen... + # for num_config, sequence in resulting_sequence_list: + # print_info("NO. config:" + str(num_config)) + # print_info(sequence) + # + # + # + # elif ".tar" in opt_filename: # open with dfatool -- cgit v1.2.3 From 2a1aee9b92085e50050ea22b547db450da820eab Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Mon, 10 Aug 2020 16:40:46 +0200 Subject: Proof_Of_Concept_PELT: Kleine Bugfixes, für den Fall dass nicht alle Messungen verwendet werden können. Verbesserung der Normierung des Signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 69 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 11 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 40c405d..4819f64 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -221,7 +221,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, else: # range_min == range_max. has the same effect as pen_override knee = (range_min, None) - print_info(str(knee[0]) + " has been selected as kneepoint.") + print_info(str(knee[0]) + " has been selected as penalty.") if knee[0] is not None: return knee @@ -375,11 +375,15 @@ def print_error(str_to_prt): print("[ERROR]" + str_prt, file=sys.stderr) -def norm_signal(signal): +def norm_signal(signal, scaler=50): # TODO: maybe refine normalisation of signal + max_val = max(signal) normed_signal = np.zeros(shape=len(signal)) for i, signal_i in enumerate(signal): - normed_signal[i] = signal_i / 1000 + normed_signal[i] = signal_i / max_val + normed_signal[i] = normed_signal[i] * scaler + # plt.plot(normed_signal) + # plt.show() return normed_signal @@ -559,6 +563,7 @@ if __name__ == '__main__': by_name_file = None param_names_file = None from_cache = False + not_accurate = False if opt_cache_loc is not None: flag = False by_name_loc = os.path.join(opt_cache_loc, "by_name.txt") @@ -701,6 +706,10 @@ if __name__ == '__main__': print_info("Discarding measurement No. 
" + str(num_measurement) + " because it did not recognize the number of " "raw_states correctly.") + # l_signal = measurements_by_config['offline'][num_measurement]['uW'] + # l_bkpts = [s[1] for s in raw_states] + # fig, ax = rpt.display(np.array(l_signal), l_bkpts) + # plt.show() # for i, x in enumerate(states_duration_list): # states_duration_list[i] = x / num_used_measurements # for i, x in enumerate(states_consumption_list): @@ -718,7 +727,7 @@ if __name__ == '__main__': + " Others did not recognize number of states correctly.") num_used_measurements = i # TODO: DEBUG Kram - sys.exit(0) + #sys.exit(0) else: print_info("Used all available measurements.") @@ -730,7 +739,25 @@ if __name__ == '__main__': # break # combine all state durations and consumptions to parametrized model - + if len(state_durations_by_config) == 0: + print("No refinement necessary for this state. The macromodel is usable.") + sys.exit() + if len(state_durations_by_config) / len(configurations) > 1 / 2 \ + and len(state_durations_by_config) != len(configurations): + print_warning( + "Some measurements(>50%) need to be refined, however that is not true for" + " all measurements. This hints a correlation between the structure of" + " the underlying automaton and parameters. Only the ones which need to" + " be refined will be refined. THE RESULT WILL NOT ACCURATELY DEPICT " + " THE REAL WORLD.") + not_accurate = True + if len(state_durations_by_config) / len(configurations) < 1 / 2: + print_warning( + "Some measurements(<50%) need to be refined, however that is not true for" + " all measurements. This hints a correlation between the structure of" + " the underlying automaton and parameters. Or a poor quality of measurements." + " No Refinement will be done.") + sys.exit(-1) # this is only necessary because at this state only linear automatons can be modeled. num_states_array = [int()] * len(state_consumptions_by_config) for i, (_, states_consumption_list) in enumerate(state_consumptions_by_config): @@ -748,7 +775,9 @@ if __name__ == '__main__': "Config No." + str(num_config) + " not usable yet due to different " + "number of states. This hints a correlation between parameters and " + "the structure of the resulting automaton. This will be possibly" - + " supported in a future version of this tool.") + + " supported in a future version of this tool. HOWEVER AT THE MOMENT" + " THIS WILL LEAD TO INACCURATE RESULTS!") + not_accurate = True usable_configs = usable_configs - 1 else: param_list.extend(configurations[num_config]['offline_aggregates']['param']) @@ -759,18 +788,28 @@ if __name__ == '__main__': else: print_info("Using only " + str(usable_configs) + " Configs.") by_name = {} + usable_configs_2 = len(state_consumptions_by_config) for i in range(num_raw_states): consumptions_for_state = [] durations_for_state = [] for j, (_, states_consumption_list) in enumerate(state_consumptions_by_config): - consumptions_for_state.extend(states_consumption_list[i]) - durations_for_state.extend(state_durations_by_config[j][1][i]) + if len(states_consumption_list) == num_raw_states: + consumptions_for_state.extend(states_consumption_list[i]) + durations_for_state.extend(state_durations_by_config[j][1][i]) + else: + not_accurate = True + usable_configs_2 = usable_configs_2 - 1 + if usable_configs_2 != usable_configs: + print_error("an zwei unterschiedlichen Stellen wurden unterschiedlich viele " + "Messungen rausgeworfen. 
Bei Janis beschweren.") state_name = "state_" + str(i) state_dict = { "param": param_list, "power": consumptions_for_state, "duration": durations_for_state, - "attributes": ["power", "duration"] + "attributes": ["power", "duration"], + # Da kein richtiger Automat generiert wird, gibt es auch keine Transitionen + "isa": "state" } by_name[state_name] = state_dict by_param = by_name_to_by_param(by_name) @@ -943,7 +982,9 @@ if __name__ == '__main__': "param": param_list, "power": consumptions_for_state, "duration": durations_for_state, - "attributes": ["power", "duration"] + "attributes": ["power", "duration"], + # Da kein richtiger Automat generiert wird, gibt es auch keine Transitionen + "isa": "state" } new_by_name[state_name] = new_state_dict new_by_param = by_name_to_by_param(new_by_name) @@ -993,7 +1034,13 @@ if __name__ == '__main__': model_function = model_function.replace(replace_string, str(arg)) print("Duration-Function for state " + state_name + ": " + model_function) - model = PTAModel(by_name, param_names, dict()) + model = PTAModel(new_by_name, param_names, dict()) + model_json = model.to_json() + print(model_json) + if not_accurate: + print_warning( + "THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" + " WHY.") # TODO: removed clustering (temporarily), since it provided too much dificultys -- cgit v1.2.3 From 42f0d36796f6535e484426a1ffa221bca4ea593a Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 14 Aug 2020 14:50:39 +0200 Subject: bin/ProffOfConcept: Kleine Fehlerkorrekturen. --- bin/Proof_Of_Concept_PELT.py | 52 ++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 4819f64..ac32d88 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -4,6 +4,7 @@ import time import sys import getopt import re +import pprint from multiprocessing import Pool, Manager, cpu_count from kneed import KneeLocator from sklearn.cluster import AgglomerativeClustering @@ -21,7 +22,8 @@ from dfatool.utils import by_name_to_by_param # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display # py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 -# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 --cache_dicts --cache_loc="..\data\TX_cache" +# py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 --cache_dicts --cache_loc="..\data\TX2_cache" +from dfatool.validation import CrossValidator def plot_data_from_json(filename, trace_num, x_axis, y_axis): @@ -98,7 +100,7 @@ def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, - pen_modifier=None): + pen_modifier=None, show_plots=False): # default params in Function if model is None: model = "l1" @@ -138,7 +140,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, print_info("starting kneepoint calculation.") # init Pool with num_proesses - with Pool(num_processes) as p: + with Pool(min(num_processes, len(args))) as p: # collect results from pool result = p.starmap_async(get_bkps, args) # monitor loop @@ -199,18 +201,24 @@ def calculate_penalty_value(signal, model="l1", jump=5, 
min_dist=2, range_min=0, if i == len(fitted_bkps_val[knee[0]:]) - 1: # end sequence with last value end_index = i + # # since it is not guaranteed that this is the end of the plateau, assume the mid + # # of the plateau was hit. + # size = end_index - start_index + # end_index = end_index + size + # However this is not the clean solution. Better if search interval is widened if end_index - start_index > longest_end - longest_start: # last found sequence is the longest found yet longest_start = start_index longest_end = end_index start_index = i prev_val = num_bkpts - # plt.xlabel('Penalty') - # plt.ylabel('Number of Changepoints') - # plt.plot(pen_val, fitted_bkps_val) - # plt.vlines(longest_start + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') - # plt.vlines(longest_end + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') - # plt.show() + if show_plots: + plt.xlabel('Penalty') + plt.ylabel('Number of Changepoints') + plt.plot(pen_val, fitted_bkps_val) + plt.vlines(longest_start + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') + plt.vlines(longest_end + knee[0], 0, max(fitted_bkps_val), linestyles='dashed') + plt.show() # choosing pen from plateau mid_of_plat = longest_start + (longest_end - longest_start) // 2 knee = (mid_of_plat + knee[0], fitted_bkps_val[mid_of_plat + knee[0]]) @@ -331,7 +339,7 @@ def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): def calc_raw_states(arg_list, num_processes=8): m = Manager() - with Pool(processes=num_processes) as p: + with Pool(processes=min(num_processes, len(arg_list))) as p: # collect results from pool result = p.starmap(calc_raw_states_func, arg_list) return result @@ -375,7 +383,7 @@ def print_error(str_to_prt): print("[ERROR]" + str_prt, file=sys.stderr) -def norm_signal(signal, scaler=50): +def norm_signal(signal, scaler=25): # TODO: maybe refine normalisation of signal max_val = max(signal) normed_signal = np.zeros(shape=len(signal)) @@ -656,6 +664,10 @@ if __name__ == '__main__': + "measurement No. " + str(num_measurement) + " is " + str( avg_std)) print_info("That is a reduction of " + str(change_avg_std)) + # l_signal = measurements_by_config['offline'][num_measurement]['uW'] + # l_bkpts = [s[1] for s in raw_states] + # fig, ax = rpt.display(np.array(l_signal), l_bkpts) + # plt.show() print_info("Finished raw_states calculation.") num_states_array = [int()] * len(raw_states_list) i = 0 @@ -787,6 +799,10 @@ if __name__ == '__main__': print_info("All configs usable.") else: print_info("Using only " + str(usable_configs) + " Configs.") + if num_raw_states == 1: + print_info("Upon further inspection it is clear that no refinement is necessary." + " The macromodel is usable.") + sys.exit(-1) by_name = {} usable_configs_2 = len(state_consumptions_by_config) for i in range(num_raw_states): @@ -1034,9 +1050,17 @@ if __name__ == '__main__': model_function = model_function.replace(replace_string, str(arg)) print("Duration-Function for state " + state_name + ": " + model_function) - model = PTAModel(new_by_name, param_names, dict()) - model_json = model.to_json() - print(model_json) + # model = PTAModel(new_by_name, param_names, dict()) + # model_json = model.to_json() + # param_model, _ = model.get_fitted() + # param_quality = model.assess(param_model) + # pprint.pprint(param_quality) + # # model = PTAModel(by_name, ...) + # # validator = CrossValidator(PTAModel, by_name, ...) 
+ # # param_quality = validator.kfold(lambda m: m.get_fitted()[0], 10) + # validator = CrossValidator(PTAModel, new_by_name, param_names, dict()) + # param_quality = validator.kfold(lambda m: m.get_fitted()[0], 10) + # pprint.pprint(param_quality) if not_accurate: print_warning( "THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" -- cgit v1.2.3 From 98de5d25ce583b285965e6fd8c79ab74d3bb6db3 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 14 Aug 2020 15:19:13 +0200 Subject: bin/ProofOfConceptPELT: added resultexport to filesystem. --- bin/Proof_Of_Concept_PELT.py | 64 ++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index ac32d88..cba7009 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -547,7 +547,7 @@ if __name__ == '__main__': except getopt.GetoptError as err: print(err, file=sys.stderr) sys.exit(-1) - + filepath = os.path.dirname(opt_filename) # OPENING DATA if ".json" in opt_filename: # open file with trace data from json @@ -1034,37 +1034,37 @@ if __name__ == '__main__': print_warning("Fitting(duration) for state " + state_name + " was not succesful!") new_fit_res_pow_dict[state_name] = combined_fit_power new_fit_res_dur_dict[state_name] = combined_fit_duration - for state_name in new_by_name.keys(): - model_function = str(new_fit_res_pow_dict[state_name].model_function) - model_args = new_fit_res_pow_dict[state_name].model_args - for num_arg, arg in enumerate(model_args): - replace_string = "regression_arg(" + str(num_arg) + ")" - model_function = model_function.replace(replace_string, str(arg)) - print("Power-Function for state " + state_name + ": " - + model_function) - for state_name in new_by_name.keys(): - model_function = str(new_fit_res_dur_dict[state_name].model_function) - model_args = new_fit_res_dur_dict[state_name].model_args - for num_arg, arg in enumerate(model_args): - replace_string = "regression_arg(" + str(num_arg) + ")" - model_function = model_function.replace(replace_string, str(arg)) - print("Duration-Function for state " + state_name + ": " - + model_function) - # model = PTAModel(new_by_name, param_names, dict()) - # model_json = model.to_json() - # param_model, _ = model.get_fitted() - # param_quality = model.assess(param_model) - # pprint.pprint(param_quality) - # # model = PTAModel(by_name, ...) - # # validator = CrossValidator(PTAModel, by_name, ...) - # # param_quality = validator.kfold(lambda m: m.get_fitted()[0], 10) - # validator = CrossValidator(PTAModel, new_by_name, param_names, dict()) - # param_quality = validator.kfold(lambda m: m.get_fitted()[0], 10) - # pprint.pprint(param_quality) - if not_accurate: - print_warning( - "THIS RESULT IS NOT ACCURATE. 
SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" - " WHY.") + result_loc = os.path.join(filepath, "result.txt") + with open(result_loc, "w") as f: + f.write("Resulting Sequence: " + str(resulting_sequence)) + f.write("\n\n") + for state_name in new_by_name.keys(): + model_function = str(new_fit_res_pow_dict[state_name].model_function) + model_args = new_fit_res_pow_dict[state_name].model_args + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print("Power-Function for state " + state_name + ": " + + model_function) + f.write("Power-Function for state " + state_name + ": " + + model_function + "\n") + f.write("\n\n") + for state_name in new_by_name.keys(): + model_function = str(new_fit_res_dur_dict[state_name].model_function) + model_args = new_fit_res_dur_dict[state_name].model_args + for num_arg, arg in enumerate(model_args): + replace_string = "regression_arg(" + str(num_arg) + ")" + model_function = model_function.replace(replace_string, str(arg)) + print("Duration-Function for state " + state_name + ": " + + model_function) + f.write("Duration-Function for state " + state_name + ": " + + model_function + "\n") + if not_accurate: + print_warning( + "THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" + " WHY.") + f.write("THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" + " WHY.") # TODO: removed clustering (temporarily), since it provided too much dificultys -- cgit v1.2.3 From 91a42d937a0a5e50d5ac2e6369d26b23146f15e2 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Fri, 14 Aug 2020 15:52:58 +0200 Subject: bin/ProofOfConceptPELT: better nameing for result file --- bin/Proof_Of_Concept_PELT.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index cba7009..605ed7e 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -600,7 +600,7 @@ if __name__ == '__main__': "\nThe script will not run to the end properly." 
"\nNo final parametrization will be done.") from_cache = True - + big_state_name = configurations[0]['name'] if None in (by_param_file, by_name_file, param_names_file): state_durations_by_config = [] state_consumptions_by_config = [] @@ -1034,7 +1034,7 @@ if __name__ == '__main__': print_warning("Fitting(duration) for state " + state_name + " was not succesful!") new_fit_res_pow_dict[state_name] = combined_fit_power new_fit_res_dur_dict[state_name] = combined_fit_duration - result_loc = os.path.join(filepath, "result.txt") + result_loc = os.path.join(filepath, "result" + big_state_name + ".txt") with open(result_loc, "w") as f: f.write("Resulting Sequence: " + str(resulting_sequence)) f.write("\n\n") -- cgit v1.2.3 From e1c6e734b2d2725a0e29af6795c18a2575fe3d5d Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Tue, 22 Sep 2020 23:10:26 +0200 Subject: bin/plot_generator --- bin/plot_generator.py | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 bin/plot_generator.py diff --git a/bin/plot_generator.py b/bin/plot_generator.py new file mode 100644 index 0000000..458271d --- /dev/null +++ b/bin/plot_generator.py @@ -0,0 +1,123 @@ +import getopt +import sys +import re +import os +import numpy as np +import pprint +import json +import matplotlib.pyplot as plt + +if __name__ == '__main__': + # OPTION RECOGNITION + opt = dict() + + optspec = ( + "bench_filename= " + "result_filename= " + ) + opt_bench_filename = None + opt_result_filename = None + try: + raw_opts, args = getopt.getopt(sys.argv[1:], "", optspec.split(" ")) + + for option, parameter in raw_opts: + optname = re.sub(r"^--", "", option) + opt[optname] = parameter + except getopt.GetoptError as err: + print(err, file=sys.stderr) + sys.exit(-1) + + if "bench_filename" in opt: + opt_bench_filename = opt['bench_filename'] + else: + + sys.exit(-1) + if "result_filename" in opt: + opt_result_filename = opt['result_filename'] + else: + print("wth") + sys.exit(-1) + + with open(opt_bench_filename, 'r') as f: + configurations = json.load(f) + with open(opt_result_filename, 'r') as f: + sequence_line = f.readline() + begin_sequence = sequence_line.rfind("Resulting Sequence: ") + 20 + + if begin_sequence < 20: + print("nicht gefunden!") + sys.exit(-1) + sequence_substr = sequence_line[begin_sequence:] + resulting_sequence = eval(sequence_substr) + new_line = f.readline() + while new_line == "\n": + new_line = f.readline() + function_line = new_line + pow_function_dict = dict() + while function_line != "\n": + state_name_pos = function_line.find("Power-Function for state ") + 25 + state_name_end = function_line.find(":") + state_name = function_line[state_name_pos:state_name_end] + function_string = function_line[state_name_end+1:-1] + pow_function_dict[state_name] = function_string + function_line = f.readline() + new_line = "\n" + while new_line == "\n": + new_line = f.readline() + function_line = new_line + dur_function_dict = dict() + while function_line != "\n" and function_line != "" and "THIS RESULT IS NOT ACCURATE." 
not in function_line: + state_name_pos = function_line.find("Duration-Function for state ") + 28 + state_name_end = function_line.find(":") + state_name = function_line[state_name_pos:state_name_end] + function_string = function_line[state_name_end+1:-1] + dur_function_dict[state_name] = function_string + function_line = f.readline() + + param_names = configurations[0]['offline_aggregates']['paramkeys'][0] + + for num_fig in range(0, min(4, len(configurations))): + rand_config_no = np.random.randint(0, len(configurations), 1)[0] + rand_conf = configurations[rand_config_no] + rand_signal = np.array(rand_conf['offline'][0]['uW']) + rand_param = rand_conf['offline_aggregates']['param'][0] + rand_max_pow = max(rand_signal) + # pprint.pprint(rand_param) + pretty_rand_param = pprint.pformat(rand_param) + print(str(param_names) + "(" + str(rand_config_no) + ")" + "\n" + pretty_rand_param) + time = 0 + next_time = 0 + rand_stepper = 0 + pow = 0 + resulting_coords = list() + while rand_stepper < len(resulting_sequence): + curr_state = resulting_sequence[rand_stepper] + curr_state_name = "State_" + str(curr_state) + curr_pow_func = pow_function_dict[curr_state_name] + curr_dur_func = dur_function_dict[curr_state_name] + for num_param, name in enumerate(param_names): + replace_string = "parameter(" + name + ")" + curr_pow_func = curr_pow_func.replace(replace_string, str(rand_param[num_param])) + curr_dur_func = curr_dur_func.replace(replace_string, str(rand_param[num_param])) + pow = eval(curr_pow_func) + dur = eval(curr_dur_func) + next_time = time + dur + start_coord = (time, pow) + end_coord = (next_time, pow) + resulting_coords.append(start_coord) + resulting_coords.append(end_coord) + rand_stepper = rand_stepper + 1 + time = next_time + + with open("res_conf_" + str(num_fig) + "_signal.txt", 'w') as f: + f.write("x,y\n") + for x, y in enumerate(rand_signal): + f.write(str(x) + "," + str(y) + "\n") + with open("res_conf_" + str(num_fig) + "_fit.txt", 'w') as f: + f.write("x,y\n") + for x, y in resulting_coords: + f.write(str(x) + "," + str(y) + "\n") + plt.plot(rand_signal) + plt.plot([x for x, y in resulting_coords], [y for x, y in resulting_coords]) + plt.savefig("res_conf_" + str(num_fig) + "_pic.pdf", format='pdf', dpi=300) + plt.clf() -- cgit v1.2.3 From 522d8280cf95f43ca6d5904ae5d79a9a9c502af3 Mon Sep 17 00:00:00 2001 From: jfalkenhagen Date: Tue, 22 Sep 2020 23:45:43 +0200 Subject: bin/Proof_Of_Conecpt_PELT.py: Schöner kommentiert. Eigentlich für die Abgabe bereit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/Proof_Of_Concept_PELT.py | 213 +++++++++++++++++++------------------------ 1 file changed, 93 insertions(+), 120 deletions(-) diff --git a/bin/Proof_Of_Concept_PELT.py b/bin/Proof_Of_Concept_PELT.py index 605ed7e..688c5a7 100644 --- a/bin/Proof_Of_Concept_PELT.py +++ b/bin/Proof_Of_Concept_PELT.py @@ -18,7 +18,6 @@ from dfatool import parameters from dfatool.model import ParallelParamFit, PTAModel from dfatool.utils import by_name_to_by_param - # from scipy.cluster.hierarchy import dendrogram, linkage # for graphical display # py bin\Proof_Of_Concept_PELT.py --filename="..\data\TX.json" --jump=1 --pen_override=28 --refinement_thresh=100 @@ -26,6 +25,7 @@ from dfatool.utils import by_name_to_by_param from dfatool.validation import CrossValidator +# helper functions. 
Not used def plot_data_from_json(filename, trace_num, x_axis, y_axis): with open(filename, 'r') as file: tx_data = json.load(file) @@ -60,18 +60,26 @@ def plot_data_vs_data_vs_means(signal1, signal2, x_axis, y_axis): plt.show() +# returns the found changepoints by algo for the specific penalty pen. +# algo should be the return value of Pelt(...).fit(signal) +# Also puts a token in container q to let the progressmeter know the changepoints for penalty pen +# have been calculated. +# used for parallel calculation of changepoints vs penalty def get_bkps(algo, pen, q): res = pen, len(algo.predict(pen=pen)) q.put(pen) return res +# Wrapper for kneedle def find_knee_point(data_x, data_y, S=1.0, curve='convex', direction='decreasing'): kneedle = KneeLocator(data_x, data_y, S=S, curve=curve, direction=direction) kneepoint = (kneedle.knee, kneedle.knee_y) return kneepoint +# returns the changepoints found on signal with penalty penalty. +# model, jump and min_dist are directly passed to PELT def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): # default params in Function if model is None: @@ -98,6 +106,11 @@ def calc_pelt(signal, penalty, model="l1", jump=5, min_dist=2, plotting=False): sys.exit(-1) +# calculates and returns the necessary penalty for signal. Parallel execution with num_processes many processes +# jump, min_dist are passed directly to PELT. S is directly passed to kneedle. +# pen_modifier is used as a factor on the resulting penalty. +# the interval [range_min, range_max] is used for searching. +# refresh_delay and refresh_thresh are used to configure the progress "bar". def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, range_max=50, num_processes=8, refresh_delay=1, refresh_thresh=5, S=1.0, pen_modifier=None, show_plots=False): @@ -136,6 +149,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, q = m.Queue() for i in range(range_min, range_max + 1): + # same calculation for all except other penalty args.append((algo, i, q)) print_info("starting kneepoint calculation.") @@ -184,7 +198,9 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, # peaks, peak_plateaus = find_peaks(- np.array(fitted_bkps_val), plateau_size=1) # Since the data is monotonously decreasing only one plateau can be found. - # assuming the plateau is constant + # assuming the plateau is constant, i.e. no noise. OK to assume this here, since num_bkpts + # is monotonously decreasing. If the number of bkpts decreases inside a considered + # plateau, it means that the stable configuration is not yet met. -> Search further start_index = -1 end_index = -1 longest_start = -1 @@ -206,6 +222,7 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, # size = end_index - start_index # end_index = end_index + size # However this is not the clean solution. Better if search interval is widened + # with range_min and range_max if end_index - start_index > longest_end - longest_start: # last found sequence is the longest found yet longest_start = start_index @@ -238,78 +255,20 @@ def calculate_penalty_value(signal, model="l1", jump=5, min_dist=2, range_min=0, sys.exit(-1) -# very short benchmark yielded approx. 
1/3 of speed compared to solution with sorting -# def needs_refinement_no_sort(signal, mean, thresh): -# # linear search for the top 10%/ bottom 10% -# # should be sufficient -# length_of_signal = len(signal) -# percentile_size = int() -# percentile_size = length_of_signal // 100 -# upper_percentile = [None] * percentile_size -# lower_percentile = [None] * percentile_size -# fill_index_upper = percentile_size - 1 -# fill_index_lower = percentile_size - 1 -# index_smallest_val = fill_index_upper -# index_largest_val = fill_index_lower -# -# for x in signal: -# if x > mean: -# # will be in upper percentile -# if fill_index_upper >= 0: -# upper_percentile[fill_index_upper] = x -# if x < upper_percentile[index_smallest_val]: -# index_smallest_val = fill_index_upper -# fill_index_upper = fill_index_upper - 1 -# continue -# -# if x > upper_percentile[index_smallest_val]: -# # replace smallest val. Find next smallest val -# upper_percentile[index_smallest_val] = x -# index_smallest_val = 0 -# i = 0 -# for y in upper_percentile: -# if upper_percentile[i] < upper_percentile[index_smallest_val]: -# index_smallest_val = i -# i = i + 1 -# -# else: -# if fill_index_lower >= 0: -# lower_percentile[fill_index_lower] = x -# if x > lower_percentile[index_largest_val]: -# index_largest_val = fill_index_upper -# fill_index_lower = fill_index_lower - 1 -# continue -# if x < lower_percentile[index_largest_val]: -# # replace smallest val. Find next smallest val -# lower_percentile[index_largest_val] = x -# index_largest_val = 0 -# i = 0 -# for y in lower_percentile: -# if lower_percentile[i] > lower_percentile[index_largest_val]: -# index_largest_val = i -# i = i + 1 -# -# # should have the percentiles -# lower_percentile_mean = np.mean(lower_percentile) -# upper_percentile_mean = np.mean(upper_percentile) -# dist = mean - lower_percentile_mean -# if dist > thresh: -# return True -# dist = upper_percentile_mean - mean -# if dist > thresh: -# return True -# return False - - -# raw_states_calc_args.append((num_measurement, measurement, penalty, opt_model -# , opt_jump)) +# calculates the raw_states for measurement measurement. 
num_measurement is used to identify the +# return value +# penalty, model and jump are directly passed to pelt def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): + # extract signal signal = np.array(measurement['uW']) + # norm signal to remove dependency on absolute values normed_signal = norm_signal(signal) + # calculate the breakpoints bkpts = calc_pelt(normed_signal, penalty, model=model, jump=jump) calced_states = list() start_time = 0 end_time = 0 + # calc metrics for all states for bkpt in bkpts: # start_time of state is end_time of previous one # (Transitions are instantaneous) @@ -322,6 +281,7 @@ def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): calced_states.append(calced_state) num = 0 new_avg_std = 0 + # calc avg std for all states from this measurement for s in calced_states: # print_info("State " + str(num) + " starts at t=" + str(s[0]) # + " and ends at t=" + str(s[1]) @@ -329,7 +289,11 @@ def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): # + "uW with sigma=" + str(s[3])) num = num + 1 new_avg_std = new_avg_std + s[3] - new_avg_std = new_avg_std / len(calced_states) + # check case if no state has been found to avoid crashing + if len(calced_states) != 0: + new_avg_std = new_avg_std / len(calced_states) + else: + new_avg_std = 0 change_avg_std = measurement['uW_std'] - new_avg_std # print_info("The average standard deviation for the newly found states is " # + str(new_avg_std)) @@ -337,6 +301,7 @@ def calc_raw_states_func(num_measurement, measurement, penalty, model, jump): return num_measurement, calced_states, new_avg_std, change_avg_std +# parallelize calc over all measurements def calc_raw_states(arg_list, num_processes=8): m = Manager() with Pool(processes=min(num_processes, len(arg_list))) as p: @@ -346,6 +311,7 @@ def calc_raw_states(arg_list, num_processes=8): # Very short benchmark yielded approx. 
3 times the speed of solution not using sort +# checks the percentiles if refinement is necessary def needs_refinement(signal, thresh): sorted_signal = sorted(signal) length_of_signal = len(signal) @@ -364,7 +330,8 @@ def needs_refinement(signal, thresh): return True return False - +# helper functions for user output +# TODO: maybe switch with python logging feature def print_info(str_to_prt): str_lst = str_to_prt.split(sep='\n') for str_prt in str_lst: @@ -383,18 +350,17 @@ def print_error(str_to_prt): print("[ERROR]" + str_prt, file=sys.stderr) +# norms the signal and apply scaler to all values as a factor def norm_signal(signal, scaler=25): - # TODO: maybe refine normalisation of signal max_val = max(signal) normed_signal = np.zeros(shape=len(signal)) for i, signal_i in enumerate(signal): normed_signal[i] = signal_i / max_val normed_signal[i] = normed_signal[i] * scaler - # plt.plot(normed_signal) - # plt.show() return normed_signal +# norms the values to prepare them for clustering def norm_values_to_cluster(values_to_cluster): new_vals = np.array(values_to_cluster) num_samples = len(values_to_cluster) @@ -409,6 +375,7 @@ def norm_values_to_cluster(values_to_cluster): return new_vals +# finds state_num using state name def get_state_num(state_name, distinct_states): for state_num, states in enumerate(distinct_states): if state_name in states: @@ -564,9 +531,9 @@ if __name__ == '__main__': # plt.show() # sys.exit() - # loop through all traces check if refinement is necessary # resulting_sequence_list = [] # search for param_names, by_param and by_name files + # cachingopts by_param_file = None by_name_file = None param_names_file = None @@ -597,34 +564,44 @@ if __name__ == '__main__': else: print_warning("THE OPTION \"cache_dicts\" IS FOR DEBUGGING PURPOSES ONLY! " "\nDO NOT USE FOR REGULAR APPLICATIONS!" - "\nThe script will not run to the end properly." - "\nNo final parametrization will be done.") + "\nThis will possibly not be maintained in further development.") from_cache = True big_state_name = configurations[0]['name'] if None in (by_param_file, by_name_file, param_names_file): state_durations_by_config = [] state_consumptions_by_config = [] + # loop through all traces check if refinement is necessary and if necessary refine it. 
for num_config, measurements_by_config in enumerate(configurations): # loop through all occurrences of the looked at state print_info("Looking at state '" + measurements_by_config['name'] + "' with params: " + str(measurements_by_config['parameter']) + "(" + str( num_config + 1) + "/" + str(len(configurations)) + ")") - refine = False + num_needs_refine = 0 print_info("Checking if refinement is necessary...") for measurement in measurements_by_config['offline']: # loop through measurements of particular state # an check if state needs refinement signal = measurement['uW'] # mean = measurement['uW_mean'] - if needs_refinement(signal, opt_refinement_thresh) and not refine: - print_info("Refinement is necessary!") - refine = True - if not refine: + if needs_refinement(signal, opt_refinement_thresh): + num_needs_refine = num_needs_refine + 1 + if num_needs_refine == 0: + print_info( + "No refinement necessary for state '" + measurements_by_config['name'] + + "' with params: " + str(measurements_by_config['parameter'])) + elif num_needs_refine < len(measurements_by_config['offline']) / 2: print_info( "No refinement necessary for state '" + measurements_by_config['name'] + "' with params: " + str(measurements_by_config['parameter'])) + print_warning( + "However this decision was not unanimously. This could hint a poor" + "measurement quality.") else: + if num_needs_refine != len(measurements_by_config['parameter']): + print_warning( + "However this decision was not unanimously. This could hint a poor" + "measurement quality.") # assume that all measurements of the same param configuration are fundamentally # similar -> calculate penalty for first measurement, use it for all if opt_pen_override is None: @@ -651,18 +628,18 @@ if __name__ == '__main__': raw_states_res = calc_raw_states(raw_states_calc_args, opt_num_processes) # extracting result and putting it in correct order -> index of raw_states_list # entry still corresponds with index of measurement in measurements_by_states - # -> If measurements are discarded the correct ones are easily recognized + # -> If measurements are discarded the used ones are easily recognized for ret_val in raw_states_res: num_measurement = ret_val[0] raw_states = ret_val[1] avg_std = ret_val[2] change_avg_std = ret_val[3] - # TODO: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch + # FIXME: Wieso gibt mir meine IDE hier eine Warning aus? Der Index müsste doch # int sein oder nicht? Es scheint auch vernünftig zu klappen... raw_states_list[num_measurement] = raw_states print_info("The average standard deviation for the newly found states in " - + "measurement No. " + str(num_measurement) + " is " + str( - avg_std)) + + "measurement No. " + str(num_measurement) + " is " + + str(avg_std)) print_info("That is a reduction of " + str(change_avg_std)) # l_signal = measurements_by_config['offline'][num_measurement]['uW'] # l_bkpts = [s[1] for s in raw_states] @@ -681,8 +658,9 @@ if __name__ == '__main__': # TODO: MAGIC NUMBER if num_states_dev > 1: print_warning("The number of states varies strongly across measurements." - " Consider choosing a larger value for S or using the " - "pen_modifier option.") + " Consider choosing a larger range for penalty detection." + " It is also possible, that the processed data is not accurate" + " enough to produce proper results.") time.sleep(5) # TODO: Wie bekomme ich da jetzt raus, was die Wahrheit ist? # Einfach Durchschnitt nehmen? 
@@ -691,6 +669,11 @@ if __name__ == '__main__': counts = np.bincount(num_states_array) num_raw_states = np.argmax(counts) print_info("Choose " + str(num_raw_states) + " as number of raw_states.") + if num_raw_states == 1: + print_info( + "Upon further inspection it is clear that no refinement is necessary." + " The macromodel is usable for this configuration.") + continue # iterate through all found breakpoints and determine start and end points as well # as power consumption num_measurements = len(raw_states_list) @@ -729,8 +712,8 @@ if __name__ == '__main__': if num_used_measurements != len(raw_states_list): if num_used_measurements / len(raw_states_list) <= 0.5: print_warning("Only used " + str(num_used_measurements) + "/" - + str( - len(raw_states_list)) + " Measurements for refinement. " + + str(len(raw_states_list)) + + " Measurements for refinement. " + "Others did not recognize number of states correctly." + "\nYou should verify the integrity of the measurements.") else: @@ -738,22 +721,16 @@ if __name__ == '__main__': + str(len(raw_states_list)) + " Measurements for refinement." + " Others did not recognize number of states correctly.") num_used_measurements = i - # TODO: DEBUG Kram - #sys.exit(0) else: print_info("Used all available measurements.") state_durations_by_config.append((num_config, states_duration_list)) state_consumptions_by_config.append((num_config, states_consumption_list)) - # # TODO: - # if num_config == 6: - # print("BRECHE AUS") - # break # combine all state durations and consumptions to parametrized model if len(state_durations_by_config) == 0: print("No refinement necessary for this state. The macromodel is usable.") - sys.exit() + sys.exit(1) if len(state_durations_by_config) / len(configurations) > 1 / 2 \ and len(state_durations_by_config) != len(configurations): print_warning( @@ -799,10 +776,7 @@ if __name__ == '__main__': print_info("All configs usable.") else: print_info("Using only " + str(usable_configs) + " Configs.") - if num_raw_states == 1: - print_info("Upon further inspection it is clear that no refinement is necessary." 
- " The macromodel is usable.") - sys.exit(-1) + # build by_name by_name = {} usable_configs_2 = len(state_consumptions_by_config) for i in range(num_raw_states): @@ -824,7 +798,7 @@ if __name__ == '__main__': "power": consumptions_for_state, "duration": durations_for_state, "attributes": ["power", "duration"], - # Da kein richtiger Automat generiert wird, gibt es auch keine Transitionen + # Da kein "richtiger" Automat generiert wird, gibt es auch keine Transitionen "isa": "state" } by_name[state_name] = state_dict @@ -877,6 +851,7 @@ if __name__ == '__main__': paramfit.fit() fit_res_dur_dict = {} fit_res_pow_dict = {} + # fit functions and check if successful for state_name in by_name.keys(): fit_power = paramfit.get_result(state_name, "power") fit_duration = paramfit.get_result(state_name, "duration") @@ -893,6 +868,7 @@ if __name__ == '__main__': # only raw_states with the same number of function parameters can be similar num_param_pow_dict = {} num_param_dur_dict = {} + # print found substate_results for state_name in by_name.keys(): model_function = str(fit_res_pow_dict[state_name].model_function) model_args = fit_res_pow_dict[state_name].model_args @@ -911,6 +887,7 @@ if __name__ == '__main__': model_function = model_function.replace(replace_string, str(arg)) print_info("Duration-Function for state " + state_name + ": " + model_function) + # sort states in buckets for clustering similar_raw_state_buckets = {} for state_name in by_name.keys(): pow_model_function = str(fit_res_pow_dict[state_name].model_function) @@ -927,7 +904,9 @@ if __name__ == '__main__': + str(similar_raw_state_buckets[key_tuple])) similar_states = similar_raw_state_buckets[key_tuple] if len(similar_states) > 1: - # functions are identical -> num_params is identical + # only necessary to cluster if more than one raw_state has the same function + # configuration + # functions are identical -> num_params and used params are identical num_params = num_param_dur_dict[similar_states[0]] + num_param_pow_dict[ similar_states[0]] values_to_cluster = np.zeros((len(similar_states), num_params)) @@ -951,7 +930,7 @@ if __name__ == '__main__': cluster_labels = cluster.labels_ print_info("Cluster labels:\n" + str(cluster_labels)) if cluster.n_clusters_ > 1: - # more than one distinct state found + # more than one distinct state found -> seperation of raw_states necessary distinct_state_dict = {} for num_state, label in enumerate(cluster_labels): if label not in distinct_state_dict.keys(): @@ -960,6 +939,7 @@ if __name__ == '__main__': for distinct_state_key in distinct_state_dict.keys(): distinct_states.append(distinct_state_dict[distinct_state_key]) else: + # all raw_states make up this state distinct_states.append(similar_states) else: distinct_states.append(similar_states) @@ -968,6 +948,7 @@ if __name__ == '__main__': num_raw_states = len(by_name.keys()) resulting_sequence = [int] * num_raw_states for i in range(num_raw_states): + # apply the projection from raw_states to states state_name = "state_" + str(i) state_num = get_state_num(state_name, distinct_states) if state_num == -1: @@ -982,7 +963,7 @@ if __name__ == '__main__': # "\nTHE SCRIPT WILL NOW STOP PREMATURELY," # "SINCE DATA FOR FURTHER COMPUTATION IS MISSING!") # sys.exit(0) - + # parameterize all new states new_by_name = {} for num_state, distinct_state in enumerate(distinct_states): state_name = "State_" + str(num_state) @@ -1034,6 +1015,7 @@ if __name__ == '__main__': print_warning("Fitting(duration) for state " + state_name + " was not succesful!") 
new_fit_res_pow_dict[state_name] = combined_fit_power new_fit_res_dur_dict[state_name] = combined_fit_duration + # output results result_loc = os.path.join(filepath, "result" + big_state_name + ".txt") with open(result_loc, "w") as f: f.write("Resulting Sequence: " + str(resulting_sequence)) @@ -1066,9 +1048,11 @@ if __name__ == '__main__': f.write("THIS RESULT IS NOT ACCURATE. SEE WARNINGLOG TO GET A BETTER UNDERSTANDING" " WHY.") - - # TODO: removed clustering (temporarily), since it provided too much dificultys - # at the current state + # Removed clustering at this point, since it provided too much difficulties + # at the current state. Clustering is still used, but at another point of execution. + # Now parametrization is done first. raw_states are grouped by their using a dict + # where the key is [power_function, duration_dunction]. Then all raw_states from + # each bucket are clustered by their parameters # i = 0 # cluster_labels_list = [] # num_cluster_list = [] @@ -1249,21 +1233,10 @@ if __name__ == '__main__': print_info("Preprocessing file. Depending on its size, this could take a while.") preprocessed_data = raw_data.get_preprocessed_data() print_info("File fully preprocessed") - # TODO: Mal schauen, wie ich das mache. Erstmal nur mit json + # TODO: Mal schauen, wie ich das mache. Erstmal nur mit json. Ist erstmal raus. Wird nicht + # umgesetzt. print_error("Not implemented yet. Please generate .json files first with dfatool and use" " those.") else: print_error("Unknown dataformat") - sys.exit(-1) - - # print(tx_data[1]['parameter']) - # # parse json to array for PELT - # signal = np.array(tx_data[1]['offline'][0]['uW']) - # - # for i in range(0, len(signal)): - # signal[i] = signal[i]/1000 - # bkps = calc_pelt(signal, model=opt_model, range_max=opt_range_max, num_processes=opt_num_processes, jump=opt_jump, S=opt_S) - # fig, ax = rpt.display(signal, bkps) - # plt.xlabel('Time [us]') - # plt.ylabel('Power [mW]') - # plt.show() + sys.exit(-1) \ No newline at end of file -- cgit v1.2.3
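
The grouping-and-clustering strategy outlined in the final Proof_Of_Concept_PELT.py commit above (collect raw states whose fitted power and duration functions are structurally identical into buckets, then split each bucket by clustering the fitted parameter values) can be sketched roughly as follows. This is not the tool's actual code; the helper name, the column-wise normalisation and the distance threshold are illustrative assumptions only:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    def cluster_similar_raw_states(fitted_funcs, param_vectors, distance_threshold=1.0):
        # fitted_funcs:  state_name -> (power_function_str, duration_function_str)
        # param_vectors: state_name -> list of fitted regression arguments
        # Returns a list of groups; raw states in one group are treated as the same state.
        buckets = {}
        for name, func_pair in fitted_funcs.items():
            # bucket key is the pair of fitted function shapes
            buckets.setdefault(func_pair, []).append(name)

        distinct_states = []
        for names in buckets.values():
            if len(names) < 2:
                # a single raw state with this function shape cannot be merged further
                distinct_states.append(names)
                continue
            # normalise each parameter column so no single argument dominates the distance
            values = np.array([param_vectors[n] for n in names], dtype=float)
            col_max = np.abs(values).max(axis=0)
            col_max[col_max == 0] = 1.0
            values = values / col_max
            # agglomerative clustering with a distance threshold instead of a fixed
            # cluster count; each resulting label becomes one distinct state
            clustering = AgglomerativeClustering(
                n_clusters=None, distance_threshold=distance_threshold
            ).fit(values)
            for label in range(clustering.n_clusters_):
                distinct_states.append(
                    [n for n, lab in zip(names, clustering.labels_) if lab == label]
                )
        return distinct_states

    if __name__ == "__main__":
        funcs = {
            "state_0": ("a + b * x", "c"),
            "state_1": ("a + b * x", "c"),
            "state_2": ("a", "c"),
        }
        params = {
            "state_0": [10.0, 2.0, 5.0],
            "state_1": [10.2, 2.1, 5.0],
            "state_2": [7.0, 4.0],
        }
        print(cluster_similar_raw_states(funcs, params))
        # -> [['state_0', 'state_1'], ['state_2']]

The patch itself also relies on sklearn's AgglomerativeClustering over normalised parameter values; the exact normalisation and threshold used by the tool may differ from this sketch.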