summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Friesel <daniel.friesel@uos.de>2019-10-08 08:22:50 +0200
committerDaniel Friesel <daniel.friesel@uos.de>2019-10-08 08:22:50 +0200
commit30926c1b1625978b66179b01978bebd3d3243623 (patch)
tree1a3f4c88be9bba7c2e6ac4f95ceede8e6fd82960
parent3a02d19162975b619ea11259ce763c1902837b81 (diff)
add legacy genetic programming (symbolic regression) benchmark
-rwxr-xr-xbin/gptest.py88
1 files changed, 88 insertions, 0 deletions
diff --git a/bin/gptest.py b/bin/gptest.py
new file mode 100755
index 0000000..dd29a2a
--- /dev/null
+++ b/bin/gptest.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+
+import sys
+import numpy as np
+from dfatool import PTAModel, RawData, regression_measures, pta_trace_to_aggregate
+from gplearn.genetic import SymbolicRegressor
+from multiprocessing import Pool
+
+def splitidx_srs(length):
+ shuffled = np.random.permutation(np.arange(length))
+ border = int(length * float(2) / 3)
+ training = shuffled[:border]
+ validation = shuffled[border:]
+ return (training, validation)
+
+def _gp_fit(arg):
+ param = arg[0]
+ X = arg[1]
+ Y = arg[2]
+ est_gp = SymbolicRegressor(
+ population_size = param[0],
+ generations = 450,
+ parsimony_coefficient = param[1],
+ function_set = param[2].split(' '),
+ const_range = (-param[3], param[3])
+ )
+
+ training, validation = splitidx_srs(len(Y))
+ X_train = X[training]
+ Y_train = Y[training]
+ X_validation = X[validation]
+ Y_validation = Y[validation]
+
+ try:
+ est_gp.fit(X_train, Y_train)
+ return (param, str(est_gp._program), est_gp._program.raw_fitness_, regression_measures(est_gp.predict(X_validation), Y_validation))
+ except Exception as e:
+ return (param, 'Exception: {}'.format(str(e)), 999999999)
+
+
+if __name__ == '__main__':
+ population_size = [100, 500, 1000, 2000, 5000, 10000]
+ parsimony_coefficient = [0.1, 0.5, 0.1, 1]
+ function_set = ['add mul', 'add mul sub div', 'add mul sub div sqrt log inv']
+ const_lim = [100000, 50000, 10000, 1000, 500, 10, 1]
+ filenames = sys.argv[4:]
+ raw_data = RawData(filenames)
+
+ preprocessed_data = raw_data.get_preprocessed_data()
+ by_name, parameters, arg_count = pta_trace_to_aggregate(preprocessed_data)
+ model = PTAModel(by_name, parameters, arg_count, traces = preprocessed_data)
+
+ by_param = model.by_param
+
+ state_or_tran = sys.argv[1]
+
+ model_attribute = sys.argv[2]
+
+ dimension = int(sys.argv[3])
+
+ X = [[] for i in range(dimension)]
+ Y = []
+
+
+ for key, val in by_param.items():
+ if key[0] == state_or_tran and len(key[1]) == dimension:
+ Y.extend(val[model_attribute])
+ for i in range(dimension):
+ X[i].extend([float(key[1][i])] * len(val[model_attribute]))
+
+
+ X = np.array(X)
+ Y = np.array(Y)
+
+ paramqueue = []
+
+ for popsize in population_size:
+ for coef in parsimony_coefficient:
+ for fs in function_set:
+ for cl in const_lim:
+ for i in range(10):
+ paramqueue.append(((popsize, coef, fs, cl), X.T, Y))
+
+ with Pool() as pool:
+ results = pool.map(_gp_fit, paramqueue)
+
+ for res in sorted(results, key=lambda r: r[2]):
+ print('{} {:.0f} ({:.0f})\n{}'.format(res[0], res[3]['mae'], res[2], res[1]))