author | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-02-21 15:55:21 +0100 |
---|---|---|
committer | Birte Kristina Friesel <birte.friesel@uos.de> | 2024-02-21 15:55:21 +0100 |
commit | 37f5fa6c5d33a6f04a65cf6e6f9c01ffab897f9b (patch) | |
tree | 94e90fbc375077e7cf7078a3ab035357cd7fbab8 /lib | |
parent | bb773703ffa9951e31acb21c09e9b1fd151836fa (diff) |
SymbolicRegressor: Add hyper-parameters and complexity score calculation
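The new hyper-parameters are read from `DFATOOL_SYMREG_*` environment variables and passed straight through to the `SymbolicRegressor` backend. The parameter names and docstrings quoted in the diff match gplearn's `SymbolicRegressor`, so gplearn is assumed in the following minimal usage sketch; the import and the toy dataset are illustrative assumptions, not part of this commit.

```python
# Sketch only: shows how the DFATOOL_SYMREG_* variables introduced in this
# commit map onto a gplearn SymbolicRegressor (gplearn assumed as backend).
import os

import numpy as np
from gplearn.genetic import SymbolicRegressor

# Hyper-parameters come from the environment, using the same defaults as the diff.
os.environ.setdefault("DFATOOL_SYMREG_GENERATIONS", "10")
os.environ.setdefault("DFATOOL_SYMREG_METRIC", "mse")

regressor = SymbolicRegressor(
    population_size=int(os.getenv("DFATOOL_SYMREG_POPULATION_SIZE", "1000")),
    generations=int(os.getenv("DFATOOL_SYMREG_GENERATIONS", "20")),
    metric=os.getenv("DFATOOL_SYMREG_METRIC", "mse"),
    function_set=tuple(
        os.getenv("DFATOOL_SYMREG_FUNCTION_SET", "add sub mul div").split()
    ),
)

# Fit on a toy dataset: y = 2 * x0 + x1, then print the evolved program.
rng = np.random.default_rng(0)
X = rng.random((200, 2))
y = 2 * X[:, 0] + X[:, 1]
regressor.fit(X, y)
print(regressor)
```

Setting, for example, `DFATOOL_SYMREG_GENERATIONS=50` or `DFATOOL_SYMREG_CONST_RANGE=none` before a fit changes the corresponding backend parameter without touching the code.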
Diffstat (limited to 'lib')
-rw-r--r-- | lib/functions.py | 79 |
1 files changed, 78 insertions, 1 deletions
diff --git a/lib/functions.py b/lib/functions.py
index fdd2cfb..07f1823 100644
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -1177,6 +1177,70 @@ class XGBoostFunction(SKLearnRegressionFunction):
 
 class SymbolicRegressionFunction(SKLearnRegressionFunction):
     def fit(self, param_values, data, ignore_param_indexes=None):
+
+        # population_size : integer, optional (default=1000)
+        # The number of programs in each generation.
+        population_size = int(os.getenv("DFATOOL_SYMREG_POPULATION_SIZE", "1000"))
+
+        # generations : integer, optional (default=20)
+        # The number of generations to evolve.
+        generations = int(os.getenv("DFATOOL_SYMREG_GENERATIONS", "20"))
+
+        # tournament_size : integer, optional (default=20)
+        # The number of programs that will compete to become part of the next
+        # generation.
+        tournament_size = int(os.getenv("DFATOOL_SYMREG_TOURNAMENT_SIZE", "20"))
+
+        # const_range : tuple of two floats, or None, optional (default=(-1., 1.))
+        # The range of constants to include in the formulas. If None then no
+        # constants will be included in the candidate programs.
+        if cr := os.getenv("DFATOOL_SYMREG_CONST_RANGE", None):
+            if cr == "none":
+                const_range = None
+            else:
+                const_range = tuple(map(float, cr.split(",")))
+        else:
+            const_range = (-1.0, 1.0)
+
+        # function_set : iterable, optional (default=('add', 'sub', 'mul', 'div'))
+        # The functions to use when building and evolving programs. This iterable
+        # can include strings to indicate either individual functions as outlined
+        # below, or you can also include your own functions as built using the
+        # ``make_function`` factory from the ``functions`` module.
+        function_set = tuple(
+            os.getenv("DFATOOL_SYMREG_FUNCTION_SET", "add sub mul div").split()
+        )
+
+        # metric : str, optional (default='mean absolute error')
+        # The name of the raw fitness metric. Available options include:
+        metric = os.getenv("DFATOOL_SYMREG_METRIC", "mse")
+
+        # parsimony_coefficient : float or "auto", optional (default=0.001)
+        # This constant penalizes large programs by adjusting their fitness to
+        # be less favorable for selection. Larger values penalize the program
+        # more which can control the phenomenon known as 'bloat'. Bloat is when
+        # evolution is increasing the size of programs without a significant
+        # increase in fitness, which is costly for computation time and makes for
+        # a less understandable final result. This parameter may need to be tuned
+        # over successive runs.
+        #
+        # If "auto" the parsimony coefficient is recalculated for each generation
+        # using c = Cov(l,f)/Var(l), where Cov(l,f) is the covariance between
+        # program size l and program fitness f in the population, and Var(l) is
+        # the variance of program sizes.
+        parsimony_coefficient = float(
+            os.getenv("DFATOOL_SYMREG_PARSIMONY_COEFFICIENT", "0.001")
+        )
+
+        # n_jobs : integer, optional (default=1)
+        # The number of jobs to run in parallel for `fit`. If -1, then the number
+        # of jobs is set to the number of cores.
+        n_jobs = int(os.getenv("DFATOOL_SYMREG_N_JOBS", "1"))
+
+        # verbose : int, optional (default=0)
+        # Controls the verbosity of the evolution building process.
+        verbose = int(os.getenv("DFATOOL_SYMREG_VERBOSE", "0"))
+
         fit_parameters, self.categorical_to_index, self.ignore_index = param_to_ndarray(
             param_values,
             with_nan=False,
@@ -1195,12 +1259,25 @@ class SymbolicRegressionFunction(SKLearnRegressionFunction):
         self._build_feature_names()
 
         self.regressor = SymbolicRegressor(
-            metric="mse", feature_names=self.feature_names
+            population_size=population_size,
+            generations=generations,
+            tournament_size=tournament_size,
+            const_range=const_range,
+            function_set=function_set,
+            metric=metric,
+            parsimony_coefficient=parsimony_coefficient,
+            n_jobs=n_jobs,
+            verbose=verbose,
+            feature_names=self.feature_names,
         )
         self.regressor.fit(fit_parameters, data)
         self.fit_success = True
         return self
 
+    def get_complexity_score(self):
+        rstr = str(self.regressor)
+        return rstr.count(",") * 2 + 1
+
 
 # first-order linear function (no feature interaction)
 class FOLFunction(ModelFunction):
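A note on the new `get_complexity_score` (this reading is an editorial assumption, not stated in the commit): `str(self.regressor)` yields the evolved expression, e.g. `add(mul(X0, X1), 0.5)`. With the default function set of binary operators, every function node contributes exactly one comma, and a binary expression tree with n internal nodes has n + 1 leaves, so `commas * 2 + 1` equals the total node count. A small sketch with a made-up expression:

```python
# Illustration of the comma-counting heuristic; the expression string is made up.
program = "add(mul(X0, X1), sub(X2, 0.5))"

commas = program.count(",")  # 3 commas: one per binary function node
nodes = commas * 2 + 1       # 3 internal nodes + 4 leaves = 7

print(commas, nodes)  # 3 7  (nodes: add, mul, sub, X0, X1, X2, 0.5)
```

With unary functions (e.g. `sqrt`, `log`) enabled via `DFATOOL_SYMREG_FUNCTION_SET`, a function node adds no comma, so the score under-counts; it is best treated as a relative complexity heuristic rather than an exact node count.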