-rw-r--r--  README.md        |  1
-rw-r--r--  lib/paramfit.py  | 58
2 files changed, 31 insertions, 28 deletions
diff --git a/README.md b/README.md
index f2ddb75..f60f38d 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,7 @@ The following variables may be set to alter the behaviour of dfatool components.
| `DFATOOL_DTREE_SKLEARN_DECART` | **0**, 1 | Use sklearn CART ("Decision Tree Regression") algorithm for decision tree generation. Ignore scalar parameters, thus emulating the DECART algorithm. |
| `DFATOOL_DTREE_LMT` | **0**, 1 | Use [Linear Model Tree](https://github.com/cerlymarco/linear-tree) algorithm for regression tree generation. Uses binary nodes and linear functions. Overrides `FUNCTION_LEAVES` (=0) and `NONBINARY_NODES` (=0). |
| `DFATOOL_CART_MAX_DEPTH` | **0** .. *n* | maximum depth for sklearn CART. Default: unlimited. |
+| `DFATOOL_ULS_ERROR_METRIC` | **rmsd**, mae, p50, p90 | Error metric to use when selecting the best-fitting function during unsupervised least squares (ULS) regression. Least squares regression itself minimizes root mean square deviation (rmsd), hence rmsd is the default. |
| `DFATOOL_USE_XGBOOST` | **0**, 1 | Use Extreme Gradient Boosting algorithm for decision forest generation. |
| `DFATOOL_XGB_N_ESTIMATORS` | 1 .. **100** .. *n* | Number of estimators (i.e., trees) for XGBoost. |
| `DFATOOL_XGB_MAX_DEPTH` | 2 .. **10** .. *n* | Maximum XGBoost tree depth. |
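Usage note (illustration, not part of this patch): as the change to lib/paramfit.py below shows, the metric is read once at import time, so it has to be present in the environment before dfatool's paramfit module is loaded. A minimal Python sketch, assuming dfatool is driven from a script and that `dfatool.paramfit` is the import path:

```python
import os

# Must be set before the paramfit module is imported, because
# best_fit_metric is resolved once at module level (see the diff below).
os.environ["DFATOOL_ULS_ERROR_METRIC"] = "mae"  # or "rmsd" (default), "p50", "p90"

# import dfatool.paramfit  # hypothetical import path, for illustration only
```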
diff --git a/lib/paramfit.py b/lib/paramfit.py
index 7a82985..586e90d 100644
--- a/lib/paramfit.py
+++ b/lib/paramfit.py
@@ -2,6 +2,7 @@
import logging
import numpy as np
+import os
from multiprocessing import Pool
from scipy import optimize
from .functions import analytic
@@ -15,6 +16,7 @@ from .utils import (
)
logger = logging.getLogger(__name__)
+best_fit_metric = os.getenv("DFATOOL_ULS_ERROR_METRIC", "rmsd")
class ParamFit:
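For context: the hunk above is the only place where the environment is consulted; every later change merely indexes a measures dictionary by the chosen name. A minimal sketch of that lookup with invented numbers, assuming `aggregate_measures()` returns a dict keyed by the metric names listed in the README:

```python
import os

best_fit_metric = os.getenv("DFATOOL_ULS_ERROR_METRIC", "rmsd")

# Invented values; aggregate_measures() presumably returns a dict with at
# least these keys, since the code below indexes it by metric name.
mean_measures = {"rmsd": 12.3, "mae": 9.1, "p50": 8.4, "p90": 17.0}

ref_error = mean_measures[best_fit_metric]  # 12.3 with the default "rmsd"
print(ref_error)
```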
@@ -78,29 +80,29 @@ class ParamFit:
result["key"][0] == key and result["result"]["best"] is not None
): # probably due to ['best'] != None -> fit for the filtered data fails?
this_result = result["result"]
- if this_result["best_rmsd"] >= min(
- this_result["mean_rmsd"], this_result["median_rmsd"]
+ if this_result["best_err"] >= min(
+ this_result["mean_err"], this_result["median_err"]
):
logger.debug(
"Not modeling {} as function of {}: best ({:.0f}) is worse than ref ({:.0f}, {:.0f})".format(
result["key"][0],
result["key"][1],
- this_result["best_rmsd"],
- this_result["mean_rmsd"],
- this_result["median_rmsd"],
+ this_result["best_err"],
+ this_result["mean_err"],
+ this_result["median_err"],
)
)
# See notes on depends_on_param
- elif this_result["best_rmsd"] >= 0.8 * min(
- this_result["mean_rmsd"], this_result["median_rmsd"]
+ elif this_result["best_err"] >= 0.8 * min(
+ this_result["mean_err"], this_result["median_err"]
):
logger.debug(
"Not modeling {} as function of {}: best ({:.0f} %) is not much better than ref ({:.0f} % mean, {:.0f} % median)".format(
result["key"][0],
result["key"][1],
- this_result["best_rmsd"],
- this_result["mean_rmsd"],
- this_result["median_rmsd"],
+ this_result["best_err"],
+ this_result["mean_err"],
+ this_result["median_err"],
)
)
else:
@@ -130,9 +132,9 @@ def _try_fits(
:returns: a dictionary with the following elements:
best -- name of the best-fitting function (see `analytic.functions`). `None` in case of insufficient data.
- best_rmsd -- mean Root Mean Square Deviation of best-fitting function over all combinations of the remaining parameters
- mean_rmsd -- mean Root Mean Square Deviation of a reference model using the mean of its respective input data as model value
- median_rmsd -- mean Root Mean Square Deviation of a reference model using the median of its respective input data as model value
+    best_err -- mean error (as selected via DFATOOL_ULS_ERROR_METRIC) of the best-fitting function over all combinations of the remaining parameters
+ mean_err -- mean error of a reference model using the mean of its respective input data as model value
+ median_err -- mean error of a reference model using the median of its respective input data as model value
results -- mean goodness-of-fit measures for the individual functions. See `analytic.functions` for keys and `aggregate_measures` for values
:param n_by_param: measurements of a specific model attribute partitioned by parameter values.
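To make the renamed keys concrete, here is a hedged sketch of how a caller might interpret this dictionary; the values are invented and the 0.8 threshold mirrors the ParamFit check earlier in this patch:

```python
import math

fit_result = {  # invented example values
    "best": "linear",
    "best_err": 4.2,
    "mean_err": 9.7,
    "median_err": 8.9,
    "results": {},
}

if fit_result["best"] is None or math.isinf(fit_result["best_err"]):
    print("insufficient data -- keep a static model")
elif fit_result["best_err"] >= 0.8 * min(fit_result["mean_err"], fit_result["median_err"]):
    print("best function is not much better than the mean/median reference")
else:
    print(f"model the attribute as a {fit_result['best']} function of the parameter")
```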
@@ -222,16 +224,16 @@ def _try_fits(
raw_results[function_name][measure].append(error_rate)
# print(function_name, res, measures)
mean_measures = aggregate_measures(np.mean(Y), Y)
- ref_results["mean"].append(mean_measures["rmsd"])
+ ref_results["mean"].append(mean_measures[best_fit_metric])
raw_results_by_param[other_parameters]["mean"] = mean_measures
median_measures = aggregate_measures(np.median(Y), Y)
- ref_results["median"].append(median_measures["rmsd"])
+ ref_results["median"].append(median_measures[best_fit_metric])
raw_results_by_param[other_parameters]["median"] = median_measures
if not len(ref_results["mean"]):
# Insufficient data for fitting
# print('[W] Insufficient data for fitting {}'.format(param_index))
- return {"best": None, "best_rmsd": np.inf, "results": results}
+ return {"best": None, "best_err": np.inf, "results": results}
for (
other_parameter_combination,
@@ -243,15 +245,15 @@ def _try_fits(
for function_name, result in other_parameter_results.items():
if len(result) > 0:
results[function_name] = result
- rmsd = result["rmsd"]
- if rmsd < best_fit_val:
- best_fit_val = rmsd
+ err = result[best_fit_metric]
+ if err < best_fit_val:
+ best_fit_val = err
best_fit_name = function_name
results_by_param[other_parameter_combination] = {
"best": best_fit_name,
- "best_rmsd": best_fit_val,
- "mean_rmsd": results["mean"]["rmsd"],
- "median_rmsd": results["median"]["rmsd"],
+ "best_err": best_fit_val,
+ "mean_err": results["mean"][best_fit_metric],
+ "median_err": results["median"][best_fit_metric],
"results": results,
}
@@ -263,16 +265,16 @@ def _try_fits(
results[function_name] = {}
for measure in result.keys():
results[function_name][measure] = np.mean(result[measure])
- rmsd = results[function_name]["rmsd"]
- if rmsd < best_fit_val:
- best_fit_val = rmsd
+ err = results[function_name][best_fit_metric]
+ if err < best_fit_val:
+ best_fit_val = err
best_fit_name = function_name
return {
"best": best_fit_name,
- "best_rmsd": best_fit_val,
- "mean_rmsd": np.mean(ref_results["mean"]),
- "median_rmsd": np.mean(ref_results["median"]),
+ "best_err": best_fit_val,
+ "mean_err": np.mean(ref_results["mean"]),
+ "median_err": np.mean(ref_results["median"]),
"results": results,
"results_by_other_param": results_by_param,
}