summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rwxr-xr-x lib/dfatool.py 3
-rw-r--r-- lib/utils.py 66
2 files changed, 68 insertions, 1 deletions
diff --git a/lib/dfatool.py b/lib/dfatool.py
index 528eabc..8990aed 100755
--- a/lib/dfatool.py
+++ b/lib/dfatool.py
@@ -1014,6 +1014,7 @@ def _try_fits(by_param, state_or_tran, model_attribute, param_index, safe_functi
if not len(ref_results['mean']):
# Insufficient data for fitting
+ #print('[W] Insufficient data for fitting {}/{}/{}'.format(state_or_tran, model_attribute, param_index))
return {
'best' : None,
'best_rmsd' : np.inf,
@@ -1089,7 +1090,7 @@ def get_fit_result(results, name, attribute, verbose = False):
this_result['mean_rmsd'], this_result['median_rmsd']))
# See notes on depends_on_param
elif this_result['best_rmsd'] >= 0.8 * min(this_result['mean_rmsd'], this_result['median_rmsd']):
- vprint(verbose, '[I] Not modeling {} {} as function of {}: best ({:.0f}) is not much better than ({:.0f}, {:.0f})'.format(
+ vprint(verbose, '[I] Not modeling {} {} as function of {}: best ({:.0f}) is not much better than ref ({:.0f}, {:.0f})'.format(
name, attribute, result['key'][2], this_result['best_rmsd'],
this_result['mean_rmsd'], this_result['median_rmsd']))
else:
diff --git a/lib/utils.py b/lib/utils.py
index b748007..8d1b817 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -1,3 +1,4 @@
+import itertools
import numpy as np
import re
@@ -98,6 +99,71 @@ def param_slice_eq(a, b, index):
return True
return False
def prune_dependent_parameters(by_name, parameter_names):
    """
    Remove dependent parameters from aggregate.

    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
        by_name[name]['param'] must be a list of measurements, each of which is a list of parameter values
        (one value per entry in `parameter_names`).
        Other dict members are left as-is
    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.

    Model generation (and its components, such as relevant parameter detection and least squares optimization) only work if input variables (i.e., parameters)
    are independent of each other. This function computes the correlation coefficient for each pair of parameters, using only those measurements
    in which both parameters are numeric. A pair is considered dependent if the absolute correlation coefficient exceeds 0.5.
    For each pair of dependent parameters, the one with fewer numeric values is removed. If both have the same number of numeric values,
    the one with the higher index in `parameter_names` is removed.
    """

    # A set, as one parameter may be correlated with several others and must
    # still be removed just once.
    parameter_indices_to_remove = set()

    for index_1, index_2 in itertools.combinations(range(len(parameter_names)), 2):
        paired_values = [list(), list()] # measurements in which both parameters have a value
        numeric_count_1 = 0 # number of measurements in which parameter 1 has a value
        numeric_count_2 = 0 # number of measurements in which parameter 2 has a value

        for name in by_name:
            for measurement in by_name[name]['param']:
                value_1 = measurement[index_1]
                value_2 = measurement[index_2]
                value_1_is_numeric = is_numeric(value_1)
                value_2_is_numeric = is_numeric(value_2)
                if value_1_is_numeric:
                    numeric_count_1 += 1
                if value_2_is_numeric:
                    numeric_count_2 += 1
                if value_1_is_numeric and value_2_is_numeric:
                    paired_values[0].append(value_1)
                    paired_values[1].append(value_2)

        # A correlation coefficient is undefined for fewer than two samples
        # (np.corrcoef would return NaN and emit a warning).
        if len(paired_values[0]) < 2:
            continue

        correlation = np.corrcoef(paired_values)[0][1]

        # NaN (e.g. if one parameter is constant) never compares equal to
        # anything, including itself, so it must be checked via np.isnan.
        if np.isnan(correlation) or np.abs(correlation) <= 0.5:
            continue

        print('[!] Parameters {} <-> {} are correlated with coefficcient {}'.format(parameter_names[index_1], parameter_names[index_2], correlation))
        if numeric_count_1 < numeric_count_2:
            index_to_remove = index_1
        else:
            index_to_remove = index_2
        print(' Removing parameter {}'.format(parameter_names[index_to_remove]))
        parameter_indices_to_remove.add(index_to_remove)

    remove_parameters_by_indices(by_name, parameter_names, sorted(parameter_indices_to_remove))
+
def remove_parameters_by_indices(by_name, parameter_names, parameter_indices_to_remove):
    """
    Remove parameters listed in `parameter_indices_to_remove` from aggregate `by_name` and `parameter_names`.

    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
        by_name[name]['param'] must be a list of measurements, each of which is a list of parameter values.
        Other dict members are left as-is
    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.
    :param parameter_indices_to_remove: List of parameter indices to be removed. Duplicate entries are ignored,
        i.e., each listed index is removed exactly once.
    """

    # De-duplicate first: popping the same index twice would also remove the
    # parameter which moved into its place.
    # Start removal from the end of the list to avoid renumbering of list elements
    for parameter_index in sorted(set(parameter_indices_to_remove), reverse = True):
        for name in by_name:
            for measurement in by_name[name]['param']:
                measurement.pop(parameter_index)
        parameter_names.pop(parameter_index)
+
def compute_param_statistics(by_name, by_param, parameter_names, arg_count, state_or_trans, attribute, verbose = False):
"""
Compute standard deviation and correlation coefficient for various data partitions.