2 files changed, 68 insertions, 1 deletions
diff --git a/lib/dfatool.py b/lib/dfatool.py
index 528eabc..8990aed 100755
--- a/lib/dfatool.py
+++ b/lib/dfatool.py
@@ -1014,6 +1014,7 @@ def _try_fits(by_param, state_or_tran, model_attribute, param_index, safe_functi
 
     if not len(ref_results['mean']):
         # Insufficient data for fitting
+        #print('[W] Insufficient data for fitting {}/{}/{}'.format(state_or_tran, model_attribute, param_index))
         return {
             'best' : None,
             'best_rmsd' : np.inf,
@@ -1089,7 +1090,7 @@ def get_fit_result(results, name, attribute, verbose = False):
                     this_result['mean_rmsd'], this_result['median_rmsd']))
             # See notes on depends_on_param
             elif this_result['best_rmsd'] >= 0.8 * min(this_result['mean_rmsd'], this_result['median_rmsd']):
-                vprint(verbose, '[I] Not modeling {} {} as function of {}: best ({:.0f}) is not much better than ({:.0f}, {:.0f})'.format(
+                vprint(verbose, '[I] Not modeling {} {} as function of {}: best ({:.0f}) is not much better than ref ({:.0f}, {:.0f})'.format(
                     name, attribute, result['key'][2], this_result['best_rmsd'],
                     this_result['mean_rmsd'], this_result['median_rmsd']))
             else:
diff --git a/lib/utils.py b/lib/utils.py
index b748007..8d1b817 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -1,3 +1,4 @@
+import itertools
 import numpy as np
 import re
 
@@ -98,6 +99,71 @@ def param_slice_eq(a, b, index):
         return True
     return False
 
+def prune_dependent_parameters(by_name, parameter_names):
+    """
+    Remove dependent parameters from aggregate.
+
+    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
+        by_name[name][attribute] must be a list or 1-D numpy array.
+        by_name[stanamete_or_trans]['param'] must be a list of parameter values.
+        Other dict members are left as-is
+    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.
+
+    Model generation (and its components, such as relevant parameter detection and least squares optimization) only work if input variables (i.e., parameters)
+    are independent of each other. This function computes the correlation coefficient for each pair of parameters and removes those which depend on each other.
+    For each pair of dependent parameters, the lexically greater one is removed (e.g. "a" and "b" -> "b" is removed).
+    """
+
+    parameter_indices_to_remove = list()
+    for parameter_combination in itertools.product(range(len(parameter_names)), range(len(parameter_names))):
+        index_1, index_2 = parameter_combination
+        if index_1 >= index_2:
+            continue
+        parameter_values = [list(), list()] # both parameters have a value
+        parameter_values_1 = list() # parameter 1 has a value
+        parameter_values_2 = list() # parameter 2 has a value
+        for name in by_name:
+            for measurement in by_name[name]['param']:
+                value_1 = measurement[index_1]
+                value_2 = measurement[index_2]
+                if is_numeric(value_1):
+                    parameter_values_1.append(value_1)
+                if is_numeric(value_2):
+                    parameter_values_2.append(value_2)
+                if is_numeric(value_1) and is_numeric(value_2):
+                    parameter_values[0].append(value_1)
+                    parameter_values[1].append(value_2)
+        if len(parameter_values[0]):
+            correlation = np.corrcoef(parameter_values)[0][1]
+            if correlation != np.nan and np.abs(correlation) > 0.5:
+                print('[!] Parameters {} <-> {} are correlated with coefficcient {}'.format(parameter_names[index_1], parameter_names[index_2], correlation))
+                if len(parameter_values_1) < len(parameter_values_2):
+                    index_to_remove = index_1
+                else:
+                    index_to_remove = index_2
+                print('    Removing parameter {}'.format(parameter_names[index_to_remove]))
+                parameter_indices_to_remove.append(index_to_remove)
+    remove_parameters_by_indices(by_name, parameter_names, parameter_indices_to_remove)
+
+def remove_parameters_by_indices(by_name, parameter_names, parameter_indices_to_remove):
+    """
+    Remove parameters listed in `parameter_indices` from aggregate `by_name` and `parameter_names`.
+
+    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
+        by_name[name][attribute] must be a list or 1-D numpy array.
+        by_name[stanamete_or_trans]['param'] must be a list of parameter values.
+        Other dict members are left as-is
+    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.
+    :param parameter_indices_to_remove: List of parameter indices to be removed
+    """
+
+    # Start removal from the end of the list to avoid renumbering of list elemenets
+    for parameter_index in sorted(parameter_indices_to_remove, reverse = True):
+        for name in by_name:
+            for measurement in by_name[name]['param']:
+                measurement.pop(parameter_index)
+        parameter_names.pop(parameter_index)
+
 def compute_param_statistics(by_name, by_param, parameter_names, arg_count, state_or_trans, attribute, verbose = False):
     """
     Compute standard deviation and correlation coefficient for various data partitions.