diff options
Diffstat (limited to 'lib')
-rwxr-xr-x | lib/dfatool.py | 3 | ||||
-rw-r--r-- | lib/utils.py | 66 |
2 files changed, 68 insertions, 1 deletions
import itertools
import re

import numpy as np


def prune_dependent_parameters(by_name, parameter_names, correlation_threshold=0.5):
    """
    Remove dependent parameters from aggregate.

    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
        by_name[name][attribute] must be a list or 1-D numpy array.
        by_name[name]['param'] must be a list of parameter value lists (one list per measurement).
        Other dict members are left as-is.
    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.
    :param correlation_threshold: Two parameters are considered dependent if the absolute value
        of their correlation coefficient exceeds this value. Defaults to 0.5.

    Model generation (and its components, such as relevant parameter detection and least squares optimization) only works
    if input variables (i.e., parameters) are independent of each other. This function computes the correlation
    coefficient for each pair of parameters, using only those measurements in which both parameters are numeric,
    and removes one parameter of each dependent pair.

    For each pair of dependent parameters, the parameter with fewer numeric values across all measurements is removed.
    If both have the same number of numeric values, the one which appears later in `parameter_names` is removed.
    """

    # A set, because the same parameter may be selected for removal by several pairs
    # (e.g. three mutually correlated parameters) and must only be removed once.
    parameter_indices_to_remove = set()

    # combinations() yields each unordered pair exactly once, with index_1 < index_2.
    for index_1, index_2 in itertools.combinations(range(len(parameter_names)), 2):
        paired_values = [list(), list()]  # measurements in which both parameters have a numeric value
        numeric_count_1 = 0  # number of measurements in which parameter 1 has a numeric value
        numeric_count_2 = 0  # number of measurements in which parameter 2 has a numeric value

        for name in by_name:
            for measurement in by_name[name]['param']:
                value_1 = measurement[index_1]
                value_2 = measurement[index_2]
                numeric_1 = is_numeric(value_1)
                numeric_2 = is_numeric(value_2)
                if numeric_1:
                    numeric_count_1 += 1
                if numeric_2:
                    numeric_count_2 += 1
                if numeric_1 and numeric_2:
                    paired_values[0].append(value_1)
                    paired_values[1].append(value_2)

        # The correlation coefficient is undefined (NaN) for fewer than two samples.
        if len(paired_values[0]) < 2:
            continue

        correlation = np.corrcoef(paired_values)[0][1]

        # NaN (e.g. one parameter is constant) never compares equal to anything,
        # including itself, so it must be detected with np.isnan.
        if np.isnan(correlation) or np.abs(correlation) <= correlation_threshold:
            continue

        print('[!] Parameters {} <-> {} are correlated with coefficcient {}'.format(
            parameter_names[index_1], parameter_names[index_2], correlation))
        if numeric_count_1 < numeric_count_2:
            index_to_remove = index_1
        else:
            index_to_remove = index_2
        print(' Removing parameter {}'.format(parameter_names[index_to_remove]))
        parameter_indices_to_remove.add(index_to_remove)

    remove_parameters_by_indices(by_name, parameter_names, parameter_indices_to_remove)


def remove_parameters_by_indices(by_name, parameter_names, parameter_indices_to_remove):
    """
    Remove parameters listed in `parameter_indices_to_remove` from aggregate `by_name` and `parameter_names`.

    :param by_name: measurements partitioned by state/transition/... name and attribute, edited in-place.
        by_name[name][attribute] must be a list or 1-D numpy array.
        by_name[name]['param'] must be a list of parameter value lists (one list per measurement).
        Other dict members are left as-is.
    :param parameter_names: List of parameter names in the order they are used in by_name[name]['param'], edited in-place.
    :param parameter_indices_to_remove: Iterable of parameter indices to be removed.
        Duplicate indices are ignored, i.e., each listed parameter is removed exactly once.
    """

    # De-duplicate first: popping the same index twice would remove an unrelated parameter.
    # Start removal from the end of the list to avoid renumbering of list elements.
    for parameter_index in sorted(set(parameter_indices_to_remove), reverse=True):
        for name in by_name:
            for measurement in by_name[name]['param']:
                measurement.pop(parameter_index)
        parameter_names.pop(parameter_index)