diff options
author | Daniel Friesel <derf@finalrewind.org> | 2015-09-12 14:03:09 +0200 |
---|---|---|
committer | Daniel Friesel <derf@finalrewind.org> | 2015-09-12 14:03:09 +0200 |
commit | 3cccdc35bc5b4edcc97d486cf9ed50fc7b2ca82a (patch) | |
tree | dead79a7978971ebe80c84b75d2d4559360d5e14 /scripts | |
parent | 4caa67e1f2ca8ec055acf24f04d99da028e5c06f (diff) |
use Text::Levenshtein(XS) for fuzzy station name matching
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/acronyms.pl | 26 |
1 files changed, 23 insertions, 3 deletions
```diff
diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl
index 6dfba4b..de79b81 100755
--- a/scripts/acronyms.pl
+++ b/scripts/acronyms.pl
@@ -15,9 +15,13 @@ use warnings;
 use 5.014;
 use utf8;
 
-use List::MoreUtils qw(firstval);
+use List::Util qw(min);
+use List::MoreUtils qw(firstval pairwise);
+use Text::LevenshteinXS qw(distance);
 
-our $VERSION = '1.00';
+# TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available
+
+our $VERSION = '1.02';
 
 my @stations = (
 EOF
@@ -148,7 +152,19 @@ sub get_station_by_name {
         return ($actual_match);
     }
 
-    return ( grep { $_->[1] =~ m{$name}i } @stations );
+    my @distances   = map { distance( $nname, $_->[1] ) } @stations;
+    my $min_dist    = min(@distances);
+    my $minp1_dist  = min( grep { $_ != $min_dist } @distances );
+    my @station_map = pairwise { [ $a, $b ] } @stations, @distances;
+
+    # arbitrary selection: edit distance < 5 is probably a typo, >= 5
+    # probably means the station does not exist / has an odd name
+    if ( $min_dist < 5 ) {
+        return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
+    }
+
+    # always return a list when the edit distance is large
+    return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
 }
 
 1;
@@ -238,6 +254,10 @@ None.
 
 =item * List::MoreUtils(3pm)
 
+=item * List::Util(3pm)
+
+=item * Text::LevenshteinXS(3pm)
+
 =back
 
 =head1 BUGS AND LIMITATIONS
```