From 3cccdc35bc5b4edcc97d486cf9ed50fc7b2ca82a Mon Sep 17 00:00:00 2001
From: Daniel Friesel
Date: Sat, 12 Sep 2015 14:03:09 +0200
Subject: use Text::Levenshtein(XS) for fuzzy station name matching

---
 scripts/acronyms.pl | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'scripts')

diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl
index 6dfba4b..de79b81 100755
--- a/scripts/acronyms.pl
+++ b/scripts/acronyms.pl
@@ -15,9 +15,13 @@ use warnings;
 use 5.014;
 use utf8;
 
-use List::MoreUtils qw(firstval);
+use List::Util qw(min);
+use List::MoreUtils qw(firstval pairwise);
+use Text::LevenshteinXS qw(distance);
 
-our $VERSION = '1.00';
+# TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available
+
+our $VERSION = '1.02';
 
 my @stations = (
 EOF
@@ -148,7 +152,19 @@ sub get_station_by_name {
         return ($actual_match);
     }
 
-    return ( grep { $_->[1] =~ m{$name}i } @stations );
+    my @distances = map { distance( $nname, $_->[1] ) } @stations;
+    my $min_dist = min(@distances);
+    my $minp1_dist = min( grep { $_ != $min_dist } @distances );
+    my @station_map = pairwise { [ $a, $b ] } @stations, @distances;
+
+    # arbitrary selection: edit distance < 5 is probably a typo, >= 5
+    # probably means the station does not exist / has an odd name
+    if ( $min_dist < 5 ) {
+        return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
+    }
+
+    # always return a list when the edit distance is large
+    return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
 }
 
 1;
@@ -238,6 +254,10 @@ None.
 
 =item * List::MoreUtils(3pm)
 
+=item * List::Util(3pm)
+
+=item * Text::LevenshteinXS(3pm)
+
 =back
 
 =head1 BUGS AND LIMITATIONS
-- 
cgit v1.2.3