diff options
author | Daniel Friesel <derf@finalrewind.org> | 2015-09-12 14:03:09 +0200 |
---|---|---|
committer | Daniel Friesel <derf@finalrewind.org> | 2015-09-12 14:03:09 +0200 |
commit | 3cccdc35bc5b4edcc97d486cf9ed50fc7b2ca82a (patch) | |
tree | dead79a7978971ebe80c84b75d2d4559360d5e14 /lib/Travel/Status/DE | |
parent | 4caa67e1f2ca8ec055acf24f04d99da028e5c06f (diff) |
use Text::Levenshtein(XS) for fuzzy station name matching
Diffstat (limited to 'lib/Travel/Status/DE')
-rw-r--r-- | lib/Travel/Status/DE/IRIS/Stations.pm | 24 |
1 files changed, 22 insertions, 2 deletions
diff --git a/lib/Travel/Status/DE/IRIS/Stations.pm b/lib/Travel/Status/DE/IRIS/Stations.pm index 7fdd6cc..2c8b8b4 100644 --- a/lib/Travel/Status/DE/IRIS/Stations.pm +++ b/lib/Travel/Status/DE/IRIS/Stations.pm @@ -5,7 +5,11 @@ use warnings; use 5.014; use utf8; -use List::MoreUtils qw(firstval); +use List::Util qw(min); +use List::MoreUtils qw(firstval pairwise); +use Text::LevenshteinXS qw(distance); + +# TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available our $VERSION = '1.02'; @@ -15265,7 +15269,19 @@ sub get_station_by_name { return ($actual_match); } - return ( grep { $_->[1] =~ m{$name}i } @stations ); + my @distances = map { distance( $nname, $_->[1] ) } @stations; + my $min_dist = min(@distances); + my $minp1_dist = min( grep { $_ != $min_dist } @distances ); + my @station_map = pairwise { [ $a, $b ] } @stations, @distances; + + # arbitrary selection: edit distance < 5 is probably a typo, >= 5 + # probably means the station does not exist / has an odd name + if ( $min_dist < 5 ) { + return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; + } + + # always return a list when the edit distance is large + return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map; } 1; @@ -15355,6 +15371,10 @@ None. =item * List::MoreUtils(3pm) +=item * List::Util(3pm) + +=item * Text::LevenshteinXS(3pm) + =back =head1 BUGS AND LIMITATIONS |