From 3cccdc35bc5b4edcc97d486cf9ed50fc7b2ca82a Mon Sep 17 00:00:00 2001 From: Daniel Friesel Date: Sat, 12 Sep 2015 14:03:09 +0200 Subject: use Text::Levenshtein(XS) for fuzzy station name matching --- Build.PL | 1 + Changelog | 4 ++++ README | 14 ++++++++++++++ bin/db-iris | 3 ++- lib/Travel/Status/DE/IRIS/Stations.pm | 24 ++++++++++++++++++++++-- scripts/acronyms.pl | 26 +++++++++++++++++++++++--- t/21-iris-stations.t | 34 +++++++++++++++++++--------------- 7 files changed, 85 insertions(+), 21 deletions(-) diff --git a/Build.PL b/Build.PL index 936a835..07ccb44 100644 --- a/Build.PL +++ b/Build.PL @@ -29,6 +29,7 @@ Module::Build->new( 'List::MoreUtils' => 0, 'List::Util' => 0, 'LWP::UserAgent' => 0, + 'Text::LevenshteinXS' => 0, 'XML::LibXML' => 0, }, sign => 1, diff --git a/Changelog b/Changelog index 9af803f..296a73b 100644 --- a/Changelog +++ b/Changelog @@ -1,6 +1,10 @@ git HEAD * Result: Add info key 900 + * Station: Improve get_station matching quality by using the Levenshtein + edit distance instead of simple substring matching + * new dependency: Text::LevenshteinXS (see README for notes about + drop-in replacements) Travel::Status::DE::IRIS 1.02 - Tue May 26 2015 diff --git a/README b/README index 33327b6..43d5eaa 100644 --- a/README +++ b/README @@ -14,8 +14,22 @@ Dependencies * List::Compare * List::MoreUtils * LWP::UserAgent +* Text::LevenshteinXS * XML::LibXML +Note about Text::LevenshteinXS: This module is old and unmaintained, but +appears to be packaged for slightly more distros than its successor +Text::Levenshtein::XS. If it is not available for your distro (and you do +not wish to build it), the following drop-in replacements are available: + +* Text::Levenshtein::XS +* Text::Levenshtein (about 10 times slower than the XS modules) + +To use them, simply run: +> sed -i 's/Text::LevenshteinXS/Text::Levenshtein::XS/g' Build.PL lib/Travel/Status/DE/IRIS/Stations.pm +or +> sed -i 's/Text::LevenshteinXS/Text::Levenshtein/g' Build.PL lib/Travel/Status/DE/IRIS/Stations.pm + Installation ------------ diff --git a/bin/db-iris b/bin/db-iris index 298c792..90368ca 100755 --- a/bin/db-iris +++ b/bin/db-iris @@ -153,7 +153,8 @@ sub get_station { else { say STDERR "The input '$input_name' is ambiguous. Please choose one " . 'of the following:'; - say STDERR join( "\n", map { $_->[1] } @stations ); + say STDERR + join( "\n", map { $_->[1] . ' (' . $_->[0] . ')' } @stations ); exit(1); } } diff --git a/lib/Travel/Status/DE/IRIS/Stations.pm b/lib/Travel/Status/DE/IRIS/Stations.pm index 7fdd6cc..2c8b8b4 100644 --- a/lib/Travel/Status/DE/IRIS/Stations.pm +++ b/lib/Travel/Status/DE/IRIS/Stations.pm @@ -5,7 +5,11 @@ use warnings; use 5.014; use utf8; -use List::MoreUtils qw(firstval); +use List::Util qw(min); +use List::MoreUtils qw(firstval pairwise); +use Text::LevenshteinXS qw(distance); + +# TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available our $VERSION = '1.02'; @@ -15265,7 +15269,19 @@ sub get_station_by_name { return ($actual_match); } - return ( grep { $_->[1] =~ m{$name}i } @stations ); + my @distances = map { distance( $nname, $_->[1] ) } @stations; + my $min_dist = min(@distances); + my $minp1_dist = min( grep { $_ != $min_dist } @distances ); + my @station_map = pairwise { [ $a, $b ] } @stations, @distances; + + # arbitrary selection: edit distance < 5 is probably a typo, >= 5 + # probably means the station does not exist / has an odd name + if ( $min_dist < 5 ) { + return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; + } + + # always return a list when the edit distance is large + return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map; } 1; @@ -15355,6 +15371,10 @@ None. =item * List::MoreUtils(3pm) +=item * List::Util(3pm) + +=item * Text::LevenshteinXS(3pm) + =back =head1 BUGS AND LIMITATIONS diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl index 6dfba4b..de79b81 100755 --- a/scripts/acronyms.pl +++ b/scripts/acronyms.pl @@ -15,9 +15,13 @@ use warnings; use 5.014; use utf8; -use List::MoreUtils qw(firstval); +use List::Util qw(min); +use List::MoreUtils qw(firstval pairwise); +use Text::LevenshteinXS qw(distance); -our $VERSION = '1.00'; +# TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available + +our $VERSION = '1.02'; my @stations = ( EOF @@ -148,7 +152,19 @@ sub get_station_by_name { return ($actual_match); } - return ( grep { $_->[1] =~ m{$name}i } @stations ); + my @distances = map { distance( $nname, $_->[1] ) } @stations; + my $min_dist = min(@distances); + my $minp1_dist = min( grep { $_ != $min_dist } @distances ); + my @station_map = pairwise { [ $a, $b ] } @stations, @distances; + + # arbitrary selection: edit distance < 5 is probably a typo, >= 5 + # probably means the station does not exist / has an odd name + if ( $min_dist < 5 ) { + return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; + } + + # always return a list when the edit distance is large + return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map; } 1; @@ -238,6 +254,10 @@ None. =item * List::MoreUtils(3pm) +=item * List::Util(3pm) + +=item * Text::LevenshteinXS(3pm) + =back =head1 BUGS AND LIMITATIONS diff --git a/t/21-iris-stations.t b/t/21-iris-stations.t index 55938c4..c440fd0 100644 --- a/t/21-iris-stations.t +++ b/t/21-iris-stations.t @@ -10,16 +10,10 @@ BEGIN { } require_ok('Travel::Status::DE::IRIS::Stations'); -my @emptypairs = grep { not (length($_->[0]) and length($_->[1])) } - Travel::Status::DE::IRIS::Stations::get_stations; +my @emptypairs = grep { not( length( $_->[0] ) and length( $_->[1] ) ) } + Travel::Status::DE::IRIS::Stations::get_stations; -is_deeply(\@emptypairs, [], 'no stations with empty code / name'); - -is_deeply( - [], - [ Travel::Status::DE::IRIS::Stations::get_station('doesnotexist') ], - 'get_station: returns empty list for no match' -); +is_deeply( \@emptypairs, [], 'no stations with empty code / name' ); is_deeply( [ [ 'EE', 'Essen Hbf' ] ], @@ -52,13 +46,23 @@ is_deeply( ); is_deeply( - [ [ 'EG', 'Gelsenk Hbf' ], [ 'EGZO', 'Gelsenk Zoo' ] ], - [ Travel::Status::DE::IRIS::Stations::get_station('Gelsenk') ], - 'get_station: partial match by name works' + [ [ 'KM', 'M\'gladbach Hbf' ] ], + [ Travel::Status::DE::IRIS::Stations::get_station('mgladbach hbf') ], + 'get_station: close fuzzy match works (one result)' +); + +is_deeply( + [ [ 'KM', 'M\'gladbach Hbf' ] ], + [ Travel::Status::DE::IRIS::Stations::get_station('Mgladbach Bbf') ], + 'get_station: close fuzzy match is case insensitive' ); is_deeply( - [ [ 'EG', 'Gelsenk Hbf' ], [ 'EGZO', 'Gelsenk Zoo' ] ], - [ Travel::Status::DE::IRIS::Stations::get_station('gelsenk') ], - 'get_station: partial match by name is case insensitive' + [ + [ 'NKL', 'Kirchenlaibach' ], + [ 'KM', 'M\'gladbach Hbf' ], + [ 'XSRC', 'Reichenbach Kt' ] + ], + [ Travel::Status::DE::IRIS::Stations::get_station('Moenchengladbach Hbf') ], + 'get_station: partial match works (several results for very fuzzy match)' ); -- cgit v1.2.3