summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Build.PL1
-rw-r--r--Changelog4
-rw-r--r--README14
-rwxr-xr-xbin/db-iris3
-rw-r--r--lib/Travel/Status/DE/IRIS/Stations.pm24
-rwxr-xr-xscripts/acronyms.pl26
-rw-r--r--t/21-iris-stations.t34
7 files changed, 85 insertions, 21 deletions
diff --git a/Build.PL b/Build.PL
index 936a835..07ccb44 100644
--- a/Build.PL
+++ b/Build.PL
@@ -29,6 +29,7 @@ Module::Build->new(
'List::MoreUtils' => 0,
'List::Util' => 0,
'LWP::UserAgent' => 0,
+ 'Text::LevenshteinXS' => 0,
'XML::LibXML' => 0,
},
sign => 1,
diff --git a/Changelog b/Changelog
index 9af803f..296a73b 100644
--- a/Changelog
+++ b/Changelog
@@ -1,6 +1,10 @@
git HEAD
* Result: Add info key 900
+ * Station: Improve get_station matching quality by using the Levenshtein
+ edit distance instead of simple substring matching
+ * new dependency: Text::LevenshteinXS (see README for notes about
+ drop-in replacements)
Travel::Status::DE::IRIS 1.02 - Tue May 26 2015
diff --git a/README b/README
index 33327b6..43d5eaa 100644
--- a/README
+++ b/README
@@ -14,8 +14,22 @@ Dependencies
* List::Compare
* List::MoreUtils
* LWP::UserAgent
+* Text::LevenshteinXS
* XML::LibXML
+Note about Text::LevenshteinXS: This module is old and unmaintained, but
+appears to be packaged for slightly more distros than its successor
+Text::Levenshtein::XS. If it is not available for your distro (and you do
+not wish to build it), the following drop-in replacements are available:
+
+* Text::Levenshtein::XS
+* Text::Levenshtein (about 10 times slower than the XS modules)
+
+To use them, simply run:
+> sed -i 's/Text::LevenshteinXS/Text::Levenshtein::XS/g' Build.PL lib/Travel/Status/DE/IRIS/Stations.pm
+or
+> sed -i 's/Text::LevenshteinXS/Text::Levenshtein/g' Build.PL lib/Travel/Status/DE/IRIS/Stations.pm
+
Installation
------------
diff --git a/bin/db-iris b/bin/db-iris
index 298c792..90368ca 100755
--- a/bin/db-iris
+++ b/bin/db-iris
@@ -153,7 +153,8 @@ sub get_station {
else {
say STDERR "The input '$input_name' is ambiguous. Please choose one "
. 'of the following:';
- say STDERR join( "\n", map { $_->[1] } @stations );
+ say STDERR
+ join( "\n", map { $_->[1] . ' (' . $_->[0] . ')' } @stations );
exit(1);
}
}
diff --git a/lib/Travel/Status/DE/IRIS/Stations.pm b/lib/Travel/Status/DE/IRIS/Stations.pm
index 7fdd6cc..2c8b8b4 100644
--- a/lib/Travel/Status/DE/IRIS/Stations.pm
+++ b/lib/Travel/Status/DE/IRIS/Stations.pm
@@ -5,7 +5,11 @@ use warnings;
use 5.014;
use utf8;
-use List::MoreUtils qw(firstval);
+use List::Util qw(min);
+use List::MoreUtils qw(firstval pairwise);
+use Text::LevenshteinXS qw(distance);
+
+# TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available
our $VERSION = '1.02';
@@ -15265,7 +15269,19 @@ sub get_station_by_name {
return ($actual_match);
}
- return ( grep { $_->[1] =~ m{$name}i } @stations );
+ my @distances = map { distance( $nname, $_->[1] ) } @stations;
+ my $min_dist = min(@distances);
+ my $minp1_dist = min( grep { $_ != $min_dist } @distances );
+ my @station_map = pairwise { [ $a, $b ] } @stations, @distances;
+
+ # arbitrary selection: edit distance < 5 is probably a typo, >= 5
+ # probably means the station does not exist / has an odd name
+ if ( $min_dist < 5 ) {
+ return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
+ }
+
+ # always return a list when the edit distance is large
+ return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
}
1;
@@ -15355,6 +15371,10 @@ None.
=item * List::MoreUtils(3pm)
+=item * List::Util(3pm)
+
+=item * Text::LevenshteinXS(3pm)
+
=back
=head1 BUGS AND LIMITATIONS
diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl
index 6dfba4b..de79b81 100755
--- a/scripts/acronyms.pl
+++ b/scripts/acronyms.pl
@@ -15,9 +15,13 @@ use warnings;
use 5.014;
use utf8;
-use List::MoreUtils qw(firstval);
+use List::Util qw(min);
+use List::MoreUtils qw(firstval pairwise);
+use Text::LevenshteinXS qw(distance);
-our $VERSION = '1.00';
+# TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available
+
+our $VERSION = '1.02';
my @stations = (
EOF
@@ -148,7 +152,19 @@ sub get_station_by_name {
return ($actual_match);
}
- return ( grep { $_->[1] =~ m{$name}i } @stations );
+ my @distances = map { distance( $nname, $_->[1] ) } @stations;
+ my $min_dist = min(@distances);
+ my $minp1_dist = min( grep { $_ != $min_dist } @distances );
+ my @station_map = pairwise { [ $a, $b ] } @stations, @distances;
+
+ # arbitrary selection: edit distance < 5 is probably a typo, >= 5
+ # probably means the station does not exist / has an odd name
+ if ( $min_dist < 5 ) {
+ return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
+ }
+
+ # always return a list when the edit distance is large
+ return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
}
1;
@@ -238,6 +254,10 @@ None.
=item * List::MoreUtils(3pm)
+=item * List::Util(3pm)
+
+=item * Text::LevenshteinXS(3pm)
+
=back
=head1 BUGS AND LIMITATIONS
diff --git a/t/21-iris-stations.t b/t/21-iris-stations.t
index 55938c4..c440fd0 100644
--- a/t/21-iris-stations.t
+++ b/t/21-iris-stations.t
@@ -10,16 +10,10 @@ BEGIN {
}
require_ok('Travel::Status::DE::IRIS::Stations');
-my @emptypairs = grep { not (length($_->[0]) and length($_->[1])) }
- Travel::Status::DE::IRIS::Stations::get_stations;
+my @emptypairs = grep { not( length( $_->[0] ) and length( $_->[1] ) ) }
+ Travel::Status::DE::IRIS::Stations::get_stations;
-is_deeply(\@emptypairs, [], 'no stations with empty code / name');
-
-is_deeply(
- [],
- [ Travel::Status::DE::IRIS::Stations::get_station('doesnotexist') ],
- 'get_station: returns empty list for no match'
-);
+is_deeply( \@emptypairs, [], 'no stations with empty code / name' );
is_deeply(
[ [ 'EE', 'Essen Hbf' ] ],
@@ -52,13 +46,23 @@ is_deeply(
);
is_deeply(
- [ [ 'EG', 'Gelsenk Hbf' ], [ 'EGZO', 'Gelsenk Zoo' ] ],
- [ Travel::Status::DE::IRIS::Stations::get_station('Gelsenk') ],
- 'get_station: partial match by name works'
+ [ [ 'KM', 'M\'gladbach Hbf' ] ],
+ [ Travel::Status::DE::IRIS::Stations::get_station('mgladbach hbf') ],
+ 'get_station: close fuzzy match works (one result)'
+);
+
+is_deeply(
+ [ [ 'KM', 'M\'gladbach Hbf' ] ],
+ [ Travel::Status::DE::IRIS::Stations::get_station('Mgladbach Bbf') ],
+ 'get_station: close fuzzy match is case insensitive'
);
is_deeply(
- [ [ 'EG', 'Gelsenk Hbf' ], [ 'EGZO', 'Gelsenk Zoo' ] ],
- [ Travel::Status::DE::IRIS::Stations::get_station('gelsenk') ],
- 'get_station: partial match by name is case insensitive'
+ [
+ [ 'NKL', 'Kirchenlaibach' ],
+ [ 'KM', 'M\'gladbach Hbf' ],
+ [ 'XSRC', 'Reichenbach Kt' ]
+ ],
+ [ Travel::Status::DE::IRIS::Stations::get_station('Moenchengladbach Hbf') ],
+ 'get_station: partial match works (several results for very fuzzy match)'
);