From f06afc08e82be0eccbd3b438470503bedaac69d2 Mon Sep 17 00:00:00 2001 From: Daniel Friesel Date: Mon, 14 Sep 2015 21:07:10 +0200 Subject: re-enable substring matching -> use both for station lookup --- Build.PL | 1 + Changelog | 3 ++- README | 1 + lib/Travel/Status/DE/IRIS/Stations.pm | 13 +++++-------- scripts/acronyms.pl | 21 +++++++++------------ t/21-iris-stations.t | 13 ++++++++----- 6 files changed, 26 insertions(+), 26 deletions(-) diff --git a/Build.PL b/Build.PL index 07ccb44..7422f2c 100644 --- a/Build.PL +++ b/Build.PL @@ -28,6 +28,7 @@ Module::Build->new( 'List::Compare' => '0.29', 'List::MoreUtils' => 0, 'List::Util' => 0, + 'List::UtilsBy' => 0, 'LWP::UserAgent' => 0, 'Text::LevenshteinXS' => 0, 'XML::LibXML' => 0, diff --git a/Changelog b/Changelog index 296a73b..c23d539 100644 --- a/Changelog +++ b/Changelog @@ -2,9 +2,10 @@ git HEAD * Result: Add info key 900 * Station: Improve get_station matching quality by using the Levenshtein - edit distance instead of simple substring matching + edit distance instead in addition to simple substring matching * new dependency: Text::LevenshteinXS (see README for notes about drop-in replacements) + * new dependency: List::UtilsBy Travel::Status::DE::IRIS 1.02 - Tue May 26 2015 diff --git a/README b/README index 43d5eaa..a2e7ec8 100644 --- a/README +++ b/README @@ -13,6 +13,7 @@ Dependencies * DateTime::Format::Strptime * List::Compare * List::MoreUtils +* List::UtilsBy * LWP::UserAgent * Text::LevenshteinXS * XML::LibXML diff --git a/lib/Travel/Status/DE/IRIS/Stations.pm b/lib/Travel/Status/DE/IRIS/Stations.pm index 2c8b8b4..87b73e4 100644 --- a/lib/Travel/Status/DE/IRIS/Stations.pm +++ b/lib/Travel/Status/DE/IRIS/Stations.pm @@ -6,6 +6,7 @@ use 5.014; use utf8; use List::Util qw(min); +use List::UtilsBy qw(uniq_by); use List::MoreUtils qw(firstval pairwise); use Text::LevenshteinXS qw(distance); @@ -15271,17 +15272,13 @@ sub get_station_by_name { my @distances = map { distance( $nname, $_->[1] ) } @stations; my $min_dist = min(@distances); - my $minp1_dist = min( grep { $_ != $min_dist } @distances ); my @station_map = pairwise { [ $a, $b ] } @stations, @distances; - # arbitrary selection: edit distance < 5 is probably a typo, >= 5 - # probably means the station does not exist / has an odd name - if ( $min_dist < 5 ) { - return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; - } + my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations; + my @levenshtein_matches + = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; - # always return a list when the edit distance is large - return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map; + return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches ); } 1; diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl index de79b81..6ac2d16 100755 --- a/scripts/acronyms.pl +++ b/scripts/acronyms.pl @@ -16,6 +16,7 @@ use 5.014; use utf8; use List::Util qw(min); +use List::UtilsBy qw(uniq_by); use List::MoreUtils qw(firstval pairwise); use Text::LevenshteinXS qw(distance); @@ -125,7 +126,7 @@ sub normalize { } sub get_station { - my ( $name ) = @_; + my ($name) = @_; my $ds100_match = firstval { $name eq $_->[0] } @stations; @@ -137,34 +138,30 @@ sub get_station { } sub get_station_by_name { - my ( $name ) = @_; + my ($name) = @_; my $nname = lc($name); - my $actual_match = firstval { $nname eq lc($_->[1]) } @stations; + my $actual_match = firstval { $nname eq lc( $_->[1] ) } @stations; if ($actual_match) { return ($actual_match); } $nname = normalize($nname); - $actual_match = firstval { $nname eq normalize(lc($_->[1])) } @stations; + $actual_match = firstval { $nname eq normalize( lc( $_->[1] ) ) } @stations; if ($actual_match) { return ($actual_match); } my @distances = map { distance( $nname, $_->[1] ) } @stations; my $min_dist = min(@distances); - my $minp1_dist = min( grep { $_ != $min_dist } @distances ); my @station_map = pairwise { [ $a, $b ] } @stations, @distances; - # arbitrary selection: edit distance < 5 is probably a typo, >= 5 - # probably means the station does not exist / has an odd name - if ( $min_dist < 5 ) { - return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; - } + my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations; + my @levenshtein_matches + = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; - # always return a list when the edit distance is large - return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map; + return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches ); } 1; diff --git a/t/21-iris-stations.t b/t/21-iris-stations.t index c440fd0..b346525 100644 --- a/t/21-iris-stations.t +++ b/t/21-iris-stations.t @@ -2,6 +2,7 @@ use strict; use warnings; use 5.014; +use utf8; use Test::More tests => 11; @@ -59,10 +60,12 @@ is_deeply( is_deeply( [ - [ 'NKL', 'Kirchenlaibach' ], - [ 'KM', 'M\'gladbach Hbf' ], - [ 'XSRC', 'Reichenbach Kt' ] + [ 'EEBE', 'E-Bergeborbeck' ], + [ 'EEBB', 'E-Borbeck' ], + [ 'EEBS', 'E-Borbeck Süd' ], + [ 'EGAR', 'Garbeck' ], + [ 'EWBC', 'Wolbeck' ], ], - [ Travel::Status::DE::IRIS::Stations::get_station('Moenchengladbach Hbf') ], - 'get_station: partial match works (several results for very fuzzy match)' + [ Travel::Status::DE::IRIS::Stations::get_station('Borbeck') ], + 'get_station: partial match with substring and levenshtein' ); -- cgit v1.2.3