re-enable substring matching -> use both for station lookup

author: Daniel Friesel <derf@finalrewind.org> 2015-09-14 21:07:10 +0200
committer: Daniel Friesel <derf@finalrewind.org> 2015-09-14 21:07:10 +0200
commit: f06afc08e82be0eccbd3b438470503bedaac69d2 (patch)
tree: 529b6925334f7e928ce8c7dd451588b70b976c14
parent: 3cccdc35bc5b4edcc97d486cf9ed50fc7b2ca82a (diff)
6 files changed, 26 insertions, 26 deletions
diff --git a/Build.PL b/Build.PL
index 07ccb44..7422f2c 100644
--- a/Build.PL
+++ b/Build.PL
@@ -28,6 +28,7 @@ Module::Build->new(
 		'List::Compare' => '0.29',
 		'List::MoreUtils' => 0,
 		'List::Util' => 0,
+		'List::UtilsBy' => 0,
 		'LWP::UserAgent' => 0,
 		'Text::LevenshteinXS' => 0,
 		'XML::LibXML' => 0,
diff --git a/Changelog b/Changelog
index 296a73b..c23d539 100644
--- a/Changelog
+++ b/Changelog
@@ -2,9 +2,10 @@ git HEAD
 
     * Result: Add info key 900
     * Station: Improve get_station matching quality by using the Levenshtein
-      edit distance instead of simple substring matching
+      edit distance instead in addition to simple substring matching
     * new dependency: Text::LevenshteinXS (see README for notes about
       drop-in replacements)
+    * new dependency: List::UtilsBy
 
 Travel::Status::DE::IRIS 1.02 - Tue May 26 2015
 
diff --git a/README b/README
index 43d5eaa..a2e7ec8 100644
--- a/README
+++ b/README
@@ -13,6 +13,7 @@ Dependencies
 * DateTime::Format::Strptime
 * List::Compare
 * List::MoreUtils
+* List::UtilsBy
 * LWP::UserAgent
 * Text::LevenshteinXS
 * XML::LibXML
diff --git a/lib/Travel/Status/DE/IRIS/Stations.pm b/lib/Travel/Status/DE/IRIS/Stations.pm
index 2c8b8b4..87b73e4 100644
--- a/lib/Travel/Status/DE/IRIS/Stations.pm
+++ b/lib/Travel/Status/DE/IRIS/Stations.pm
@@ -6,6 +6,7 @@ use 5.014;
 use utf8;
 
 use List::Util qw(min);
+use List::UtilsBy qw(uniq_by);
 use List::MoreUtils qw(firstval pairwise);
 use Text::LevenshteinXS qw(distance);
 
@@ -15271,17 +15272,13 @@ sub get_station_by_name {
 
 	my @distances   = map { distance( $nname, $_->[1] ) } @stations;
 	my $min_dist    = min(@distances);
-	my $minp1_dist  = min( grep { $_ != $min_dist } @distances );
 	my @station_map = pairwise { [ $a, $b ] } @stations, @distances;
 
-	# arbitrary selection: edit distance < 5 is probably a typo, >= 5
-	# probably means the station does not exist / has an odd name
-	if ( $min_dist < 5 ) {
-		return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
-	}
+	my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations;
+	my @levenshtein_matches
+	  = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
 
-	# always return a list when the edit distance is large
-	return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
+	return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches );
 }
 
 1;
diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl
index de79b81..6ac2d16 100755
--- a/scripts/acronyms.pl
+++ b/scripts/acronyms.pl
@@ -16,6 +16,7 @@ use 5.014;
 use utf8;
 
 use List::Util qw(min);
+use List::UtilsBy qw(uniq_by);
 use List::MoreUtils qw(firstval pairwise);
 use Text::LevenshteinXS qw(distance);
 
@@ -125,7 +126,7 @@ sub normalize {
 }
 
 sub get_station {
-	my ( $name ) = @_;
+	my ($name) = @_;
 
 	my $ds100_match = firstval { $name eq $_->[0] } @stations;
 
@@ -137,34 +138,30 @@ sub get_station {
 }
 
 sub get_station_by_name {
-	my ( $name ) = @_;
+	my ($name) = @_;
 
 	my $nname = lc($name);
-	my $actual_match = firstval { $nname eq lc($_->[1]) } @stations;
+	my $actual_match = firstval { $nname eq lc( $_->[1] ) } @stations;
 
 	if ($actual_match) {
 		return ($actual_match);
 	}
 
 	$nname = normalize($nname);
-	$actual_match = firstval { $nname eq normalize(lc($_->[1])) } @stations;
+	$actual_match = firstval { $nname eq normalize( lc( $_->[1] ) ) } @stations;
 	if ($actual_match) {
 		return ($actual_match);
 	}
 
 	my @distances   = map { distance( $nname, $_->[1] ) } @stations;
 	my $min_dist    = min(@distances);
-	my $minp1_dist  = min( grep { $_ != $min_dist } @distances );
 	my @station_map = pairwise { [ $a, $b ] } @stations, @distances;
 
-	# arbitrary selection: edit distance < 5 is probably a typo, >= 5
-	# probably means the station does not exist / has an odd name
-	if ( $min_dist < 5 ) {
-		return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
-	}
+	my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations;
+	my @levenshtein_matches
+	  = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
 
-	# always return a list when the edit distance is large
-	return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
+	return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches );
 }
 
 1;
diff --git a/t/21-iris-stations.t b/t/21-iris-stations.t
index c440fd0..b346525 100644
--- a/t/21-iris-stations.t
+++ b/t/21-iris-stations.t
@@ -2,6 +2,7 @@
 use strict;
 use warnings;
 use 5.014;
+use utf8;
 
 use Test::More tests => 11;
 
@@ -59,10 +60,12 @@ is_deeply(
 
 is_deeply(
 	[
-		[ 'NKL',  'Kirchenlaibach' ],
-		[ 'KM',   'M\'gladbach Hbf' ],
-		[ 'XSRC', 'Reichenbach Kt' ]
+		[ 'EEBE', 'E-Bergeborbeck' ],
+		[ 'EEBB', 'E-Borbeck' ],
+		[ 'EEBS', 'E-Borbeck Süd' ],
+		[ 'EGAR', 'Garbeck' ],
+		[ 'EWBC', 'Wolbeck' ],
 	],
-	[ Travel::Status::DE::IRIS::Stations::get_station('Moenchengladbach Hbf') ],
-	'get_station: partial match works (several results for very fuzzy match)'
+	[ Travel::Status::DE::IRIS::Stations::get_station('Borbeck') ],
+	'get_station: partial match with substring and levenshtein'
 );
author	Daniel Friesel <derf@finalrewind.org>	2015-09-14 21:07:10 +0200
committer	Daniel Friesel <derf@finalrewind.org>	2015-09-14 21:07:10 +0200
commit	f06afc08e82be0eccbd3b438470503bedaac69d2 (patch)
tree	529b6925334f7e928ce8c7dd451588b70b976c14
parent	3cccdc35bc5b4edcc97d486cf9ed50fc7b2ca82a (diff)