summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Friesel <derf@finalrewind.org>2015-09-14 21:07:10 +0200
committerDaniel Friesel <derf@finalrewind.org>2015-09-14 21:07:10 +0200
commitf06afc08e82be0eccbd3b438470503bedaac69d2 (patch)
tree529b6925334f7e928ce8c7dd451588b70b976c14
parent3cccdc35bc5b4edcc97d486cf9ed50fc7b2ca82a (diff)
re-enable substring matching -> use both for station lookup
-rw-r--r--Build.PL1
-rw-r--r--Changelog3
-rw-r--r--README1
-rw-r--r--lib/Travel/Status/DE/IRIS/Stations.pm13
-rwxr-xr-xscripts/acronyms.pl21
-rw-r--r--t/21-iris-stations.t13
6 files changed, 26 insertions, 26 deletions
diff --git a/Build.PL b/Build.PL
index 07ccb44..7422f2c 100644
--- a/Build.PL
+++ b/Build.PL
@@ -28,6 +28,7 @@ Module::Build->new(
'List::Compare' => '0.29',
'List::MoreUtils' => 0,
'List::Util' => 0,
+ 'List::UtilsBy' => 0,
'LWP::UserAgent' => 0,
'Text::LevenshteinXS' => 0,
'XML::LibXML' => 0,
diff --git a/Changelog b/Changelog
index 296a73b..c23d539 100644
--- a/Changelog
+++ b/Changelog
@@ -2,9 +2,10 @@ git HEAD
* Result: Add info key 900
* Station: Improve get_station matching quality by using the Levenshtein
- edit distance instead of simple substring matching
+ edit distance instead in addition to simple substring matching
* new dependency: Text::LevenshteinXS (see README for notes about
drop-in replacements)
+ * new dependency: List::UtilsBy
Travel::Status::DE::IRIS 1.02 - Tue May 26 2015
diff --git a/README b/README
index 43d5eaa..a2e7ec8 100644
--- a/README
+++ b/README
@@ -13,6 +13,7 @@ Dependencies
* DateTime::Format::Strptime
* List::Compare
* List::MoreUtils
+* List::UtilsBy
* LWP::UserAgent
* Text::LevenshteinXS
* XML::LibXML
diff --git a/lib/Travel/Status/DE/IRIS/Stations.pm b/lib/Travel/Status/DE/IRIS/Stations.pm
index 2c8b8b4..87b73e4 100644
--- a/lib/Travel/Status/DE/IRIS/Stations.pm
+++ b/lib/Travel/Status/DE/IRIS/Stations.pm
@@ -6,6 +6,7 @@ use 5.014;
use utf8;
use List::Util qw(min);
+use List::UtilsBy qw(uniq_by);
use List::MoreUtils qw(firstval pairwise);
use Text::LevenshteinXS qw(distance);
@@ -15271,17 +15272,13 @@ sub get_station_by_name {
my @distances = map { distance( $nname, $_->[1] ) } @stations;
my $min_dist = min(@distances);
- my $minp1_dist = min( grep { $_ != $min_dist } @distances );
my @station_map = pairwise { [ $a, $b ] } @stations, @distances;
- # arbitrary selection: edit distance < 5 is probably a typo, >= 5
- # probably means the station does not exist / has an odd name
- if ( $min_dist < 5 ) {
- return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
- }
+ my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations;
+ my @levenshtein_matches
+ = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
- # always return a list when the edit distance is large
- return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
+ return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches );
}
1;
diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl
index de79b81..6ac2d16 100755
--- a/scripts/acronyms.pl
+++ b/scripts/acronyms.pl
@@ -16,6 +16,7 @@ use 5.014;
use utf8;
use List::Util qw(min);
+use List::UtilsBy qw(uniq_by);
use List::MoreUtils qw(firstval pairwise);
use Text::LevenshteinXS qw(distance);
@@ -125,7 +126,7 @@ sub normalize {
}
sub get_station {
- my ( $name ) = @_;
+ my ($name) = @_;
my $ds100_match = firstval { $name eq $_->[0] } @stations;
@@ -137,34 +138,30 @@ sub get_station {
}
sub get_station_by_name {
- my ( $name ) = @_;
+ my ($name) = @_;
my $nname = lc($name);
- my $actual_match = firstval { $nname eq lc($_->[1]) } @stations;
+ my $actual_match = firstval { $nname eq lc( $_->[1] ) } @stations;
if ($actual_match) {
return ($actual_match);
}
$nname = normalize($nname);
- $actual_match = firstval { $nname eq normalize(lc($_->[1])) } @stations;
+ $actual_match = firstval { $nname eq normalize( lc( $_->[1] ) ) } @stations;
if ($actual_match) {
return ($actual_match);
}
my @distances = map { distance( $nname, $_->[1] ) } @stations;
my $min_dist = min(@distances);
- my $minp1_dist = min( grep { $_ != $min_dist } @distances );
my @station_map = pairwise { [ $a, $b ] } @stations, @distances;
- # arbitrary selection: edit distance < 5 is probably a typo, >= 5
- # probably means the station does not exist / has an odd name
- if ( $min_dist < 5 ) {
- return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
- }
+ my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations;
+ my @levenshtein_matches
+ = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
- # always return a list when the edit distance is large
- return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
+ return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches );
}
1;
diff --git a/t/21-iris-stations.t b/t/21-iris-stations.t
index c440fd0..b346525 100644
--- a/t/21-iris-stations.t
+++ b/t/21-iris-stations.t
@@ -2,6 +2,7 @@
use strict;
use warnings;
use 5.014;
+use utf8;
use Test::More tests => 11;
@@ -59,10 +60,12 @@ is_deeply(
is_deeply(
[
- [ 'NKL', 'Kirchenlaibach' ],
- [ 'KM', 'M\'gladbach Hbf' ],
- [ 'XSRC', 'Reichenbach Kt' ]
+ [ 'EEBE', 'E-Bergeborbeck' ],
+ [ 'EEBB', 'E-Borbeck' ],
+ [ 'EEBS', 'E-Borbeck Süd' ],
+ [ 'EGAR', 'Garbeck' ],
+ [ 'EWBC', 'Wolbeck' ],
],
- [ Travel::Status::DE::IRIS::Stations::get_station('Moenchengladbach Hbf') ],
- 'get_station: partial match works (several results for very fuzzy match)'
+ [ Travel::Status::DE::IRIS::Stations::get_station('Borbeck') ],
+ 'get_station: partial match with substring and levenshtein'
);