summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorDaniel Friesel <derf@finalrewind.org>2015-09-12 14:03:09 +0200
committerDaniel Friesel <derf@finalrewind.org>2015-09-12 14:03:09 +0200
commit3cccdc35bc5b4edcc97d486cf9ed50fc7b2ca82a (patch)
treedead79a7978971ebe80c84b75d2d4559360d5e14 /scripts
parent4caa67e1f2ca8ec055acf24f04d99da028e5c06f (diff)
use Text::Levenshtein(XS) for fuzzy station name matching
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/acronyms.pl26
1 files changed, 23 insertions, 3 deletions
diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl
index 6dfba4b..de79b81 100755
--- a/scripts/acronyms.pl
+++ b/scripts/acronyms.pl
@@ -15,9 +15,13 @@ use warnings;
use 5.014;
use utf8;
-use List::MoreUtils qw(firstval);
+use List::Util qw(min);
+use List::MoreUtils qw(firstval pairwise);
+use Text::LevenshteinXS qw(distance);
-our $VERSION = '1.00';
+# TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available
+
+our $VERSION = '1.02';
my @stations = (
EOF
@@ -148,7 +152,19 @@ sub get_station_by_name {
return ($actual_match);
}
- return ( grep { $_->[1] =~ m{$name}i } @stations );
+ my @distances = map { distance( $nname, $_->[1] ) } @stations;
+ my $min_dist = min(@distances);
+ my $minp1_dist = min( grep { $_ != $min_dist } @distances );
+ my @station_map = pairwise { [ $a, $b ] } @stations, @distances;
+
+ # arbitrary selection: edit distance < 5 is probably a typo, >= 5
+ # probably means the station does not exist / has an odd name
+ if ( $min_dist < 5 ) {
+ return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
+ }
+
+ # always return a list when the edit distance is large
+ return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
}
1;
@@ -238,6 +254,10 @@ None.
=item * List::MoreUtils(3pm)
+=item * List::Util(3pm)
+
+=item * Text::LevenshteinXS(3pm)
+
=back
=head1 BUGS AND LIMITATIONS