use Bahn OpenData site for DS100 station list

author: Daniel Friesel <derf@finalrewind.org> 2015-11-12 11:06:10 +0100
committer: Daniel Friesel <derf@finalrewind.org> 2015-11-12 11:06:10 +0100
commit: 4339f0db26d78251fe6c4f2b1f7be768a7ed3a2c (patch)
tree: cdc0e11b1e1ebe73bc4cbad4b4c02ab66300ee5d /scripts
parent: e527672c35108f9aede265181a64dc8a1bbb26bb (diff)
2 files changed, 19 insertions, 74 deletions
diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl
index 6ac2d16..9c14164 100755
--- a/scripts/acronyms.pl
+++ b/scripts/acronyms.pl
@@ -3,9 +3,8 @@
 use strict;
 use warnings;
 use 5.010;
-use Encode qw(decode encode);
-use List::Util qw(max sum);
-use List::MoreUtils qw(true);
+use Encode qw(encode);
+use Text::CSV;
 
 say <<'EOF';
 package Travel::Status::DE::IRIS::Stations;
@@ -22,86 +21,32 @@ use Text::LevenshteinXS qw(distance);
 
 # TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available
 
-our $VERSION = '1.02';
+our $VERSION = '1.04';
 
 my @stations = (
 EOF
 
-my @buf;
-
-sub process_block {
-	my @histogram;
-	my @borders = (0);
-	my $run = 0;
-
-	my $length = max (map { length($_) } @buf);
-
-	for my $i (0 .. $length) {
-		$histogram[$i] = true { length($_) < $i or substr($_, $i, 1) eq q{ } } @buf;
-
-		if ($histogram[$i] == @buf) {
-			if (not $run) {
-				push(@borders, $i);
-				$run = 1;
-			}
-		}
-		else {
-			$run = 0;
-		}
-	}
-	for my $i (0 .. $#borders / 2) {
-		for my $line (@buf) {
-			my $station_offset = $borders[2 * $i];
-			my $name_offset = $borders[2 * $i + 1];
-			my $station_length = $name_offset - $station_offset;
-			my $name_length = $borders[2 * $i + 2] ? ($borders[2 * $i + 2] - $name_offset) : undef;
-
-			if (length($line) < $station_offset) {
-				next;
-			}
-
-			my $station = substr($line, $station_offset, $station_length);
-			my $name = $name_length ? substr($line, $name_offset, $name_length) : substr($line, $name_offset);
-
-			$station =~ s{^\s+}{};
-			$station =~ s{\s+$}{};
-			$station =~ s{\s+}{ }g;
-			$name =~ s{!}{ }g;
-			$name =~ s{^\s+}{};
-			$name =~ s{\s+$}{};
-			$name =~ s{\s+}{ }g;
-			$name =~ s{'}{\\'}g;
-
-			if (length($station) == 0) {
-				next;
-			}
-
-			printf("\t['%s','%s'],\n", encode('UTF-8', $station), encode('UTF-8', $name));
-		}
-	}
-}
-
+my $csv = Text::CSV->new({binary => 1, sep_char => q{;}});
 while (my $line = <STDIN>) {
-	chomp $line;
-	$line = decode('UTF-8', $line);
+#	chomp $line;
+#	$line = decode('UTF-8', $line);
 
-	if (length($line) == 0 and @buf) {
-		process_block();
-		@buf = ();
-	}
+	my $status = $csv->parse($line);
+	my @fields = $csv->fields;
 
-	if ($line !~ m{ ^ [A-Z]{2} }x and $line !~ m{ \s [A-Z]{2,5} \s }x) {
+	if ($fields[0] eq 'Abk') {
 		next;
 	}
 
-	$line =~ s{RB-Gr km}{RB-Gr!km}g;
-	$line =~ s{RB-Gr!km\s++}{RB-Gr!km!}g;
-	$line =~ s{Bad }{Bad!}g;
+	my ($station, $name, $country, $location, $valid_since) = @fields;
 
-	push(@buf, $line);
-}
-if (@buf) {
-	process_block();
+	$name =~ s{!}{ }g;
+	$name =~ s{^\s+}{};
+	$name =~ s{\s+$}{};
+	$name =~ s{\s+}{ }g;
+	$name =~ s{'}{\\'}g;
+
+	printf("\t['%s','%s'],\n", encode('UTF-8', $station), encode('UTF-8', $name));
 }
 
 say <<'EOF';
diff --git a/scripts/acronyms.sh b/scripts/acronyms.sh
index 71f90d1..57b48ce 100755
--- a/scripts/acronyms.sh
+++ b/scripts/acronyms.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
 
-curl -s http://fahrweg.dbnetze.com/file/fahrweg-de/2394144/vHBDX5OndmGwv-JTA9EzuNArX1E/2361656/data/betriebsstellen.pdf \
-| pdftotext -layout - - | perl scripts/acronyms.pl \
+curl -s http://data.deutschebahn.com/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2015-05.csv \
+| perl scripts/acronyms.pl \
 > lib/Travel/Status/DE/IRIS/Stations.pm
author	Daniel Friesel <derf@finalrewind.org>	2015-11-12 11:06:10 +0100
committer	Daniel Friesel <derf@finalrewind.org>	2015-11-12 11:06:10 +0100
commit	4339f0db26d78251fe6c4f2b1f7be768a7ed3a2c (patch)
tree	cdc0e11b1e1ebe73bc4cbad4b4c02ab66300ee5d /scripts
parent	e527672c35108f9aede265181a64dc8a1bbb26bb (diff)