diff options
Diffstat (limited to 'scripts')
| -rwxr-xr-x | scripts/acronyms.pl | 89 | ||||
| -rwxr-xr-x | scripts/acronyms.sh | 4 | 
2 files changed, 19 insertions, 74 deletions
diff --git a/scripts/acronyms.pl b/scripts/acronyms.pl
index 6ac2d16..9c14164 100755
--- a/scripts/acronyms.pl
+++ b/scripts/acronyms.pl
@@ -3,9 +3,8 @@
 use strict;
 use warnings;
 use 5.010;
-use Encode qw(decode encode);
-use List::Util qw(max sum);
-use List::MoreUtils qw(true);
+use Encode qw(encode);
+use Text::CSV;
 
 say <<'EOF';
 package Travel::Status::DE::IRIS::Stations;
@@ -22,86 +21,32 @@ use Text::LevenshteinXS qw(distance);
 
 # TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available
 
-our $VERSION = '1.02';
+our $VERSION = '1.04';
 
 my @stations = (
 EOF
 
-my @buf;
-
-sub process_block {
-	my @histogram;
-	my @borders = (0);
-	my $run = 0;
-
-	my $length = max (map { length($_) } @buf);
-
-	for my $i (0 .. $length) {
-		$histogram[$i] = true { length($_) < $i or substr($_, $i, 1) eq q{ } } @buf;
-
-		if ($histogram[$i] == @buf) {
-			if (not $run) {
-				push(@borders, $i);
-				$run = 1;
-			}
-		}
-		else {
-			$run = 0;
-		}
-	}
-	for my $i (0 .. $#borders / 2) {
-		for my $line (@buf) {
-			my $station_offset = $borders[2 * $i];
-			my $name_offset = $borders[2 * $i + 1];
-			my $station_length = $name_offset - $station_offset;
-			my $name_length = $borders[2 * $i + 2] ? ($borders[2 * $i + 2] - $name_offset) : undef;
-
-			if (length($line) < $station_offset) {
-				next;
-			}
-
-			my $station = substr($line, $station_offset, $station_length);
-			my $name = $name_length ? substr($line, $name_offset, $name_length) : substr($line, $name_offset);
-
-			$station =~ s{^\s+}{};
-			$station =~ s{\s+$}{};
-			$station =~ s{\s+}{ }g;
-			$name =~ s{!}{ }g;
-			$name =~ s{^\s+}{};
-			$name =~ s{\s+$}{};
-			$name =~ s{\s+}{ }g;
-			$name =~ s{'}{\\'}g;
-
-			if (length($station) == 0) {
-				next;
-			}
-
-			printf("\t['%s','%s'],\n", encode('UTF-8', $station), encode('UTF-8', $name));
-		}
-	}
-}
-
+my $csv = Text::CSV->new({binary => 1, sep_char => q{;}});
 while (my $line = <STDIN>) {
-	chomp $line;
-	$line = decode('UTF-8', $line);
+#	chomp $line;
+#	$line = decode('UTF-8', $line);
 
-	if (length($line) == 0 and @buf) {
-		process_block();
-		@buf = ();
-	}
+	my $status = $csv->parse($line);
+	my @fields = $csv->fields;
 
-	if ($line !~ m{ ^ [A-Z]{2} }x and $line !~ m{ \s [A-Z]{2,5} \s }x) {
+	if ($fields[0] eq 'Abk') {
 		next;
 	}
 
-	$line =~ s{RB-Gr km}{RB-Gr!km}g;
-	$line =~ s{RB-Gr!km\s++}{RB-Gr!km!}g;
-	$line =~ s{Bad }{Bad!}g;
+	my ($station, $name, $country, $location, $valid_since) = @fields;
 
-	push(@buf, $line);
-}
-if (@buf) {
-	process_block();
+	$name =~ s{!}{ }g;
+	$name =~ s{^\s+}{};
+	$name =~ s{\s+$}{};
+	$name =~ s{\s+}{ }g;
+	$name =~ s{'}{\\'}g;
+
+	printf("\t['%s','%s'],\n", encode('UTF-8', $station), encode('UTF-8', $name));
 }
 
 say <<'EOF';
diff --git a/scripts/acronyms.sh b/scripts/acronyms.sh
index 71f90d1..57b48ce 100755
--- a/scripts/acronyms.sh
+++ b/scripts/acronyms.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
 
-curl -s http://fahrweg.dbnetze.com/file/fahrweg-de/2394144/vHBDX5OndmGwv-JTA9EzuNArX1E/2361656/data/betriebsstellen.pdf \
-| pdftotext -layout - - | perl scripts/acronyms.pl \
+curl -s http://data.deutschebahn.com/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2015-05.csv \
+| perl scripts/acronyms.pl \
 > lib/Travel/Status/DE/IRIS/Stations.pm
