#!/usr/bin/env perl # Copyright (C) 2020 Daniel Friesel # # SPDX-License-Identifier: BSD-2-Clause use strict; use warnings; use 5.020; use utf8; use File::Slurp qw(write_file); use IPC::Run3; use JSON; use List::Util qw(uniq); sub show_usage { my ($exit_code) = @_; say STDERR "Usage: db-wagenreihung-to-json \n"; say STDERR "You can obtain the latest PDF from "; exit $exit_code; } my @weekdays = (qw(Mo Di Mi Do Fr Sa So)); my @months = (qw(0 I II III IV V VI VII VIII IX X XI XII)); my %weekday = map { ( $weekdays[$_] => $_ ) } ( 0 .. $#weekdays ); my %month = map { ( $months[$_] => $_ ) } ( 0 .. $#months ); my %valid; my $year; sub parse_weekday_range { my ($range) = @_; my $ret = [ ( \0 ) x 7 ]; if ( length($range) == 2 ) { $ret->[ $weekday{$range} ] = \1; return $ret; } my ( $start, $end ) = split( qr{-}, $range ); for my $i ( $weekday{$start} .. $weekday{$end} ) { $ret->[$i] = \1; } return $ret; } sub parse_weekdays { my ($text) = @_; if ( $text eq 'tgl.' ) { return [ ( \1 ) x 7 ]; } my $ret = [ ( \0 ) x 7 ]; for my $range ( split( qr{[+]}, $text ) ) { my $new_ret = parse_weekday_range($range); for my $i ( 0 .. 6 ) { if ( ${ $new_ret->[$i] } ) { $ret->[$i] = \1; } } } return $ret; } sub parse_dates { my ($text) = @_; my @valid_ranges; my @valid_dates; my @invalid_dates; my $mode = "range"; for my $date ( split( qr{,\s*}, $text ) ) { if ( $date =~ m{^auch} ) { $mode = "extra"; } elsif ( $date =~ m{^nicht} ) { $mode = "except"; } elsif ( $date =~ m{^(ab|bis)} ) { $mode = "range"; } if ( $mode eq 'range' and $date =~ m{ ^ ab \s* (? \d{2} ) \. (? [^.]* ) }x ) { push( @valid_ranges, [ [ $+{month}, $+{day} ], undef ] ); } elsif ( $mode eq 'range' and $date =~ m{ ^ bis \s* (? \d{2} ) \. (? [^.]* ) }x ) { push( @valid_ranges, [ undef, [ $+{month}, $+{day} ] ] ); } elsif ( $date =~ m{ ^ (? \d{2} ) \. (? [^-]+ )\. - (? \d{2} ) \. (? [^.]* ) }x ) { push( @valid_ranges, [ [ $+{fm}, $+{fd} ], [ $+{tm}, $+{td} ] ] ); } } my %ret = ( valid => [], invalid => [], ); for my $range (@valid_ranges) { my $from_date = $valid{from}; my $through_date = $valid{through}; if ( $range->[0] ) { $from_date = sprintf( '%04d-%02d-%02d', $year, $month{ $range->[0][0] }, $range->[0][1] ); } if ( $range->[1] ) { $through_date = sprintf( '%04d-%02d-%02d', $year, $month{ $range->[1][0] }, $range->[1][1] ); } push( @{ $ret{valid} }, "${from_date}/${through_date}" ); } return %ret; } sub parse_condition { my ($line) = @_; my %ret; if ( $line =~ m{ ^ \s* (? .*? ) \s - \s (? [^,]+ ) , \s* (? (Mo|Di|Mi|Do|Fr|Sa|So|tgl[.]|[+]|-)+) \s* (? .* ) $ }x ) { $ret{from} = $+{from}; $ret{to} = $+{to}; $ret{weekdays} = parse_weekdays( $+{weekdays} ); my %validity = parse_dates( $+{dates} ); while ( my ( $k, $v ) = each %validity ) { $ret{$k} = $v; } $ret{raw} = $line; } else { $ret{raw} = $line; } return {%ret}; } if ( @ARGV == 0 ) { show_usage(1); } run3( [ "which", "pdftotext" ], \undef, \undef, \undef ); if ($?) { say STDERR "Error: db-wagenreihung-to-json depends on the pdftotext utility, but could not find it on this system.\n"; say STDERR "On Debian and derivatives, it should be available in the 'poppler-utils' package."; exit 2; } my $wr_pdf = $ARGV[0]; my $wr_txt; my $command = [ "pdftotext", "-layout", "-nopgbrk", $wr_pdf, "/dev/stdout" ]; # stdin: /dev/null # stdout: $wr_txt # stderr: forwarded to terminal run3( $command, \undef, \$wr_txt, undef, { binmode_stdout => ':encoding(utf-8)' } ); if ($?) { my $exit_status = $? >> 8; say STDERR "\nError: 'pdftotext $wr_pdf' failed (exit status: $exit_status)"; exit 2; } my @lines = split( qr{\n}, $wr_txt ); my ( $type, $number ); my %map; my %wagon_map; my $state = "intro"; for my $line (@lines) { if ( $line =~ m{ ^ \s* Gültigkeit: \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) \s* – \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) }x ) { %valid = ( "from" => sprintf( "%04d-%02d-%02d", $+{fy}, $+{fm}, $+{fd} ), "through" => sprintf( "%04d-%02d-%02d", $+{ty}, $+{tm}, $+{td} ), ); $year = $+{ty}; } if ( $state eq "intro" ) { if ( $line =~ m{ ^ [A-Z] [.] \s* Zugbildungen $ }x ) { $state = ""; } else { next; } } if ( $line eq 'F. Anhang V: Beschilderungsplan' ) { last; } if ( $line =~ m{ ^ (? [EINR][CJN][ A-Z-]{6} | TGV \s*+ ) (? [ 0-9]{2,12}) }x ) { $type = $+{type}; $type =~ tr{ }{}d; $number = $+{number}; $number =~ tr{ }{}d; if ( $number !~ m{ ^ \d+ $ }x ) { next; } $state = "train_no"; #say "$type $number"; $map{$number} = { raw => $type, }; if ( $type eq 'ICE-A' ) { $map{$number}{type} = 'ICE 1/2/4'; } elsif ( $type eq 'ICE-W' ) { $map{$number}{type} = 'ICE 3'; $map{$number}{short} = '3'; } elsif ( $type eq 'ICE' ) { $map{$number}{type} = 'ICE 1/2/4'; } elsif ( $type eq 'ICE-T' ) { $map{$number}{type} = 'ICE T'; $map{$number}{short} = 'T'; } elsif ( $type eq 'IC-A' ) { $map{$number}{type} = 'Metropolitan'; $map{$number}{short} = 'M'; } elsif ( $type eq 'IC-D' ) { $map{$number}{type} = 'IC2'; $map{$number}{short} = '2'; } elsif ( $type eq 'IC-E' ) { $map{$number}{type} = 'IC2 KISS'; $map{$number}{short} = '2'; } else { $map{$number}{type} = $type; } } elsif ( $line =~ m{ ^ (? LPF T? (-[A-Z])? ) (? [ 0-9]{2,16}) }x ) { # Not in passenger service ("Leehrfahrt"). Ignore it for now. # FWIW, LPF(T) can be further distinguished: # LPFT-A: ICE BR 401/402 # LPFT-B: ICE BR 412 # LPFT-T: ICE BR 411 / 415 / 605 # LPFT-W: ICE BR 403 / 406 #$number = undef; #$state = ""; $type = $+{type}; $type =~ tr{ }{}d; $number = $+{number}; $number =~ tr{ }{}d; if ( $number !~ m{ ^ \d+ $ }x ) { next; } $state = "train_no"; $map{$number} = { raw => $type, type => $type, empty => \1, }; } elsif ( $state eq "train_no" and $line =~ m{ ^ \s{2,6} \S }x ) { $state = "route"; $map{$number}{route} = ""; } # ICE types can be further distinguished by rolling stock. For instance, # ICE 4 is the only ICE-A variant with restaurant and first-class seats # in the same wagon ("ARmz"). elsif ( $number and $line =~ m{^\d*\s+ARmz} ) { if ( $map{$number}{type} eq 'ICE 1/2/4' ) { $map{$number}{type} = 'ICE 4'; $map{$number}{short} = '4'; } } elsif ( $number and $line =~ m{^\d*\s+WRmz} ) { #say "ICE 1 / ICE 3 Redesign (WRmz)"; if ( $map{$number}{type} eq 'ICE 1/2/4' ) { $map{$number}{type} = 'ICE 1'; $map{$number}{short} = '1'; } elsif ( $map{$number}{type} eq 'ICE 3' ) { $map{$number}{type} = 'ICE 3 Redesign'; $map{$number}{short} = '3R'; } } elsif ( $number and $line =~ m{^\d*\s+WRmbsz} ) { #say "ICE 2 (WRmbsz)"; if ( $map{$number}{type} eq 'ICE 1/2/4' ) { $map{$number}{type} = 'ICE 2'; $map{$number}{short} = '2'; } } elsif ( $number and $line =~ m{^\d*\s+Bpmbsz} ) { #say "ICE 3 Velaro"; if ( $map{$number}{type} eq 'ICE 3' ) { $map{$number}{type} = 'ICE 3 Velaro'; $map{$number}{short} = '3V'; } } elsif ( $number and $line =~ m{^\d*\s+[AB][pv]m} ) { if ( $map{$number}{type} eq 'IC2' or $map{$number}{type} eq 'IC2 KISS' ) { # Zugbildungsplan go home, you're drunk. Wagons of IC2 double decker # trains must start with D (indicating that they double decker # wagons). If an IC2 train has other wagons, it isn't really an IC2 # train. Observed on IC 2006 Stuttgart – Dortmund, which is planned as # IC-D, but actually uses IC1 wagons. $map{$number}{type} = 'IC'; delete $map{$number}{short}; } } if ( $state eq "route" and $line =~ m{ ^ \s{2,6} (? .* ) $ }x ) { if ( length( $map{$number}{route} ) ) { $map{$number}{route} .= ' '; } $map{$number}{route} .= $+{route}; } elsif ( $state eq "route" and $line eq "" ) { $state = "conditions"; } elsif ( $state eq "conditions" and $line =~ m{ Hg | Tfz | BrH }x ) { $state = "details"; } if ( $number and $line =~ m{ ^ \d* \s{1,10} (? ( D? (WL)? (A|AB|B|W) R? D? [acdehimpuvw] \S* ) | 40[1-9]([.][0-9])? | 14[67]([.][0-9])? ) \s* (? \d+ )? (?: $ | \s{3} (? .* ) $ ) }x ) { $state = "wagonorder"; my $wagon_type = $+{type}; my $wagon_number = $+{number}; my $rest = $+{rest}; push( @{ $wagon_map{$number} }, [ $wagon_type, $wagon_number ] ); if ( $rest and $rest =~ m{\S} ) { push( @{ $map{$number}{wagonorder_notes} }, $line ); } $map{$number}{has_wagon}{$wagon_type} = \1; } if ( $state eq "conditions" and length($line) and $line =~ m{ \S }x ) { push( @{ $map{$number}{conditions} }, parse_condition($line) ); } if ( $state eq "details" and length($line) and $line =~ m{ Hg | Tfz | BrH }x ) { push( @{ $map{$number}{details} }, $line ); } } for my $train ( values %map ) { if ( not $train->{route} ) { next; } my $route = $train->{route}; my @parts = split( qr{ - }, $route ); if ( @parts < 2 ) { $train->{route} = { raw => $route }; next; } my ( $pre_start, $start, @middle, $end, $post_end ); if ( $parts[0] =~ m{ ^ [(] }x ) { $pre_start = shift @parts; $pre_start =~ s{ ^ [(] (.*) [)] $ }{$1}x; } if ( $parts[-1] =~ m{ ^ [(] }x ) { $post_end = pop @parts; $post_end =~ s{ ^ [(] (.*) [)] $ }{$1}x; } $start = shift @parts; $end = pop @parts; $train->{route} = { preStart => $pre_start, start => $start, middle => scalar @parts ? [@parts] : undef, end => $end, postEnd => $post_end, }; } for my $train ( values %map ) { if ( @{ $train->{wagonorder_notes} // [] } == 0 ) { next; } my $first_line = $train->{wagonorder_notes}[0]; my $from_offset = 0; my $to_offset = 0; # International trains (e.g. EC 8) are super special and may have another # identifier after the "cycle to" train number. if ( $first_line =~ m{ ^ (? .*? [^0-9] ) (? \d+ ) (? [A-Z ]+ ) (? \d+ ) (?: \s+ \d+ )? $ }x ) { $from_offset = length( $+{lead} ); $to_offset = $from_offset + length( $+{from} ) + length( $+{middle} ); } if ( not $from_offset ) { next; } for my $i_line ( @{ $train->{wagonorder_notes} } ) { # $i_line is an lvalue, so changes in $i_line end up in wagonorder_notes. # We don't want that. my $line = substr( $i_line, 0 ); # Some cycles do not have a "from" entry, but they do have an odd # F-identifier, which confuses the cycle detection code. remove it. $line =~ s{F[0-9] [0-9]{3}}{.. ...}; if ( length($line) <= $from_offset ) { next; } my $umlauf = substr( $line, $from_offset ); if ( $umlauf =~ m{ ^ (\d+) }x ) { push( @{ $train->{cycle}{from} }, $1 ); } if ( length($line) > $to_offset ) { $umlauf = substr( $line, $to_offset ); if ( $umlauf =~ m{ ^ (\d+) }x ) { push( @{ $train->{cycle}{to} }, $1 ); } } } if ( exists $train->{cycle}{from} ) { @{ $train->{cycle}{from} } = uniq @{ $train->{cycle}{from} }; @{ $train->{cycle}{to} } = uniq @{ $train->{cycle}{to} }; } } #for my $train_number ( keys %map ) { # my $tr = $map{$train_number}{route} // q{}; # $tr =~ s{ - .* - }{ - }; # my $this = sprintf("\"%d %s\"", $train_number, $tr); # for my $from ( @{ $map{$train_number}{umlauf}{from} // [] } ) { # $tr = $map{$from}{route} // q{}; # $tr =~ s{ - .* - }{ - }; # $from = sprintf("\"%d %s\"", $from, $tr); # say "$from -> $this;"; # } # for my $to ( @{ $map{$train_number}{umlauf}{to} // [] } ) { # $tr = $map{$to}{route} // q{}; # $tr =~ s{ -.*- }{ - }; # $to = sprintf("\"%d %s\"", $to, $tr); # say "$this -> $to;"; # } #} #exit 0; for my $train_number ( keys %map ) { my $wagon_numbers_ok = 1; my $wagon_types_ok = 1; my %bincount; my %type_by_wagon_number; for my $wagon ( @{ $wagon_map{$train_number} // [] } ) { my ( $wagon_type, $wagon_number ) = @{$wagon}; if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) { # Locomotives do not have a wagon number. That's OK. next; } elsif ( not $wagon_number ) { $wagon_numbers_ok = 0; } else { $bincount{$wagon_number}++; push( @{ $type_by_wagon_number{$wagon_number} }, $wagon_type ); } } if ($wagon_numbers_ok) { for my $wagon ( @{ $wagon_map{$train_number} // [] } ) { my ( $wagon_type, $wagon_number ) = @{$wagon}; push( @{ $map{$train_number}{wagons} }, $wagon ); if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) { next; } if ( $bincount{$wagon_number} > 1 ) { if ( $type_by_wagon_number{$wagon_number}[0] ne $type_by_wagon_number{$wagon_number}[1] ) { $wagon_types_ok = 0; } # TODO else -> wagon_list_ok = 0 ? } } } if ( not $wagon_types_ok ) { # train type may differ depending on date of week or similar. We don't # parse/handle that yet. delete $map{$train_number}{type}; delete $map{$train_number}{short}; delete $map{$train_number}{wagons}; delete $map{$train_number}{has_wagon}; } } # broken umlauf (indentation changes on page break) delete $map{104}{cycle}; delete $map{77689}{cycle}; delete $map{78112}{cycle}; # indentation change between consecutive lines. wat. delete $map{939}{cycle}; # use canonical output (i.e., sort hash keys) to allow for easy diffing. say JSON->new->utf8->canonical->encode( { valid => {%valid}, train => {%map} } );