#!/usr/bin/env perl # Copyright (C) 2020 Daniel Friesel # # SPDX-License-Identifier: BSD-2-Clause use strict; use warnings; use 5.020; use utf8; use Encode qw(decode); use IPC::Run3; use JSON; use List::Util qw(uniq); use Travel::Status::DE::IRIS::Stations; sub show_usage { my ($exit_code) = @_; say STDERR "Usage: db-wagenreihung-to-json \n"; say STDERR "You can obtain the latest PDF from "; exit $exit_code; } my @weekdays = (qw(Mo Di Mi Do Fr Sa So)); my @months = (qw(0 I II III IV V VI VII VIII IX X XI XII)); my %weekday = map { ( $weekdays[$_] => $_ ) } ( 0 .. $#weekdays ); my %month = map { ( $months[$_] => $_ ) } ( 0 .. $#months ); my ( $valid, $valid_from, $valid_through ); my $year; my %ds100_to_name; for my $station ( Travel::Status::DE::IRIS::Stations::get_stations() ) { $ds100_to_name{ $station->[0] } = $station->[1]; } sub parse_weekday_range { my ($range) = @_; my $ret = [ ( \0 ) x 7 ]; if ( length($range) == 2 ) { $ret->[ $weekday{$range} ] = \1; return $ret; } my ( $start, $end ) = split( qr{-}, $range ); for my $i ( $weekday{$start} .. $weekday{$end} ) { $ret->[$i] = \1; } return $ret; } sub parse_weekdays { my ($text) = @_; if ( $text eq 'tgl.' ) { return [ ( \1 ) x 7 ]; } my $ret = [ ( \0 ) x 7 ]; for my $range ( split( qr{[+]}, $text ) ) { my $new_ret = parse_weekday_range($range); for my $i ( 0 .. 6 ) { if ( ${ $new_ret->[$i] } ) { $ret->[$i] = \1; } } } return $ret; } sub parse_dates { my ($text) = @_; my @valid_ranges; my @valid_dates; my @invalid_dates; my $mode = "range"; for my $date ( split( qr{,\s*}, $text ) ) { if ( $date =~ m{^auch} ) { $mode = "extra"; } elsif ( $date =~ m{^nicht} ) { $mode = "except"; } elsif ( $date =~ m{^(ab|bis)} ) { $mode = "range"; } if ( $mode eq 'range' and $date =~ m{ ^ ab \s* (? \d{2} ) \. (? [IVX]* ) }x ) { push( @valid_ranges, [ [ $+{month}, $+{day} ], undef ] ); } elsif ( $mode eq 'range' and $date =~ m{ ^ bis \s* (? \d{2} ) \. (? [IVX]* ) }x ) { push( @valid_ranges, [ undef, [ $+{month}, $+{day} ] ] ); } elsif ( $date =~ m{ ^ (? \d{2} ) \. (? [^-]+ )\. - (? \d{2} ) \. (? [^.]* ) }x ) { push( @valid_ranges, [ [ $+{fm}, $+{fd} ], [ $+{tm}, $+{td} ] ] ); } elsif ( $mode eq 'extra' and $date =~ m{ (? \d{2} ) \. (? [IVX]* ) }x ) { push( @valid_dates, [ $+{month}, $+{day} ] ); } elsif ( $mode eq 'except' and $date =~ m{ (? \d{2} ) \. (? [IVX]* ) }x ) { push( @invalid_dates, [ $+{month}, $+{day} ] ); } } my %ret = ( valid => [], invalid => [], ); for my $range (@valid_ranges) { my $from_date = $valid_from; my $through_date = $valid_through; if ( $range->[0] ) { $from_date = sprintf( '%04d-%02d-%02d', $year, $month{ $range->[0][0] }, $range->[0][1] ); } if ( $range->[1] ) { $through_date = sprintf( '%04d-%02d-%02d', $year, $month{ $range->[1][0] }, $range->[1][1] ); } push( @{ $ret{valid} }, "${from_date}/${through_date}" ); } # for several dates of the same month, only the last date has the month set # (e.g. "24., 31.XII."). Walk through the list in reverse so ensure the # month is always available. my $month = undef; for my $date ( reverse @valid_dates ) { if ( not $date->[0] and not $month ) { say STDERR "Skipping unhandled valid date in \"$text\""; next; } # TODO Winterfahrplan: Use $year-1 for month XII push( @{ $ret{valid} }, sprintf( '%04d-%02d-%02d', $year, $month{ $date->[0] || $month }, $date->[1] ) ); if ( $date->[0] ) { $month = $date->[0]; } } $month = undef; for my $date ( reverse @invalid_dates ) { if ( not $date->[0] and not $month ) { say STDERR "Skipping unhandled invalid date in \"$text\""; next; } # TODO Winterfahrplan: Use $year-1 for month XII push( @{ $ret{invalid} }, sprintf( '%04d-%02d-%02d', $year, $month{ $date->[0] || $month }, $date->[1] ) ); if ( $date->[0] ) { $month = $date->[0]; } } return %ret; } sub parse_condition { my ($line) = @_; my %ret; if ( $line =~ m{ ^ \s* (? .*? ) \s - \s (? [^,]+ ) , \s* (? N \s* )? (? (Mo|Di|Mi|Do|Fr|Sa|So|tgl[.]|[+]|-)+) \s* (? .* ) $ }x ) { $ret{from} = $+{from}; $ret{to} = $+{to}; $ret{weekdays} = parse_weekdays( $+{weekdays} ); my %validity = parse_dates( $+{dates} ); while ( my ( $k, $v ) = each %validity ) { $ret{$k} = $v; } $ret{raw} = $line; } else { $ret{raw} = $line; } return {%ret}; } if ( @ARGV == 0 ) { show_usage(1); } run3( [ "which", "pdftotext" ], \undef, \undef, \undef ); if ($?) { say STDERR "Error: db-wagenreihung-to-json depends on the pdftotext utility, but could not find it on this system.\n"; say STDERR "On Debian and derivatives, it should be available in the 'poppler-utils' package."; exit 2; } my $wr_pdf = decode( 'UTF-8', $ARGV[0] ); my $wr_name = ( split( qr{/}, $wr_pdf ) )[-1]; my $wr_txt; my $command = [ "pdftotext", "-layout", "-nopgbrk", $wr_pdf, "/dev/stdout" ]; # stdin: /dev/null # stdout: $wr_txt # stderr: forwarded to terminal run3( $command, \undef, \$wr_txt, undef, { binmode_stdout => ':encoding(utf-8)' } ); if ($?) { my $exit_status = $? >> 8; say STDERR "\nError: 'pdftotext $wr_pdf' failed (exit status: $exit_status)"; exit 2; } my @lines = split( qr{\n}, $wr_txt ); my ( $type, $number ); my %map; my $state = "intro"; for my $line (@lines) { if ( $line =~ m{ ^ \s* Gültigkeit: \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) \s* – \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) }x ) { $valid_from = sprintf( "%04d-%02d-%02d", $+{fy}, $+{fm}, $+{fd} ); $valid_through = sprintf( "%04d-%02d-%02d", $+{ty}, $+{tm}, $+{td} ); $valid = "${valid_from}/${valid_through}"; $year = $+{ty}; } if ( $state eq "intro" ) { if ( $line =~ m{ ^ [A-Z] [.] \s* Zugbildungen $ }x ) { $state = ""; } else { next; } } if ( $line eq 'F. Anhang V: Beschilderungsplan' ) { last; } if ( $line =~ m{ ^ (? [EINR][CJN][ A-Z-]{6} | TGV \s*+ ) (? [ 0-9]{2,12}) (?: \s+ (? \S .* ) )? }x ) { $type = $+{type}; $type =~ tr{ }{}d; $number = $+{number}; $number =~ tr{ }{}d; my $name = $+{name}; if ( $number !~ m{ ^ \d+ $ }x ) { next; } $state = "train_no"; #say "$type $number"; if ( exists $map{$number} ) { # train numbers are not unique (they may have several wagon # orders with non-intersecting date ranges) my @dup = @{ $map{$number}{dup} // [] }; delete $map{$number}{dup}; push( @dup, $map{$number} ); $map{$number} = { rawType => $type, dup => [@dup], }; } else { $map{$number} = { rawType => $type, }; } if ($name) { $map{$number}{name} = $name; } if ( $type eq 'ICE-A' ) { $map{$number}{type} = 'ICE 1/2/4'; } elsif ( $type eq 'ICE-W' ) { $map{$number}{type} = 'ICE 3'; $map{$number}{shortType} = '3'; } elsif ( $type eq 'ICE' ) { $map{$number}{type} = 'ICE 1/2/4'; } elsif ( $type eq 'ICE-T' ) { $map{$number}{type} = 'ICE T'; $map{$number}{shortType} = 'T'; } elsif ( $type eq 'IC-A' ) { $map{$number}{type} = 'Metropolitan'; $map{$number}{shortType} = 'M'; } elsif ( $type eq 'IC-D' ) { $map{$number}{type} = 'IC2 Twindexx'; $map{$number}{shortType} = '2'; } elsif ( $type eq 'IC-E' ) { $map{$number}{type} = 'IC2 KISS'; $map{$number}{shortType} = '2'; } else { $map{$number}{type} = $type; } } elsif ( $line =~ m{ ^ (? L[NP]F T? (-[A-Z])? ) (? [ 0-9]{2,16}) }x ) { # Probably not in passenger service ("Leehrfahrt") # FWIW, LPF(T) can be further distinguished: # LPFT-A: ICE BR 401/402 # LPFT-B: ICE BR 412 # LPFT-T: ICE BR 411 / 415 / 605 # LPFT-W: ICE BR 403 / 406 # LNF seems to be related to couchettes / sleeper cars. #$number = undef; #$state = ""; $type = $+{type}; $type =~ tr{ }{}d; $number = $+{number}; $number =~ tr{ }{}d; if ( $number !~ m{ ^ \d+ $ }x ) { next; } $state = "train_no"; $map{$number} = { rawType => $type, type => $type, empty => \1, }; } elsif ( $state eq "train_no" and $line =~ m{ ^ \s{2,6} \S }x ) { $state = "route"; $map{$number}{route} = ""; } # ICE types can be further distinguished by rolling stock. For instance, # ICE 4 is the only ICE-A variant with restaurant and first-class seats # in the same wagon ("ARmz"). elsif ( $number and $line =~ m{^\d*\s+ARmz} ) { if ( $map{$number}{type} eq 'ICE 1/2/4' ) { $map{$number}{type} = 'ICE 4'; $map{$number}{shortType} = '4'; } } elsif ( $number and $line =~ m{^\d*\s+WRmz} ) { #say "ICE 1 / ICE 3 Redesign (WRmz)"; if ( $map{$number}{type} eq 'ICE 1/2/4' ) { $map{$number}{type} = 'ICE 1'; $map{$number}{shortType} = '1'; } elsif ( $map{$number}{type} eq 'ICE 3' ) { $map{$number}{type} = 'ICE 3 Redesign'; $map{$number}{shortType} = '3R'; } } elsif ( $number and $line =~ m{^\d*\s+WRmbsz} ) { #say "ICE 2 (WRmbsz)"; if ( $map{$number}{type} eq 'ICE 1/2/4' ) { $map{$number}{type} = 'ICE 2'; $map{$number}{shortType} = '2'; } } elsif ( $number and $line =~ m{^\d*\s+Bpmbsz} ) { #say "ICE 3 Velaro"; if ( $map{$number}{type} eq 'ICE 3' ) { $map{$number}{type} = 'ICE 3 Velaro'; $map{$number}{shortType} = '3V'; } } elsif ( $number and $line =~ m{^\d*\s+[AB][pv]m} ) { if ( $map{$number}{type} eq 'IC2 Twindexx' or $map{$number}{type} eq 'IC2 KISS' ) { # Zugbildungsplan go home, you're drunk. Wagons of IC2 double decker # trains must start with D (indicating that they double decker # wagons). If an IC2 train has other wagons, it isn't really an IC2 # train. Observed on IC 2006 Stuttgart – Dortmund, which is planned as # IC-D, but actually uses IC1 wagons. $map{$number}{type} = 'IC'; delete $map{$number}{shortType}; } } if ( $state eq "route" and $line =~ m{ ^ \s{2,6} (? .* ) $ }x ) { if ( length( $map{$number}{route} ) ) { $map{$number}{route} .= ' '; } $map{$number}{route} .= $+{route}; } elsif ( $state eq "route" and $line eq "" ) { $state = "schedules"; } elsif ( $state eq "schedules" and $line =~ m{ Hg | Tfz | BrH }x ) { $state = "details"; } if ( $number and $line =~ m{ ^ \d* \s{1,10} (? ( D? (WL)? (A|AB|B|W) R? D? [acdehimpuvw] \S* ) | 40[1-9]([.][0-9])? | 14[67]([.][0-9])? ) \s* (? \d+ )? (?: $ | \s{3} (? .* ) $ ) }x ) { $state = "wagonorder"; my $wagon_type = $+{type}; my $wagon_number = $+{number}; my $rest = $+{rest}; push( @{ $map{$number}{raw_wagons} }, [ $wagon_type, $wagon_number ] ); if ( $rest and $rest =~ m{\S} ) { push( @{ $map{$number}{wagonorder_notes} }, $line ); } $map{$number}{hasWagon}{$wagon_type} = \1; } if ( $state eq "schedules" and length($line) and $line =~ m{ \S }x ) { push( @{ $map{$number}{schedules} }, parse_condition($line) ); } if ( $state eq "details" and length($line) and $line =~ m{ Hg | Tfz | BrH }x ) { push( @{ $map{$number}{details} }, $line ); } } for my $train_no ( keys %map ) { my @entries = @{ $map{$train_no}{dup} // [] }; delete $map{$train_no}{dup}; push( @entries, $map{$train_no} ); $map{$train_no} = [@entries]; } for my $trains ( values %map ) { for my $train ( @{$trains} ) { if ( not $train->{route} ) { next; } my $route = $train->{route}; my @parts = split( qr{ - }, $route ); if ( @parts < 2 ) { $train->{route} = { raw => $route }; next; } my ( $pre_start, $start, @middle, $end, $post_end ); if ( $parts[0] =~ m{ ^ [(] }x ) { $pre_start = shift @parts; $pre_start =~ s{ ^ [(] (.*) [)] $ }{$1}x; } if ( $parts[-1] =~ m{ ^ [(] }x ) { $post_end = pop @parts; $post_end =~ s{ ^ [(] (.*) [)] $ }{$1}x; } $start = shift @parts; $end = pop @parts; $train->{route} = { preStart => $pre_start, start => $start, middle => scalar @parts ? [@parts] : undef, end => $end, postEnd => $post_end, }; for my $k ( keys %{ $train->{route} } ) { if ( not defined $train->{route}{$k} ) { # avoid null values (leave out the property instead) delete $train->{route}{$k}; } } } } for my $trains ( values %map ) { for my $train ( @{$trains} ) { if ( @{ $train->{wagonorder_notes} // [] } == 0 ) { next; } my $first_line = $train->{wagonorder_notes}[0]; my $from_offset = 0; my $to_offset = 0; # International trains (e.g. EC 8) are super special and may have another # identifier after the "cycle to" train number. if ( $first_line =~ m{ ^ (? .*? [^0-9] ) (? \d+ ) (? [A-Z ]+ ) (? \d+ ) (?: \s+ \d+ )? $ }x ) { $from_offset = length( $+{lead} ); $to_offset = $from_offset + length( $+{from} ) + length( $+{middle} ); } if ( not $from_offset ) { next; } my $cycle_id = "unknown"; for my $i_line ( @{ $train->{wagonorder_notes} } ) { # $i_line is an lvalue, so changes in $i_line end up in wagonorder_notes. # We don't want that. my $line = substr( $i_line, 0 ); # Fx abc appears to identify cycle groups. # It also confuses the cycle detection code for cycles which do not have # a "from" entry. if ( $line =~ s{(F[0-9] [0-9]{3})}{.. ...} ) { $cycle_id = $1; } if ( length($line) <= $from_offset ) { next; } my $umlauf = substr( $line, $from_offset ); if ( $umlauf =~ m{ ^ (\d+) }x ) { push( @{ $train->{cycle}{$cycle_id}{from} }, $1 ); } if ( length($line) > $to_offset ) { $umlauf = substr( $line, $to_offset ); if ( $umlauf =~ m{ ^ (\d+) }x ) { push( @{ $train->{cycle}{$cycle_id}{to} }, $1 ); } } } for my $cycle ( values %{ $train->{cycle} // {} } ) { @{ $cycle->{from} } = uniq @{ $cycle->{from} // [] }; @{ $cycle->{to} } = uniq @{ $cycle->{to} // [] }; } } } for my $trains ( values %map ) { for my $train ( @{$trains} ) { if ( not $train->{details} ) { next; } my @details = @{ $train->{details} // [] }; my %common_setup; my @setups; for my $line (@details) { my %setup; if ( $line =~ m{ Tfz1:(\d+) }x ) { $setup{series} = $1; } if ( $line =~ m{ Tfz2:(\d+) }x ) { $setup{series2} = $1; } if ( $line =~ m{ Hg(\d+) }x ) { $setup{vmax} = 0 + $1; } if ( $line =~ m{ BrH(\d+) }x ) { $setup{brakingPercentage} = 0 + $1; } if ( $line =~ m{ (\d+)m }x ) { $setup{length} = 0 + $1; } if ( %setup and $line =~ m{ ^ \s* ([+]? [A-Z ]{1,7}) }x ) { my $station = $1; $station =~ s{\s+$}{}; my $special = ( $station =~ s{^[+]}{} ); if ( $ds100_to_name{$station} ) { $station = $ds100_to_name{$station}; } if ($special) { $station = "+${station}"; } $setup{station} = $station; } push( @setups, {%setup} ); } for my $key (qw(brakingPercentage length series series2 vmax)) { if ( ( my @uniq = uniq grep { $_ } map { $_->{$key} } @setups ) == 1 ) { $common_setup{$key} = $key =~ m{series} ? $uniq[0] : 0 + $uniq[0]; } } delete $train->{details}; if (%common_setup) { $train->{commonAttr} = {%common_setup}; } $train->{attrVariants} = [@setups]; } } for my $trains ( values %map ) { for my $train ( @{$trains} ) { my $wagon_numbers_ok = 1; my $wagon_types_ok = 1; my $wagon_list_ok = 1; my %bincount; my %type_by_wagon_number; for my $wagon ( @{ $train->{raw_wagons} // [] } ) { my ( $wagon_type, $wagon_number ) = @{$wagon}; if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) { # Locomotives do not have a wagon number. That's OK. next; } elsif ( not $wagon_number ) { $wagon_numbers_ok = 0; } else { $bincount{$wagon_number}++; push( @{ $type_by_wagon_number{$wagon_number} }, $wagon_type ); } } if ($wagon_numbers_ok) { for my $wagon ( @{ $train->{raw_wagons} // [] } ) { my ( $wagon_type, $wagon_number ) = @{$wagon}; my $json_wagon = { type => $wagon_type }; if ($wagon_number) { $json_wagon->{number} = 0 + $wagon_number; } push( @{ $train->{wagons} }, $json_wagon ); if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) { next; } if ( $bincount{$wagon_number} > 1 ) { $wagon_list_ok = 0; if ( $type_by_wagon_number{$wagon_number}[0] ne $type_by_wagon_number{$wagon_number}[1] ) { $wagon_types_ok = 0; } } } } if ( not $wagon_types_ok ) { # train type may differ depending on date of week or similar. We don't # parse/handle that yet. $train->{type} = $train->{rawType}; delete $train->{shortType}; delete $train->{wagons}; delete $train->{hasWagon}; } elsif ( not $wagon_list_ok ) { # train type appears to be correct, but the list contains duplicates # (e.g. due to unhandled schedule variants) delete $train->{wagons}; } } } # Cleanup for my $trains ( values %map ) { for my $train ( @{$trains} ) { delete $train->{raw_wagons}; delete $train->{wagonorder_notes}; } } for my $bork (qw(104 1588 1700 77689 78112 939 2310)) { for my $train ( @{ $map{$bork} } ) { delete $train->{cycle}; } } # use canonical output (i.e., sort hash keys) to allow for easy diffing. say JSON->new->utf8->canonical->encode( { deprecated => \0, source => $wr_name, train_variants => {%map}, valid => $valid, } );