From bf0096940bc48d670aa779a08ed18c1d7914841e Mon Sep 17 00:00:00 2001 From: Daniel Friesel Date: Mon, 21 Dec 2020 10:07:38 +0100 Subject: rename to db-zugbildung-to-json --- bin/db-wagenreihung-to-json | 576 -------------------------------------------- bin/db-zugbildung-to-json | 576 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 576 insertions(+), 576 deletions(-) delete mode 100755 bin/db-wagenreihung-to-json create mode 100755 bin/db-zugbildung-to-json diff --git a/bin/db-wagenreihung-to-json b/bin/db-wagenreihung-to-json deleted file mode 100755 index 8ed2289..0000000 --- a/bin/db-wagenreihung-to-json +++ /dev/null @@ -1,576 +0,0 @@ -#!/usr/bin/env perl -# Copyright (C) 2020 Daniel Friesel -# -# SPDX-License-Identifier: BSD-2-Clause - -use strict; -use warnings; -use 5.020; -use utf8; - -use File::Slurp qw(write_file); -use IPC::Run3; -use JSON; -use List::Util qw(uniq); - -sub show_usage { - my ($exit_code) = @_; - - say STDERR "Usage: db-wagenreihung-to-json \n"; - say STDERR -"You can obtain the latest PDF from "; - - exit $exit_code; -} - -my @weekdays = (qw(Mo Di Mi Do Fr Sa So)); -my @months = (qw(0 I II III IV V VI VII VIII IX X XI XII)); -my %weekday = map { ( $weekdays[$_] => $_ ) } ( 0 .. $#weekdays ); -my %month = map { ( $months[$_] => $_ ) } ( 0 .. $#months ); - -my %valid; -my $year; - -sub parse_weekday_range { - my ($range) = @_; - my $ret = [ ( \0 ) x 7 ]; - - if ( length($range) == 2 ) { - $ret->[ $weekday{$range} ] = \1; - return $ret; - } - - my ( $start, $end ) = split( qr{-}, $range ); - - for my $i ( $weekday{$start} .. $weekday{$end} ) { - $ret->[$i] = \1; - } - - return $ret; -} - -sub parse_weekdays { - my ($text) = @_; - - if ( $text eq 'tgl.' ) { - return [ ( \1 ) x 7 ]; - } - - my $ret = [ ( \0 ) x 7 ]; - - for my $range ( split( qr{[+]}, $text ) ) { - my $new_ret = parse_weekday_range($range); - for my $i ( 0 .. 6 ) { - if ( ${ $new_ret->[$i] } ) { - $ret->[$i] = \1; - } - } - } - return $ret; -} - -sub parse_dates { - my ($text) = @_; - - my @valid_ranges; - my @valid_dates; - my @invalid_dates; - - my $mode = "range"; - for my $date ( split( qr{,\s*}, $text ) ) { - if ( $date =~ m{^auch} ) { - $mode = "extra"; - } - elsif ( $date =~ m{^nicht} ) { - $mode = "except"; - } - elsif ( $date =~ m{^(ab|bis)} ) { - $mode = "range"; - } - - if ( $mode eq 'range' - and $date =~ m{ ^ ab \s* (? \d{2} ) \. (? [^.]* ) }x ) - { - push( @valid_ranges, [ [ $+{month}, $+{day} ], undef ] ); - } - elsif ( $mode eq 'range' - and $date =~ m{ ^ bis \s* (? \d{2} ) \. (? [^.]* ) }x ) - { - push( @valid_ranges, [ undef, [ $+{month}, $+{day} ] ] ); - } - elsif ( $date - =~ m{ ^ (? \d{2} ) \. (? [^-]+ )\. - (? \d{2} ) \. (? [^.]* ) }x - ) - { - push( @valid_ranges, [ [ $+{fm}, $+{fd} ], [ $+{tm}, $+{td} ] ] ); - } - } - - my %ret = ( - valid => [], - invalid => [], - ); - - for my $range (@valid_ranges) { - my $from_date = $valid{from}; - my $through_date = $valid{through}; - if ( $range->[0] ) { - $from_date = sprintf( '%04d-%02d-%02d', - $year, $month{ $range->[0][0] }, - $range->[0][1] ); - } - if ( $range->[1] ) { - $through_date = sprintf( '%04d-%02d-%02d', - $year, $month{ $range->[1][0] }, - $range->[1][1] ); - } - push( @{ $ret{valid} }, "${from_date}/${through_date}" ); - } - - return %ret; -} - -sub parse_condition { - my ($line) = @_; - my %ret; - - if ( $line - =~ m{ ^ \s* (? .*? ) \s - \s (? [^,]+ ) , \s* (? (Mo|Di|Mi|Do|Fr|Sa|So|tgl[.]|[+]|-)+) \s* (? .* ) $ }x - ) - { - $ret{from} = $+{from}; - $ret{to} = $+{to}; - $ret{weekdays} = parse_weekdays( $+{weekdays} ); - my %validity = parse_dates( $+{dates} ); - while ( my ( $k, $v ) = each %validity ) { - $ret{$k} = $v; - } - $ret{raw} = $line; - } - else { - $ret{raw} = $line; - } - - return {%ret}; -} - -if ( @ARGV == 0 ) { - show_usage(1); -} - -run3( [ "which", "pdftotext" ], \undef, \undef, \undef ); -if ($?) { - say STDERR -"Error: db-wagenreihung-to-json depends on the pdftotext utility, but could not find it on this system.\n"; - say STDERR -"On Debian and derivatives, it should be available in the 'poppler-utils' package."; - exit 2; -} - -my $wr_pdf = $ARGV[0]; -my $wr_txt; -my $command = [ "pdftotext", "-layout", "-nopgbrk", $wr_pdf, "/dev/stdout" ]; - -# stdin: /dev/null -# stdout: $wr_txt -# stderr: forwarded to terminal -run3( $command, \undef, \$wr_txt, undef, - { binmode_stdout => ':encoding(utf-8)' } ); - -if ($?) { - my $exit_status = $? >> 8; - say STDERR - "\nError: 'pdftotext $wr_pdf' failed (exit status: $exit_status)"; - exit 2; -} - -my @lines = split( qr{\n}, $wr_txt ); - -my ( $type, $number ); -my %map; -my %wagon_map; - -my $state = "intro"; - -for my $line (@lines) { - - if ( $line - =~ m{ ^ \s* Gültigkeit: \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) \s* – \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) }x - ) - { - %valid = ( - "from" => sprintf( "%04d-%02d-%02d", $+{fy}, $+{fm}, $+{fd} ), - "through" => sprintf( "%04d-%02d-%02d", $+{ty}, $+{tm}, $+{td} ), - ); - $year = $+{ty}; - } - - if ( $state eq "intro" ) { - if ( $line =~ m{ ^ [A-Z] [.] \s* Zugbildungen $ }x ) { - $state = ""; - } - else { - next; - } - } - - if ( $line eq 'F. Anhang V: Beschilderungsplan' ) { - last; - } - - if ( $line - =~ m{ ^ (? [EINR][CJN][ A-Z-]{6} | TGV \s*+ ) (? [ 0-9]{2,12}) }x - ) - { - $type = $+{type}; - $type =~ tr{ }{}d; - $number = $+{number}; - $number =~ tr{ }{}d; - - if ( $number !~ m{ ^ \d+ $ }x ) { - next; - } - - $state = "train_no"; - - #say "$type $number"; - $map{$number} = { - raw => $type, - }; - - if ( $type eq 'ICE-A' ) { - $map{$number}{type} = 'ICE 1/2/4'; - } - elsif ( $type eq 'ICE-W' ) { - $map{$number}{type} = 'ICE 3'; - $map{$number}{short} = '3'; - } - elsif ( $type eq 'ICE' ) { - $map{$number}{type} = 'ICE 1/2/4'; - } - elsif ( $type eq 'ICE-T' ) { - $map{$number}{type} = 'ICE T'; - $map{$number}{short} = 'T'; - } - elsif ( $type eq 'IC-A' ) { - $map{$number}{type} = 'Metropolitan'; - $map{$number}{short} = 'M'; - } - elsif ( $type eq 'IC-D' ) { - $map{$number}{type} = 'IC2'; - $map{$number}{short} = '2'; - } - elsif ( $type eq 'IC-E' ) { - $map{$number}{type} = 'IC2 KISS'; - $map{$number}{short} = '2'; - } - else { - $map{$number}{type} = $type; - } - } - elsif ( $line - =~ m{ ^ (? L[NP]F T? (-[A-Z])? ) (? [ 0-9]{2,16}) }x ) - { - - # Probably not in passenger service ("Leehrfahrt") - # FWIW, LPF(T) can be further distinguished: - # LPFT-A: ICE BR 401/402 - # LPFT-B: ICE BR 412 - # LPFT-T: ICE BR 411 / 415 / 605 - # LPFT-W: ICE BR 403 / 406 - # LNF seems to be related to couchettes / sleeper cars. - #$number = undef; - #$state = ""; - - $type = $+{type}; - $type =~ tr{ }{}d; - $number = $+{number}; - $number =~ tr{ }{}d; - - if ( $number !~ m{ ^ \d+ $ }x ) { - next; - } - - $state = "train_no"; - - $map{$number} = { - raw => $type, - type => $type, - empty => \1, - }; - } - - elsif ( $state eq "train_no" and $line =~ m{ ^ \s{2,6} \S }x ) { - $state = "route"; - $map{$number}{route} = ""; - } - - # ICE types can be further distinguished by rolling stock. For instance, - # ICE 4 is the only ICE-A variant with restaurant and first-class seats - # in the same wagon ("ARmz"). - elsif ( $number and $line =~ m{^\d*\s+ARmz} ) { - if ( $map{$number}{type} eq 'ICE 1/2/4' ) { - $map{$number}{type} = 'ICE 4'; - $map{$number}{short} = '4'; - } - } - elsif ( $number and $line =~ m{^\d*\s+WRmz} ) { - - #say "ICE 1 / ICE 3 Redesign (WRmz)"; - if ( $map{$number}{type} eq 'ICE 1/2/4' ) { - $map{$number}{type} = 'ICE 1'; - $map{$number}{short} = '1'; - } - elsif ( $map{$number}{type} eq 'ICE 3' ) { - $map{$number}{type} = 'ICE 3 Redesign'; - $map{$number}{short} = '3R'; - } - } - elsif ( $number and $line =~ m{^\d*\s+WRmbsz} ) { - - #say "ICE 2 (WRmbsz)"; - if ( $map{$number}{type} eq 'ICE 1/2/4' ) { - $map{$number}{type} = 'ICE 2'; - $map{$number}{short} = '2'; - } - } - elsif ( $number and $line =~ m{^\d*\s+Bpmbsz} ) { - - #say "ICE 3 Velaro"; - if ( $map{$number}{type} eq 'ICE 3' ) { - $map{$number}{type} = 'ICE 3 Velaro'; - $map{$number}{short} = '3V'; - } - } - elsif ( $number and $line =~ m{^\d*\s+[AB][pv]m} ) { - if ( $map{$number}{type} eq 'IC2' or $map{$number}{type} eq 'IC2 KISS' ) - { - # Zugbildungsplan go home, you're drunk. Wagons of IC2 double decker - # trains must start with D (indicating that they double decker - # wagons). If an IC2 train has other wagons, it isn't really an IC2 - # train. Observed on IC 2006 Stuttgart – Dortmund, which is planned as - # IC-D, but actually uses IC1 wagons. - $map{$number}{type} = 'IC'; - delete $map{$number}{short}; - } - } - - if ( $state eq "route" and $line =~ m{ ^ \s{2,6} (? .* ) $ }x ) { - if ( length( $map{$number}{route} ) ) { - $map{$number}{route} .= ' '; - } - $map{$number}{route} .= $+{route}; - } - elsif ( $state eq "route" and $line eq "" ) { - $state = "schedules"; - } - elsif ( $state eq "schedules" and $line =~ m{ Hg | Tfz | BrH }x ) { - $state = "details"; - } - - if ( $number - and $line - =~ m{ ^ \d* \s{1,10} (? ( D? (WL)? (A|AB|B|W) R? D? [acdehimpuvw] \S* ) | 40[1-9]([.][0-9])? | 14[67]([.][0-9])? ) \s* (? \d+ )? (?: $ | \s{3} (? .* ) $ ) }x - ) - { - $state = "wagonorder"; - my $wagon_type = $+{type}; - my $wagon_number = $+{number}; - my $rest = $+{rest}; - - push( @{ $wagon_map{$number} }, [ $wagon_type, $wagon_number ] ); - - if ( $rest and $rest =~ m{\S} ) { - push( @{ $map{$number}{wagonorder_notes} }, $line ); - } - - $map{$number}{has_wagon}{$wagon_type} = \1; - } - - if ( $state eq "schedules" and length($line) and $line =~ m{ \S }x ) { - push( @{ $map{$number}{schedules} }, parse_condition($line) ); - } - - if ( $state eq "details" - and length($line) - and $line =~ m{ Hg | Tfz | BrH }x ) - { - push( @{ $map{$number}{details} }, $line ); - } -} - -for my $train ( values %map ) { - if ( not $train->{route} ) { - next; - } - - my $route = $train->{route}; - my @parts = split( qr{ - }, $route ); - - if ( @parts < 2 ) { - $train->{route} = { raw => $route }; - next; - } - my ( $pre_start, $start, @middle, $end, $post_end ); - - if ( $parts[0] =~ m{ ^ [(] }x ) { - $pre_start = shift @parts; - $pre_start =~ s{ ^ [(] (.*) [)] $ }{$1}x; - } - - if ( $parts[-1] =~ m{ ^ [(] }x ) { - $post_end = pop @parts; - $post_end =~ s{ ^ [(] (.*) [)] $ }{$1}x; - } - - $start = shift @parts; - $end = pop @parts; - - $train->{route} = { - preStart => $pre_start, - start => $start, - middle => scalar @parts ? [@parts] : undef, - end => $end, - postEnd => $post_end, - }; -} - -for my $train ( values %map ) { - if ( @{ $train->{wagonorder_notes} // [] } == 0 ) { - next; - } - my $first_line = $train->{wagonorder_notes}[0]; - my $from_offset = 0; - my $to_offset = 0; - - # International trains (e.g. EC 8) are super special and may have another - # identifier after the "cycle to" train number. - if ( $first_line - =~ m{ ^ (? .*? [^0-9] ) (? \d+ ) (? [A-Z ]+ ) (? \d+ ) (?: \s+ \d+ )? $ }x - ) - { - $from_offset = length( $+{lead} ); - $to_offset = $from_offset + length( $+{from} ) + length( $+{middle} ); - } - if ( not $from_offset ) { - next; - } - for my $i_line ( @{ $train->{wagonorder_notes} } ) { - - # $i_line is an lvalue, so changes in $i_line end up in wagonorder_notes. - # We don't want that. - my $line = substr( $i_line, 0 ); - - # Some cycles do not have a "from" entry, but they do have an odd - # F-identifier, which confuses the cycle detection code. remove it. - $line =~ s{F[0-9] [0-9]{3}}{.. ...}; - if ( length($line) <= $from_offset ) { - next; - } - my $umlauf = substr( $line, $from_offset ); - if ( $umlauf =~ m{ ^ (\d+) }x ) { - push( @{ $train->{cycle}{from} }, $1 ); - } - if ( length($line) > $to_offset ) { - $umlauf = substr( $line, $to_offset ); - if ( $umlauf =~ m{ ^ (\d+) }x ) { - push( @{ $train->{cycle}{to} }, $1 ); - } - } - } - if ( exists $train->{cycle}{from} ) { - @{ $train->{cycle}{from} } = uniq @{ $train->{cycle}{from} }; - @{ $train->{cycle}{to} } = uniq @{ $train->{cycle}{to} }; - } -} - -#for my $train_number ( keys %map ) { -# my $tr = $map{$train_number}{route} // q{}; -# $tr =~ s{ - .* - }{ - }; -# my $this = sprintf("\"%d %s\"", $train_number, $tr); -# for my $from ( @{ $map{$train_number}{umlauf}{from} // [] } ) { -# $tr = $map{$from}{route} // q{}; -# $tr =~ s{ - .* - }{ - }; -# $from = sprintf("\"%d %s\"", $from, $tr); -# say "$from -> $this;"; -# } -# for my $to ( @{ $map{$train_number}{umlauf}{to} // [] } ) { -# $tr = $map{$to}{route} // q{}; -# $tr =~ s{ -.*- }{ - }; -# $to = sprintf("\"%d %s\"", $to, $tr); -# say "$this -> $to;"; -# } -#} - -#exit 0; - -for my $train_number ( keys %map ) { - my $wagon_numbers_ok = 1; - my $wagon_types_ok = 1; - my %bincount; - my %type_by_wagon_number; - for my $wagon ( @{ $wagon_map{$train_number} // [] } ) { - my ( $wagon_type, $wagon_number ) = @{$wagon}; - if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) { - - # Locomotives do not have a wagon number. That's OK. - next; - } - elsif ( not $wagon_number ) { - $wagon_numbers_ok = 0; - } - else { - $bincount{$wagon_number}++; - push( @{ $type_by_wagon_number{$wagon_number} }, $wagon_type ); - } - } - if ($wagon_numbers_ok) { - for my $wagon ( @{ $wagon_map{$train_number} // [] } ) { - my ( $wagon_type, $wagon_number ) = @{$wagon}; - - push( @{ $map{$train_number}{wagons} }, $wagon ); - - if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) { - next; - } - - if ( $bincount{$wagon_number} > 1 ) { - if ( $type_by_wagon_number{$wagon_number}[0] ne - $type_by_wagon_number{$wagon_number}[1] ) - { - $wagon_types_ok = 0; - } - - # TODO else -> wagon_list_ok = 0 ? - } - } - } - if ( not $wagon_types_ok ) { - - # train type may differ depending on date of week or similar. We don't - # parse/handle that yet. - $map{$train_number}{type} = $map{$train_number}{raw}; - delete $map{$train_number}{short}; - delete $map{$train_number}{wagons}; - delete $map{$train_number}{has_wagon}; - } -} - -# broken umlauf (indentation changes on page break) -delete $map{104}{cycle}; -delete $map{1588}{cycle}; -delete $map{1700}{cycle}; -delete $map{77689}{cycle}; -delete $map{78112}{cycle}; - -# indentation change between consecutive lines. wat. -delete $map{939}{cycle}; -delete $map{2310}{cycle}; - -# use canonical output (i.e., sort hash keys) to allow for easy diffing. -say JSON->new->utf8->canonical->encode( - { - valid => {%valid}, - train => {%map} - } -); diff --git a/bin/db-zugbildung-to-json b/bin/db-zugbildung-to-json new file mode 100755 index 0000000..8ed2289 --- /dev/null +++ b/bin/db-zugbildung-to-json @@ -0,0 +1,576 @@ +#!/usr/bin/env perl +# Copyright (C) 2020 Daniel Friesel +# +# SPDX-License-Identifier: BSD-2-Clause + +use strict; +use warnings; +use 5.020; +use utf8; + +use File::Slurp qw(write_file); +use IPC::Run3; +use JSON; +use List::Util qw(uniq); + +sub show_usage { + my ($exit_code) = @_; + + say STDERR "Usage: db-wagenreihung-to-json \n"; + say STDERR +"You can obtain the latest PDF from "; + + exit $exit_code; +} + +my @weekdays = (qw(Mo Di Mi Do Fr Sa So)); +my @months = (qw(0 I II III IV V VI VII VIII IX X XI XII)); +my %weekday = map { ( $weekdays[$_] => $_ ) } ( 0 .. $#weekdays ); +my %month = map { ( $months[$_] => $_ ) } ( 0 .. $#months ); + +my %valid; +my $year; + +sub parse_weekday_range { + my ($range) = @_; + my $ret = [ ( \0 ) x 7 ]; + + if ( length($range) == 2 ) { + $ret->[ $weekday{$range} ] = \1; + return $ret; + } + + my ( $start, $end ) = split( qr{-}, $range ); + + for my $i ( $weekday{$start} .. $weekday{$end} ) { + $ret->[$i] = \1; + } + + return $ret; +} + +sub parse_weekdays { + my ($text) = @_; + + if ( $text eq 'tgl.' ) { + return [ ( \1 ) x 7 ]; + } + + my $ret = [ ( \0 ) x 7 ]; + + for my $range ( split( qr{[+]}, $text ) ) { + my $new_ret = parse_weekday_range($range); + for my $i ( 0 .. 6 ) { + if ( ${ $new_ret->[$i] } ) { + $ret->[$i] = \1; + } + } + } + return $ret; +} + +sub parse_dates { + my ($text) = @_; + + my @valid_ranges; + my @valid_dates; + my @invalid_dates; + + my $mode = "range"; + for my $date ( split( qr{,\s*}, $text ) ) { + if ( $date =~ m{^auch} ) { + $mode = "extra"; + } + elsif ( $date =~ m{^nicht} ) { + $mode = "except"; + } + elsif ( $date =~ m{^(ab|bis)} ) { + $mode = "range"; + } + + if ( $mode eq 'range' + and $date =~ m{ ^ ab \s* (? \d{2} ) \. (? [^.]* ) }x ) + { + push( @valid_ranges, [ [ $+{month}, $+{day} ], undef ] ); + } + elsif ( $mode eq 'range' + and $date =~ m{ ^ bis \s* (? \d{2} ) \. (? [^.]* ) }x ) + { + push( @valid_ranges, [ undef, [ $+{month}, $+{day} ] ] ); + } + elsif ( $date + =~ m{ ^ (? \d{2} ) \. (? [^-]+ )\. - (? \d{2} ) \. (? [^.]* ) }x + ) + { + push( @valid_ranges, [ [ $+{fm}, $+{fd} ], [ $+{tm}, $+{td} ] ] ); + } + } + + my %ret = ( + valid => [], + invalid => [], + ); + + for my $range (@valid_ranges) { + my $from_date = $valid{from}; + my $through_date = $valid{through}; + if ( $range->[0] ) { + $from_date = sprintf( '%04d-%02d-%02d', + $year, $month{ $range->[0][0] }, + $range->[0][1] ); + } + if ( $range->[1] ) { + $through_date = sprintf( '%04d-%02d-%02d', + $year, $month{ $range->[1][0] }, + $range->[1][1] ); + } + push( @{ $ret{valid} }, "${from_date}/${through_date}" ); + } + + return %ret; +} + +sub parse_condition { + my ($line) = @_; + my %ret; + + if ( $line + =~ m{ ^ \s* (? .*? ) \s - \s (? [^,]+ ) , \s* (? (Mo|Di|Mi|Do|Fr|Sa|So|tgl[.]|[+]|-)+) \s* (? .* ) $ }x + ) + { + $ret{from} = $+{from}; + $ret{to} = $+{to}; + $ret{weekdays} = parse_weekdays( $+{weekdays} ); + my %validity = parse_dates( $+{dates} ); + while ( my ( $k, $v ) = each %validity ) { + $ret{$k} = $v; + } + $ret{raw} = $line; + } + else { + $ret{raw} = $line; + } + + return {%ret}; +} + +if ( @ARGV == 0 ) { + show_usage(1); +} + +run3( [ "which", "pdftotext" ], \undef, \undef, \undef ); +if ($?) { + say STDERR +"Error: db-wagenreihung-to-json depends on the pdftotext utility, but could not find it on this system.\n"; + say STDERR +"On Debian and derivatives, it should be available in the 'poppler-utils' package."; + exit 2; +} + +my $wr_pdf = $ARGV[0]; +my $wr_txt; +my $command = [ "pdftotext", "-layout", "-nopgbrk", $wr_pdf, "/dev/stdout" ]; + +# stdin: /dev/null +# stdout: $wr_txt +# stderr: forwarded to terminal +run3( $command, \undef, \$wr_txt, undef, + { binmode_stdout => ':encoding(utf-8)' } ); + +if ($?) { + my $exit_status = $? >> 8; + say STDERR + "\nError: 'pdftotext $wr_pdf' failed (exit status: $exit_status)"; + exit 2; +} + +my @lines = split( qr{\n}, $wr_txt ); + +my ( $type, $number ); +my %map; +my %wagon_map; + +my $state = "intro"; + +for my $line (@lines) { + + if ( $line + =~ m{ ^ \s* Gültigkeit: \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) \s* – \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) }x + ) + { + %valid = ( + "from" => sprintf( "%04d-%02d-%02d", $+{fy}, $+{fm}, $+{fd} ), + "through" => sprintf( "%04d-%02d-%02d", $+{ty}, $+{tm}, $+{td} ), + ); + $year = $+{ty}; + } + + if ( $state eq "intro" ) { + if ( $line =~ m{ ^ [A-Z] [.] \s* Zugbildungen $ }x ) { + $state = ""; + } + else { + next; + } + } + + if ( $line eq 'F. Anhang V: Beschilderungsplan' ) { + last; + } + + if ( $line + =~ m{ ^ (? [EINR][CJN][ A-Z-]{6} | TGV \s*+ ) (? [ 0-9]{2,12}) }x + ) + { + $type = $+{type}; + $type =~ tr{ }{}d; + $number = $+{number}; + $number =~ tr{ }{}d; + + if ( $number !~ m{ ^ \d+ $ }x ) { + next; + } + + $state = "train_no"; + + #say "$type $number"; + $map{$number} = { + raw => $type, + }; + + if ( $type eq 'ICE-A' ) { + $map{$number}{type} = 'ICE 1/2/4'; + } + elsif ( $type eq 'ICE-W' ) { + $map{$number}{type} = 'ICE 3'; + $map{$number}{short} = '3'; + } + elsif ( $type eq 'ICE' ) { + $map{$number}{type} = 'ICE 1/2/4'; + } + elsif ( $type eq 'ICE-T' ) { + $map{$number}{type} = 'ICE T'; + $map{$number}{short} = 'T'; + } + elsif ( $type eq 'IC-A' ) { + $map{$number}{type} = 'Metropolitan'; + $map{$number}{short} = 'M'; + } + elsif ( $type eq 'IC-D' ) { + $map{$number}{type} = 'IC2'; + $map{$number}{short} = '2'; + } + elsif ( $type eq 'IC-E' ) { + $map{$number}{type} = 'IC2 KISS'; + $map{$number}{short} = '2'; + } + else { + $map{$number}{type} = $type; + } + } + elsif ( $line + =~ m{ ^ (? L[NP]F T? (-[A-Z])? ) (? [ 0-9]{2,16}) }x ) + { + + # Probably not in passenger service ("Leehrfahrt") + # FWIW, LPF(T) can be further distinguished: + # LPFT-A: ICE BR 401/402 + # LPFT-B: ICE BR 412 + # LPFT-T: ICE BR 411 / 415 / 605 + # LPFT-W: ICE BR 403 / 406 + # LNF seems to be related to couchettes / sleeper cars. + #$number = undef; + #$state = ""; + + $type = $+{type}; + $type =~ tr{ }{}d; + $number = $+{number}; + $number =~ tr{ }{}d; + + if ( $number !~ m{ ^ \d+ $ }x ) { + next; + } + + $state = "train_no"; + + $map{$number} = { + raw => $type, + type => $type, + empty => \1, + }; + } + + elsif ( $state eq "train_no" and $line =~ m{ ^ \s{2,6} \S }x ) { + $state = "route"; + $map{$number}{route} = ""; + } + + # ICE types can be further distinguished by rolling stock. For instance, + # ICE 4 is the only ICE-A variant with restaurant and first-class seats + # in the same wagon ("ARmz"). + elsif ( $number and $line =~ m{^\d*\s+ARmz} ) { + if ( $map{$number}{type} eq 'ICE 1/2/4' ) { + $map{$number}{type} = 'ICE 4'; + $map{$number}{short} = '4'; + } + } + elsif ( $number and $line =~ m{^\d*\s+WRmz} ) { + + #say "ICE 1 / ICE 3 Redesign (WRmz)"; + if ( $map{$number}{type} eq 'ICE 1/2/4' ) { + $map{$number}{type} = 'ICE 1'; + $map{$number}{short} = '1'; + } + elsif ( $map{$number}{type} eq 'ICE 3' ) { + $map{$number}{type} = 'ICE 3 Redesign'; + $map{$number}{short} = '3R'; + } + } + elsif ( $number and $line =~ m{^\d*\s+WRmbsz} ) { + + #say "ICE 2 (WRmbsz)"; + if ( $map{$number}{type} eq 'ICE 1/2/4' ) { + $map{$number}{type} = 'ICE 2'; + $map{$number}{short} = '2'; + } + } + elsif ( $number and $line =~ m{^\d*\s+Bpmbsz} ) { + + #say "ICE 3 Velaro"; + if ( $map{$number}{type} eq 'ICE 3' ) { + $map{$number}{type} = 'ICE 3 Velaro'; + $map{$number}{short} = '3V'; + } + } + elsif ( $number and $line =~ m{^\d*\s+[AB][pv]m} ) { + if ( $map{$number}{type} eq 'IC2' or $map{$number}{type} eq 'IC2 KISS' ) + { + # Zugbildungsplan go home, you're drunk. Wagons of IC2 double decker + # trains must start with D (indicating that they double decker + # wagons). If an IC2 train has other wagons, it isn't really an IC2 + # train. Observed on IC 2006 Stuttgart – Dortmund, which is planned as + # IC-D, but actually uses IC1 wagons. + $map{$number}{type} = 'IC'; + delete $map{$number}{short}; + } + } + + if ( $state eq "route" and $line =~ m{ ^ \s{2,6} (? .* ) $ }x ) { + if ( length( $map{$number}{route} ) ) { + $map{$number}{route} .= ' '; + } + $map{$number}{route} .= $+{route}; + } + elsif ( $state eq "route" and $line eq "" ) { + $state = "schedules"; + } + elsif ( $state eq "schedules" and $line =~ m{ Hg | Tfz | BrH }x ) { + $state = "details"; + } + + if ( $number + and $line + =~ m{ ^ \d* \s{1,10} (? ( D? (WL)? (A|AB|B|W) R? D? [acdehimpuvw] \S* ) | 40[1-9]([.][0-9])? | 14[67]([.][0-9])? ) \s* (? \d+ )? (?: $ | \s{3} (? .* ) $ ) }x + ) + { + $state = "wagonorder"; + my $wagon_type = $+{type}; + my $wagon_number = $+{number}; + my $rest = $+{rest}; + + push( @{ $wagon_map{$number} }, [ $wagon_type, $wagon_number ] ); + + if ( $rest and $rest =~ m{\S} ) { + push( @{ $map{$number}{wagonorder_notes} }, $line ); + } + + $map{$number}{has_wagon}{$wagon_type} = \1; + } + + if ( $state eq "schedules" and length($line) and $line =~ m{ \S }x ) { + push( @{ $map{$number}{schedules} }, parse_condition($line) ); + } + + if ( $state eq "details" + and length($line) + and $line =~ m{ Hg | Tfz | BrH }x ) + { + push( @{ $map{$number}{details} }, $line ); + } +} + +for my $train ( values %map ) { + if ( not $train->{route} ) { + next; + } + + my $route = $train->{route}; + my @parts = split( qr{ - }, $route ); + + if ( @parts < 2 ) { + $train->{route} = { raw => $route }; + next; + } + my ( $pre_start, $start, @middle, $end, $post_end ); + + if ( $parts[0] =~ m{ ^ [(] }x ) { + $pre_start = shift @parts; + $pre_start =~ s{ ^ [(] (.*) [)] $ }{$1}x; + } + + if ( $parts[-1] =~ m{ ^ [(] }x ) { + $post_end = pop @parts; + $post_end =~ s{ ^ [(] (.*) [)] $ }{$1}x; + } + + $start = shift @parts; + $end = pop @parts; + + $train->{route} = { + preStart => $pre_start, + start => $start, + middle => scalar @parts ? [@parts] : undef, + end => $end, + postEnd => $post_end, + }; +} + +for my $train ( values %map ) { + if ( @{ $train->{wagonorder_notes} // [] } == 0 ) { + next; + } + my $first_line = $train->{wagonorder_notes}[0]; + my $from_offset = 0; + my $to_offset = 0; + + # International trains (e.g. EC 8) are super special and may have another + # identifier after the "cycle to" train number. + if ( $first_line + =~ m{ ^ (? .*? [^0-9] ) (? \d+ ) (? [A-Z ]+ ) (? \d+ ) (?: \s+ \d+ )? $ }x + ) + { + $from_offset = length( $+{lead} ); + $to_offset = $from_offset + length( $+{from} ) + length( $+{middle} ); + } + if ( not $from_offset ) { + next; + } + for my $i_line ( @{ $train->{wagonorder_notes} } ) { + + # $i_line is an lvalue, so changes in $i_line end up in wagonorder_notes. + # We don't want that. + my $line = substr( $i_line, 0 ); + + # Some cycles do not have a "from" entry, but they do have an odd + # F-identifier, which confuses the cycle detection code. remove it. + $line =~ s{F[0-9] [0-9]{3}}{.. ...}; + if ( length($line) <= $from_offset ) { + next; + } + my $umlauf = substr( $line, $from_offset ); + if ( $umlauf =~ m{ ^ (\d+) }x ) { + push( @{ $train->{cycle}{from} }, $1 ); + } + if ( length($line) > $to_offset ) { + $umlauf = substr( $line, $to_offset ); + if ( $umlauf =~ m{ ^ (\d+) }x ) { + push( @{ $train->{cycle}{to} }, $1 ); + } + } + } + if ( exists $train->{cycle}{from} ) { + @{ $train->{cycle}{from} } = uniq @{ $train->{cycle}{from} }; + @{ $train->{cycle}{to} } = uniq @{ $train->{cycle}{to} }; + } +} + +#for my $train_number ( keys %map ) { +# my $tr = $map{$train_number}{route} // q{}; +# $tr =~ s{ - .* - }{ - }; +# my $this = sprintf("\"%d %s\"", $train_number, $tr); +# for my $from ( @{ $map{$train_number}{umlauf}{from} // [] } ) { +# $tr = $map{$from}{route} // q{}; +# $tr =~ s{ - .* - }{ - }; +# $from = sprintf("\"%d %s\"", $from, $tr); +# say "$from -> $this;"; +# } +# for my $to ( @{ $map{$train_number}{umlauf}{to} // [] } ) { +# $tr = $map{$to}{route} // q{}; +# $tr =~ s{ -.*- }{ - }; +# $to = sprintf("\"%d %s\"", $to, $tr); +# say "$this -> $to;"; +# } +#} + +#exit 0; + +for my $train_number ( keys %map ) { + my $wagon_numbers_ok = 1; + my $wagon_types_ok = 1; + my %bincount; + my %type_by_wagon_number; + for my $wagon ( @{ $wagon_map{$train_number} // [] } ) { + my ( $wagon_type, $wagon_number ) = @{$wagon}; + if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) { + + # Locomotives do not have a wagon number. That's OK. + next; + } + elsif ( not $wagon_number ) { + $wagon_numbers_ok = 0; + } + else { + $bincount{$wagon_number}++; + push( @{ $type_by_wagon_number{$wagon_number} }, $wagon_type ); + } + } + if ($wagon_numbers_ok) { + for my $wagon ( @{ $wagon_map{$train_number} // [] } ) { + my ( $wagon_type, $wagon_number ) = @{$wagon}; + + push( @{ $map{$train_number}{wagons} }, $wagon ); + + if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) { + next; + } + + if ( $bincount{$wagon_number} > 1 ) { + if ( $type_by_wagon_number{$wagon_number}[0] ne + $type_by_wagon_number{$wagon_number}[1] ) + { + $wagon_types_ok = 0; + } + + # TODO else -> wagon_list_ok = 0 ? + } + } + } + if ( not $wagon_types_ok ) { + + # train type may differ depending on date of week or similar. We don't + # parse/handle that yet. + $map{$train_number}{type} = $map{$train_number}{raw}; + delete $map{$train_number}{short}; + delete $map{$train_number}{wagons}; + delete $map{$train_number}{has_wagon}; + } +} + +# broken umlauf (indentation changes on page break) +delete $map{104}{cycle}; +delete $map{1588}{cycle}; +delete $map{1700}{cycle}; +delete $map{77689}{cycle}; +delete $map{78112}{cycle}; + +# indentation change between consecutive lines. wat. +delete $map{939}{cycle}; +delete $map{2310}{cycle}; + +# use canonical output (i.e., sort hash keys) to allow for easy diffing. +say JSON->new->utf8->canonical->encode( + { + valid => {%valid}, + train => {%map} + } +); -- cgit v1.2.3