#!/usr/bin/env perl # Copyright (C) 2020 Daniel Friesel # # SPDX-License-Identifier: BSD-2-Clause use strict; use warnings; use 5.020; use utf8; use File::Slurp qw(write_file); use IPC::Run3; use JSON; sub show_usage { my ($exit_code) = @_; say STDERR "Usage: db-wagenreihung-to-json \n"; say STDERR "You can obtain the latest PDF from "; exit $exit_code; } my @weekdays = (qw(Mo Di Mi Do Fr Sa So)); my @months = (qw(0 I II III IV V VI VII VIII IX X XI XII)); my %weekday = map { ( $weekdays[$_] => $_ ) } ( 0 .. $#weekdays ); my %month = map { ( $months[$_] => $_ ) } ( 0 .. $#months ); my %valid; my $year; sub parse_weekday_range { my ($range) = @_; my $ret = [ ( \0 ) x 7 ]; if ( length($range) == 2 ) { $ret->[ $weekday{$range} ] = \1; return $ret; } my ( $start, $end ) = split( qr{-}, $range ); for my $i ( $weekday{$start} .. $weekday{$end} ) { $ret->[$i] = \1; } return $ret; } sub parse_weekdays { my ($text) = @_; if ( $text eq 'tgl.' ) { return [ ( \1 ) x 7 ]; } my $ret = [ ( \0 ) x 7 ]; for my $range ( split( qr{[+]}, $text ) ) { my $new_ret = parse_weekday_range($range); for my $i ( 0 .. 6 ) { if ( ${ $new_ret->[$i] } ) { $ret->[$i] = \1; } } } return $ret; } sub parse_dates { my ($text) = @_; my @valid_ranges; my @valid_dates; my @invalid_dates; my $mode = "range"; for my $date ( split( qr{,\s*}, $text ) ) { if ( $date =~ m{^auch} ) { $mode = "extra"; } elsif ( $date =~ m{^nicht} ) { $mode = "except"; } elsif ( $date =~ m{^(ab|bis)} ) { $mode = "range"; } if ( $mode eq 'range' and $date =~ m{ ^ ab \s* (? \d{2} ) \. (? [^.]* ) }x ) { push( @valid_ranges, [ [ $+{month}, $+{day} ], undef ] ); } elsif ( $mode eq 'range' and $date =~ m{ ^ bis \s* (? \d{2} ) \. (? [^.]* ) }x ) { push( @valid_ranges, [ undef, [ $+{month}, $+{day} ] ] ); } elsif ( $date =~ m{ ^ (? \d{2} ) \. (? [^-]+ )\. - (? \d{2} ) \. (? [^.]* ) }x ) { push( @valid_ranges, [ [ $+{fm}, $+{fd} ], [ $+{tm}, $+{td} ] ] ); } } my %ret = ( valid => [], invalid => [], ); for my $range (@valid_ranges) { my $from_date = $valid{from}; my $through_date = $valid{through}; if ( $range->[0] ) { $from_date = sprintf( '%04d-%02d-%02d', $year, $month{ $range->[0][0] }, $range->[0][1] ); } if ( $range->[1] ) { $through_date = sprintf( '%04d-%02d-%02d', $year, $month{ $range->[1][0] }, $range->[1][1] ); } push( @{ $ret{valid} }, "${from_date}/${through_date}" ); } return %ret; } sub parse_condition { my ($line) = @_; my %ret; if ( $line =~ m{ ^ \s* (? .*? ) \s - \s (? [^,]+ ) , \s* (? (Mo|Di|Mi|Do|Fr|Sa|So|tgl[.]|[+]|-)+) \s* (? .* ) $ }x ) { $ret{from} = $+{from}; $ret{to} = $+{to}; $ret{weekdays} = parse_weekdays( $+{weekdays} ); my %validity = parse_dates( $+{dates} ); while ( my ( $k, $v ) = each %validity ) { $ret{$k} = $v; } $ret{raw} = $line; } else { $ret{raw} = $line; } return {%ret}; } if ( @ARGV == 0 ) { show_usage(1); } run3( [ "which", "pdftotext" ], \undef, \undef, \undef ); if ($?) { say STDERR "Error: db-wagenreihung-to-json depends on the pdftotext utility, but could not find it on this system.\n"; say STDERR "On Debian and derivatives, it should be available in the 'poppler-utils' package."; exit 2; } my $wr_pdf = $ARGV[0]; my $wr_txt; my $command = [ "pdftotext", "-layout", "-nopgbrk", $wr_pdf, "/dev/stdout" ]; # stdin: /dev/null # stdout: $wr_txt # stderr: forwarded to terminal run3( $command, \undef, \$wr_txt, undef, { binmode_stdout => ':encoding(utf-8)' } ); if ($?) { my $exit_status = $? >> 8; say STDERR "\nError: 'pdftotext $wr_pdf' failed (exit status: $exit_status)"; exit 2; } my @lines = split( qr{\n}, $wr_txt ); my ( $type, $number ); my %map; my %wagon_map; my $state = "intro"; for my $line (@lines) { if ( $line =~ m{ ^ \s* Gültigkeit: \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) \s* – \s* (? \d{2} ) \. (? \d{2} ) \. (? \d{4} ) }x ) { %valid = ( "from" => sprintf( "%04d-%02d-%02d", $+{fy}, $+{fm}, $+{fd} ), "through" => sprintf( "%04d-%02d-%02d", $+{ty}, $+{tm}, $+{td} ), ); $year = $+{ty}; $map{valid} = {%valid}; } if ( $state eq "intro" ) { if ( $line =~ m{ ^ [A-Z] [.] \s* Zugbildungen $ }x ) { $state = ""; } else { next; } } if ( $line =~ m{ ^ (? [EINR][CJN][ A-Z-]{6}) (? [ 0-9]{2,12}) }x ) { $type = $+{type}; $type =~ tr{ }{}d; $number = $+{number}; $number =~ tr{ }{}d; if ( $number !~ m{ ^ \d+ $ }x ) { next; } $state = "train_no"; #say "$type $number"; $map{$number} = { raw => $type, }; if ( $type eq 'ICE-A' ) { $map{$number}{type} = 'ICE 1/2/4'; } elsif ( $type eq 'ICE-W' ) { $map{$number}{type} = 'ICE 3'; $map{$number}{short} = '3'; } elsif ( $type eq 'ICE' ) { $map{$number}{type} = 'ICE 1/2/4'; } elsif ( $type eq 'ICE-T' ) { $map{$number}{type} = 'ICE T'; $map{$number}{short} = 'T'; } elsif ( $type eq 'IC-A' ) { $map{$number}{type} = 'Metropolitan'; $map{$number}{short} = 'M'; } elsif ( $type eq 'IC-D' ) { $map{$number}{type} = 'IC2'; $map{$number}{short} = '2'; } elsif ( $type eq 'IC-E' ) { $map{$number}{type} = 'IC2 KISS'; $map{$number}{short} = '2'; } else { $map{$number}{type} = $type; } } elsif ( $line =~ m{^LPF} ) { # Not in passenger service ("Leehrfahrt"). Ignore it for now. # FWIW, LPF(T) can be further distinguished: # LPFT-A: ICE BR 401/402 # LPFT-B: ICE BR 412 # LPFT-T: ICE BR 411 / 415 / 605 # LPFT-W: ICE BR 403 / 406 $number = undef; $state = ""; } elsif ( $state eq "train_no" and $line =~ m{ ^ \s{5} \S }x ) { $state = "route"; $map{$number}{route} = ""; } # ICE types can be further distinguished by rolling stock. For instance, # ICE 4 is the only ICE-A variant with restaurant and first-class seats # in the same wagon ("ARmz"). elsif ( $number and $line =~ m{^\d*\s+ARmz} ) { if ( $map{$number}{type} eq 'ICE 1/2/4' ) { $map{$number}{type} = 'ICE 4'; $map{$number}{short} = '4'; } } elsif ( $number and $line =~ m{^\d*\s+WRmz} ) { #say "ICE 1 / ICE 3 Redesign (WRmz)"; if ( $map{$number}{type} eq 'ICE 1/2/4' ) { $map{$number}{type} = 'ICE 1'; $map{$number}{short} = '1'; } elsif ( $map{$number}{type} eq 'ICE 3' ) { $map{$number}{type} = 'ICE 3 Redesign'; $map{$number}{short} = '3R'; } } elsif ( $number and $line =~ m{^\d*\s+WRmbsz} ) { #say "ICE 2 (WRmbsz)"; if ( $map{$number}{type} eq 'ICE 1/2/4' ) { $map{$number}{type} = 'ICE 2'; $map{$number}{short} = '2'; } } elsif ( $number and $line =~ m{^\d*\s+Bpmbsz} ) { #say "ICE 3 Velaro"; if ( $map{$number}{type} eq 'ICE 3' ) { $map{$number}{type} = 'ICE 3 Velaro'; $map{$number}{short} = '3V'; } } if ( $state eq "route" and $line =~ m{ ^ \s{5} (? .* ) $ }x ) { if ( length( $map{$number}{route} ) ) { $map{$number}{route} .= ' '; } $map{$number}{route} .= $+{route}; } elsif ( $state eq "route" and $line eq "" ) { $state = "conditions"; } elsif ( $state eq "conditions" and $line =~ m{ Hg | Tfz | BrH }x ) { $state = "details"; } if ( $number and $line =~ m{ ^ \d* \s{1,10} (? D? (WL)? (A|AB|B|W) R? D? [aceimpvw] \S* ) \s* (? \d+ )? (?: $ | \s{6} ) }x ) { $state = "wagonorder"; push( @{ $wagon_map{$number} }, [ $+{type}, $+{number} ] ); #$map{$number}{has_wagon}{$1} = \1; $map{$number}{wagons}{$1} = \1; } if ( $state eq "conditions" and length($line) and $line =~ m{ \S }x ) { push( @{ $map{$number}{conditions} }, parse_condition($line) ); } if ( $state eq "details" and length($line) and $line =~ m{ Hg | Tfz | BrH }x ) { push( @{ $map{$number}{details} }, $line ); } } for my $train_number ( keys %map ) { my $wagon_numbers_ok = 1; my $wagon_types_ok = 1; my %bincount; my %type_by_wagon_number; for my $wagon ( @{ $wagon_map{$train_number} // [] } ) { my ( $wagon_type, $wagon_number ) = @{$wagon}; if ( not $wagon_number ) { $wagon_numbers_ok = 0; } else { $bincount{$wagon_number}++; push( @{ $type_by_wagon_number{$wagon_number} }, $wagon_type ); } } if ($wagon_numbers_ok) { for my $wagon ( @{ $wagon_map{$train_number} // [] } ) { my ( $wagon_type, $wagon_number ) = @{$wagon}; $map{$train_number}{wagon}{ $wagon->[1] } = $wagon_type; if ( $bincount{$wagon_number} > 1 ) { if ( $type_by_wagon_number{$wagon_number}[0] ne $type_by_wagon_number{$wagon_number}[1] ) { $wagon_types_ok = 0; } } } } if ( not $wagon_types_ok ) { # train type may differ depending on date of week or similar. We don't # parse/handle that yet. delete $map{$train_number}{type}; delete $map{$train_number}{short}; delete $map{$train_number}{wagon}; delete $map{$train_number}{has_wagon}; } } # use canonical output (i.e., sort hash keys) to allow for easy diffing. #say JSON->new->utf8->canonical->encode( {valid => {%valid}, train => {%map}} ); say JSON->new->utf8->canonical->encode( {%map} );