#!/usr/bin/env perl
# Copyright (C) 2020 Daniel Friesel
#
# SPDX-License-Identifier: BSD-2-Clause

use strict;
use warnings;
use 5.020;
use utf8;

use Encode qw(decode);
use IPC::Run3;
use JSON;
use List::Util qw(uniq);
use Travel::Status::DE::IRIS::Stations;

sub show_usage {
	my ($exit_code) = @_;

	say STDERR "Usage: db-wagenreihung-to-json <wagenreihung.pdf>\n";
	say STDERR
"You can obtain the latest PDF from <https://data.deutschebahn.com/dataset/zugbildungsplanzugbildungsplan-zpar>";

	exit $exit_code;
}

my @weekdays = (qw(Mo Di Mi Do Fr Sa So));
my @months   = (qw(0 I II III IV V VI VII VIII IX X XI XII));
my %weekday  = map { ( $weekdays[$_] => $_ ) } ( 0 .. $#weekdays );
my %month    = map { ( $months[$_] => $_ ) } ( 0 .. $#months );

my ( $valid, $valid_from, $valid_through );
my $year;

my %ds100_to_name;

for my $station ( Travel::Status::DE::IRIS::Stations::get_stations() ) {
	$ds100_to_name{ $station->[0] } = $station->[1];
}

sub parse_weekday_range {
	my ($range) = @_;
	my $ret = [ ( \0 ) x 7 ];

	if ( length($range) == 2 ) {
		$ret->[ $weekday{$range} ] = \1;
		return $ret;
	}

	my ( $start, $end ) = split( qr{-}, $range );

	for my $i ( $weekday{$start} .. $weekday{$end} ) {
		$ret->[$i] = \1;
	}

	return $ret;
}

sub parse_weekdays {
	my ($text) = @_;

	if ( $text eq 'tgl.' ) {
		return [ ( \1 ) x 7 ];
	}

	my $ret = [ ( \0 ) x 7 ];

	for my $range ( split( qr{[+]}, $text ) ) {
		my $new_ret = parse_weekday_range($range);
		for my $i ( 0 .. 6 ) {
			if ( ${ $new_ret->[$i] } ) {
				$ret->[$i] = \1;
			}
		}
	}
	return $ret;
}

sub parse_dates {
	my ($text) = @_;

	my @valid_ranges;
	my @valid_dates;
	my @invalid_dates;

	my $mode = "range";
	for my $date ( split( qr{,\s*}, $text ) ) {
		if ( $date =~ m{^auch} ) {
			$mode = "extra";
		}
		elsif ( $date =~ m{^nicht} ) {
			$mode = "except";
		}
		elsif ( $date =~ m{^(ab|bis)} ) {
			$mode = "range";
		}

		if (    $mode eq 'range'
			and $date =~ m{ ^ ab \s* (?<day> \d{2} ) \. (?<month> [IVX]* ) }x )
		{
			push( @valid_ranges, [ [ $+{month}, $+{day} ], undef ] );
		}
		elsif ( $mode eq 'range'
			and $date =~ m{ ^ bis \s* (?<day> \d{2} ) \. (?<month> [IVX]* ) }x )
		{
			push( @valid_ranges, [ undef, [ $+{month}, $+{day} ] ] );
		}
		elsif ( $date
			=~ m{ ^ (?<fd> \d{2} ) \. (?<fm> [^-]+ )\. - (?<td> \d{2} ) \. (?<tm> [^.]* ) }x
		  )
		{
			push( @valid_ranges, [ [ $+{fm}, $+{fd} ], [ $+{tm}, $+{td} ] ] );
		}
		elsif ( $mode eq 'extra'
			and $date =~ m{ (?<day> \d{2} ) \. (?<month> [IVX]* ) }x )
		{
			push( @valid_dates, [ $+{month}, $+{day} ] );
		}
		elsif ( $mode eq 'except'
			and $date =~ m{ (?<day> \d{2} ) \. (?<month> [IVX]* ) }x )
		{
			push( @invalid_dates, [ $+{month}, $+{day} ] );
		}
	}

	my %ret = (
		valid   => [],
		invalid => [],
	);

	for my $range (@valid_ranges) {
		my $from_date    = $valid_from;
		my $through_date = $valid_through;
		if ( $range->[0] ) {
			$from_date = sprintf( '%04d-%02d-%02d',
				$year, $month{ $range->[0][0] },
				$range->[0][1] );
		}
		if ( $range->[1] ) {
			$through_date = sprintf( '%04d-%02d-%02d',
				$year, $month{ $range->[1][0] },
				$range->[1][1] );
		}
		push( @{ $ret{valid} }, "${from_date}/${through_date}" );
	}

	# for several dates of the same month, only the last date has the month set
	# (e.g. "24., 31.XII."). Walk through the list in reverse so ensure the
	# month is always available.
	my $month = undef;
	for my $date ( reverse @valid_dates ) {
		if ( not $date->[0] and not $month ) {
			say STDERR "Skipping unhandled valid date in \"$text\"";
			next;
		}

		# TODO Winterfahrplan: Use $year-1 for month XII
		push(
			@{ $ret{valid} },
			sprintf( '%04d-%02d-%02d',
				$year, $month{ $date->[0] || $month },
				$date->[1] )
		);
		if ( $date->[0] ) {
			$month = $date->[0];
		}
	}
	$month = undef;
	for my $date ( reverse @invalid_dates ) {
		if ( not $date->[0] and not $month ) {
			say STDERR "Skipping unhandled invalid date in \"$text\"";
			next;
		}

		# TODO Winterfahrplan: Use $year-1 for month XII
		push(
			@{ $ret{invalid} },
			sprintf( '%04d-%02d-%02d',
				$year, $month{ $date->[0] || $month },
				$date->[1] )
		);
		if ( $date->[0] ) {
			$month = $date->[0];
		}
	}

	return %ret;
}

sub parse_condition {
	my ($line) = @_;
	my %ret;

	if ( $line
		=~ m{ ^ \s* (?<from> .*? ) \s - \s (?<to> [^,]+ ) , \s* (?<unknown> N \s* )? (?<weekdays> (Mo|Di|Mi|Do|Fr|Sa|So|tgl[.]|[+]|-)+) \s* (?<dates> .* ) $ }x
	  )
	{
		$ret{from}     = $+{from};
		$ret{to}       = $+{to};
		$ret{weekdays} = parse_weekdays( $+{weekdays} );
		my %validity = parse_dates( $+{dates} );
		while ( my ( $k, $v ) = each %validity ) {
			$ret{$k} = $v;
		}
		$ret{raw} = $line;
	}
	else {
		$ret{raw} = $line;
	}

	return {%ret};
}

if ( @ARGV == 0 ) {
	show_usage(1);
}

run3( [ "which", "pdftotext" ], \undef, \undef, \undef );
if ($?) {
	say STDERR
"Error: db-wagenreihung-to-json depends on the pdftotext utility, but could not find it on this system.\n";
	say STDERR
"On Debian and derivatives, it should be available in the 'poppler-utils' package.";
	exit 2;
}

my $wr_pdf  = decode( 'UTF-8', $ARGV[0] );
my $wr_name = ( split( qr{/}, $wr_pdf ) )[-1];
my $wr_txt;
my $command = [ "pdftotext", "-layout", "-nopgbrk", $wr_pdf, "/dev/stdout" ];

# stdin: /dev/null
# stdout: $wr_txt
# stderr: forwarded to terminal
run3( $command, \undef, \$wr_txt, undef,
	{ binmode_stdout => ':encoding(utf-8)' } );

if ($?) {
	my $exit_status = $? >> 8;
	say STDERR
	  "\nError: 'pdftotext $wr_pdf' failed (exit status: $exit_status)";
	exit 2;
}

my @lines = split( qr{\n}, $wr_txt );

my ( $type, $number );
my %map;

my $state = "intro";

for my $line (@lines) {

	if ( $line
		=~ m{ ^ \s* Gültigkeit: \s* (?<fd> \d{2} ) \. (?<fm> \d{2} ) \. (?<fy> \d{4} ) \s* – \s* (?<td> \d{2} ) \. (?<tm> \d{2} ) \. (?<ty> \d{4} ) }x
	  )
	{
		$valid_from    = sprintf( "%04d-%02d-%02d", $+{fy}, $+{fm}, $+{fd} );
		$valid_through = sprintf( "%04d-%02d-%02d", $+{ty}, $+{tm}, $+{td} );
		$valid         = "${valid_from}/${valid_through}";
		$year          = $+{ty};
	}

	if ( $state eq "intro" ) {
		if ( $line =~ m{ ^ [A-Z] [.] \s* Zugbildungen $ }x ) {
			$state = "";
		}
		else {
			next;
		}
	}

	if ( $line eq 'F. Anhang V: Beschilderungsplan' ) {
		last;
	}

	if ( $line
		=~ m{ ^ (?<type> [EINR][CJN][ A-Z-]{6} | TGV \s*+ ) (?<number> [ 0-9]{2,12}) (?: \s+ (?<name> \S .* ) )? }x
	  )
	{
		$type = $+{type};
		$type =~ tr{ }{}d;
		$number = $+{number};
		$number =~ tr{ }{}d;

		my $name = $+{name};

		if ( $number !~ m{ ^ \d+ $ }x ) {
			next;
		}

		$state = "train_no";

		#say "$type $number";

		if ( exists $map{$number} ) {

			# train numbers are not unique (they may have several wagon
			# orders with non-intersecting date ranges)
			my @dup = @{ $map{$number}{dup} // [] };
			delete $map{$number}{dup};
			push( @dup, $map{$number} );
			$map{$number} = {
				rawType => $type,
				dup     => [@dup],
			};
		}
		else {
			$map{$number} = {
				rawType => $type,
			};
		}

		if ($name) {
			$map{$number}{name} = $name;
		}

		if ( $type eq 'ICE-A' ) {
			$map{$number}{type} = 'ICE 1/2/4';
		}
		elsif ( $type eq 'ICE-W' ) {
			$map{$number}{type}      = 'ICE 3';
			$map{$number}{shortType} = '3';
		}
		elsif ( $type eq 'ICE' ) {
			$map{$number}{type} = 'ICE 1/2/4';
		}
		elsif ( $type eq 'ICE-T' ) {
			$map{$number}{type}      = 'ICE T';
			$map{$number}{shortType} = 'T';
		}
		elsif ( $type eq 'IC-A' ) {
			$map{$number}{type}      = 'Metropolitan';
			$map{$number}{shortType} = 'M';
		}
		elsif ( $type eq 'IC-D' ) {
			$map{$number}{type}      = 'IC2 Twindexx';
			$map{$number}{shortType} = '2';
		}
		elsif ( $type eq 'IC-E' ) {
			$map{$number}{type}      = 'IC2 KISS';
			$map{$number}{shortType} = '2';
		}
		else {
			$map{$number}{type} = $type;
		}
	}
	elsif ( $line
		=~ m{ ^ (?<type> L[NP]F T? (-[A-Z])? ) (?<number> [ 0-9]{2,16}) }x )
	{

		# Probably not in passenger service ("Leehrfahrt")
		# FWIW, LPF(T) can be further distinguished:
		# LPFT-A: ICE BR 401/402
		# LPFT-B: ICE BR 412
		# LPFT-T: ICE BR 411 / 415 / 605
		# LPFT-W: ICE BR 403 / 406
		# LNF seems to be related to couchettes / sleeper cars.
		#$number = undef;
		#$state  = "";

		$type = $+{type};
		$type =~ tr{ }{}d;
		$number = $+{number};
		$number =~ tr{ }{}d;

		if ( $number !~ m{ ^ \d+ $ }x ) {
			next;
		}

		$state = "train_no";

		$map{$number} = {
			rawType => $type,
			type    => $type,
			empty   => \1,
		};
	}

	elsif ( $state eq "train_no" and $line =~ m{ ^ \s{2,6} \S }x ) {
		$state = "route";
		$map{$number}{route} = "";
	}

	# ICE types can be further distinguished by rolling stock. For instance,
	# ICE 4 is the only ICE-A variant with restaurant and first-class seats
	# in the same wagon ("ARmz").
	elsif ( $number and $line =~ m{^\d*\s+ARmz} ) {
		if ( $map{$number}{type} eq 'ICE 1/2/4' ) {
			$map{$number}{type}      = 'ICE 4';
			$map{$number}{shortType} = '4';
		}
	}
	elsif ( $number and $line =~ m{^\d*\s+WRmz} ) {

		#say "ICE 1 / ICE 3 Redesign (WRmz)";
		if ( $map{$number}{type} eq 'ICE 1/2/4' ) {
			$map{$number}{type}      = 'ICE 1';
			$map{$number}{shortType} = '1';
		}
		elsif ( $map{$number}{type} eq 'ICE 3' ) {
			$map{$number}{type}      = 'ICE 3 Redesign';
			$map{$number}{shortType} = '3R';
		}
	}
	elsif ( $number and $line =~ m{^\d*\s+WRmbsz} ) {

		#say "ICE 2 (WRmbsz)";
		if ( $map{$number}{type} eq 'ICE 1/2/4' ) {
			$map{$number}{type}      = 'ICE 2';
			$map{$number}{shortType} = '2';
		}
	}
	elsif ( $number and $line =~ m{^\d*\s+Bpmbsz} ) {

		#say "ICE 3 Velaro";
		if ( $map{$number}{type} eq 'ICE 3' ) {
			$map{$number}{type}      = 'ICE 3 Velaro';
			$map{$number}{shortType} = '3V';
		}
	}
	elsif ( $number and $line =~ m{^\d*\s+[AB][pv]m} ) {
		if (   $map{$number}{type} eq 'IC2 Twindexx'
			or $map{$number}{type} eq 'IC2 KISS' )
		{
        # Zugbildungsplan go home, you're drunk. Wagons of IC2 double decker
        # trains must start with D (indicating that they double decker
        # wagons). If an IC2 train has other wagons, it isn't really an IC2
        # train. Observed on IC 2006 Stuttgart – Dortmund, which is planned as
        # IC-D, but actually uses IC1 wagons.
			$map{$number}{type} = 'IC';
			delete $map{$number}{shortType};
		}
	}

	if ( $state eq "route" and $line =~ m{ ^ \s{2,6} (?<route> .* ) $ }x ) {
		if ( length( $map{$number}{route} ) ) {
			$map{$number}{route} .= ' ';
		}
		$map{$number}{route} .= $+{route};
	}
	elsif ( $state eq "route" and $line eq "" ) {
		$state = "schedules";
	}
	elsif ( $state eq "schedules" and $line =~ m{ Hg | Tfz | BrH }x ) {
		$state = "details";
	}

	if (    $number
		and $line
		=~ m{ ^ \d* \s{1,10} (?<type> ( D? (WL)? (A|AB|B|W) R? D? [acdehimpuvw] \S* ) | 40[1-9]([.][0-9])? | 14[67]([.][0-9])? ) \s* (?<number> \d+ )? (?: $ | \s{3} (?<rest> .* ) $ ) }x
	  )
	{
		$state = "wagonorder";
		my $wagon_type   = $+{type};
		my $wagon_number = $+{number};
		my $rest         = $+{rest};

		push( @{ $map{$number}{raw_wagons} }, [ $wagon_type, $wagon_number ] );

		if ( $rest and $rest =~ m{\S} ) {
			push( @{ $map{$number}{wagonorder_notes} }, $line );
		}

		$map{$number}{hasWagon}{$wagon_type} = \1;
	}

	if ( $state eq "schedules" and length($line) and $line =~ m{ \S }x ) {
		push( @{ $map{$number}{schedules} }, parse_condition($line) );
	}

	if (    $state eq "details"
		and length($line)
		and $line =~ m{ Hg | Tfz | BrH }x )
	{
		push( @{ $map{$number}{details} }, $line );
	}
}

for my $train_no ( keys %map ) {
	my @entries = @{ $map{$train_no}{dup} // [] };
	delete $map{$train_no}{dup};
	push( @entries, $map{$train_no} );
	$map{$train_no} = [@entries];
}

for my $trains ( values %map ) {
	for my $train ( @{$trains} ) {
		if ( not $train->{route} ) {
			next;
		}

		my $route = $train->{route};
		my @parts = split( qr{ - }, $route );

		if ( @parts < 2 ) {
			$train->{route} = { raw => $route };
			next;
		}
		my ( $pre_start, $start, @middle, $end, $post_end );

		if ( $parts[0] =~ m{ ^ [(] }x ) {
			$pre_start = shift @parts;
			$pre_start =~ s{ ^ [(] (.*) [)] $ }{$1}x;
		}

		if ( $parts[-1] =~ m{ ^ [(] }x ) {
			$post_end = pop @parts;
			$post_end =~ s{ ^ [(] (.*) [)] $ }{$1}x;
		}

		$start = shift @parts;
		$end   = pop @parts;

		$train->{route} = {
			preStart => $pre_start,
			start    => $start,
			middle   => scalar @parts ? [@parts] : undef,
			end      => $end,
			postEnd  => $post_end,
		};

		for my $k ( keys %{ $train->{route} } ) {
			if ( not defined $train->{route}{$k} ) {

				# avoid null values (leave out the property instead)
				delete $train->{route}{$k};
			}
		}
	}
}

for my $trains ( values %map ) {
	for my $train ( @{$trains} ) {
		if ( @{ $train->{wagonorder_notes} // [] } == 0 ) {
			next;
		}
		my $first_line  = $train->{wagonorder_notes}[0];
		my $from_offset = 0;
		my $to_offset   = 0;

       # International trains (e.g. EC 8) are super special and may have another
       # identifier after the "cycle to" train number.
		if ( $first_line
			=~ m{ ^ (?<lead> .*? [^0-9] ) (?<from> \d+ ) (?<middle> [A-Z ]+ ) (?<to> \d+ ) (?: \s+ \d+ )? $ }x
		  )
		{
			$from_offset = length( $+{lead} );
			$to_offset
			  = $from_offset + length( $+{from} ) + length( $+{middle} );
		}
		if ( not $from_offset ) {
			next;
		}
		my $cycle_id = "unknown";
		for my $i_line ( @{ $train->{wagonorder_notes} } ) {

       # $i_line is an lvalue, so changes in $i_line end up in wagonorder_notes.
       # We don't want that.
			my $line = substr( $i_line, 0 );

        # Fx abc appears to identify cycle groups.
        # It also confuses the cycle detection code for cycles which do not have
        # a "from" entry.
			if ( $line =~ s{(F[0-9] [0-9]{3})}{.. ...} ) {
				$cycle_id = $1;
			}
			if ( length($line) <= $from_offset ) {
				next;
			}
			my $umlauf = substr( $line, $from_offset );
			if ( $umlauf =~ m{ ^ (\d+) }x ) {
				push( @{ $train->{cycle}{$cycle_id}{from} }, $1 );
			}
			if ( length($line) > $to_offset ) {
				$umlauf = substr( $line, $to_offset );
				if ( $umlauf =~ m{ ^ (\d+) }x ) {
					push( @{ $train->{cycle}{$cycle_id}{to} }, $1 );
				}
			}
		}
		for my $cycle ( values %{ $train->{cycle} // {} } ) {
			@{ $cycle->{from} } = uniq @{ $cycle->{from} // [] };
			@{ $cycle->{to} }   = uniq @{ $cycle->{to}   // [] };
		}
	}
}

for my $trains ( values %map ) {
	for my $train ( @{$trains} ) {

		if ( not $train->{details} ) {
			next;
		}

		my @details = @{ $train->{details} // [] };
		my %common_setup;
		my @setups;

		for my $line (@details) {
			my %setup;
			if ( $line =~ m{ Tfz1:(\d+) }x ) {
				$setup{series} = $1;
			}
			if ( $line =~ m{ Tfz2:(\d+) }x ) {
				$setup{series2} = $1;
			}
			if ( $line =~ m{ Hg(\d+) }x ) {
				$setup{vmax} = 0 + $1;
			}
			if ( $line =~ m{ BrH(\d+) }x ) {
				$setup{brakingPercentage} = 0 + $1;
			}
			if ( $line =~ m{ (\d+)m }x ) {
				$setup{length} = 0 + $1;
			}
			if ( %setup and $line =~ m{ ^ \s* ([+]? [A-Z ]{1,7}) }x ) {
				my $station = $1;
				$station =~ s{\s+$}{};
				my $special = ( $station =~ s{^[+]}{} );
				if ( $ds100_to_name{$station} ) {
					$station = $ds100_to_name{$station};
				}
				if ($special) {
					$station = "+${station}";
				}
				$setup{station} = $station;
			}
			push( @setups, {%setup} );
		}

		for my $key (qw(brakingPercentage length series series2 vmax)) {
			if ( ( my @uniq = uniq grep { $_ } map { $_->{$key} } @setups )
				== 1 )
			{
				$common_setup{$key}
				  = $key =~ m{series} ? $uniq[0] : 0 + $uniq[0];
			}
		}

		delete $train->{details};
		if (%common_setup) {
			$train->{commonAttr} = {%common_setup};
		}
		$train->{attrVariants} = [@setups];
	}
}

for my $trains ( values %map ) {
	for my $train ( @{$trains} ) {
		my $wagon_numbers_ok = 1;
		my $wagon_types_ok   = 1;
		my $wagon_list_ok    = 1;
		my %bincount;
		my %type_by_wagon_number;
		for my $wagon ( @{ $train->{raw_wagons} // [] } ) {
			my ( $wagon_type, $wagon_number ) = @{$wagon};
			if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) {

				# Locomotives do not have a wagon number. That's OK.
				next;
			}
			elsif ( not $wagon_number ) {
				$wagon_numbers_ok = 0;
			}
			else {
				$bincount{$wagon_number}++;
				push( @{ $type_by_wagon_number{$wagon_number} }, $wagon_type );
			}
		}
		if ($wagon_numbers_ok) {
			for my $wagon ( @{ $train->{raw_wagons} // [] } ) {
				my ( $wagon_type, $wagon_number ) = @{$wagon};

				my $json_wagon = { type => $wagon_type };
				if ($wagon_number) {
					$json_wagon->{number} = 0 + $wagon_number;
				}

				push( @{ $train->{wagons} }, $json_wagon );

				if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) {
					next;
				}

				if ( $bincount{$wagon_number} > 1 ) {
					$wagon_list_ok = 0;
					if ( $type_by_wagon_number{$wagon_number}[0] ne
						$type_by_wagon_number{$wagon_number}[1] )
					{
						$wagon_types_ok = 0;
					}
				}
			}
		}
		if ( not $wagon_types_ok ) {

          # train type may differ depending on date of week or similar. We don't
          # parse/handle that yet.
			$train->{type} = $train->{rawType};
			delete $train->{shortType};
			delete $train->{wagons};
			delete $train->{hasWagon};
		}
		elsif ( not $wagon_list_ok ) {

			# train type appears to be correct, but the list contains duplicates
			# (e.g. due to unhandled schedule variants)
			delete $train->{wagons};
		}
	}
}

# Cleanup

for my $trains ( values %map ) {
	for my $train ( @{$trains} ) {
		delete $train->{raw_wagons};
		delete $train->{wagonorder_notes};
	}
}

for my $bork (qw(104 1588 1700 77689 78112 939 2310)) {
	for my $train ( @{ $map{$bork} } ) {
		delete $train->{cycle};
	}
}

# use canonical output (i.e., sort hash keys) to allow for easy diffing.
say JSON->new->utf8->canonical->encode(
	{
		deprecated     => \0,
		source         => $wr_name,
		train_variants => {%map},
		valid          => $valid,
	}
);