#!/usr/bin/env perl
# Copyright (C) 2020 Daniel Friesel
#
# SPDX-License-Identifier: BSD-2-Clause

use strict;
use warnings;
use 5.020;
use utf8;

use File::Slurp qw(write_file);
use IPC::Run3;
use JSON;
use List::Util qw(uniq);

sub show_usage {
	my ($exit_code) = @_;

	say STDERR "Usage: db-wagenreihung-to-json <wagenreihung.pdf>\n";
	say STDERR
"You can obtain the latest PDF from <https://data.deutschebahn.com/dataset/zugbildungsplanzugbildungsplan-zpar>";

	exit $exit_code;
}

my @weekdays = (qw(Mo Di Mi Do Fr Sa So));
my @months   = (qw(0 I II III IV V VI VII VIII IX X XI XII));
my %weekday  = map { ( $weekdays[$_] => $_ ) } ( 0 .. $#weekdays );
my %month    = map { ( $months[$_] => $_ ) } ( 0 .. $#months );

my %valid;
my $year;

sub parse_weekday_range {
	my ($range) = @_;
	my $ret = [ ( \0 ) x 7 ];

	if ( length($range) == 2 ) {
		$ret->[ $weekday{$range} ] = \1;
		return $ret;
	}

	my ( $start, $end ) = split( qr{-}, $range );

	for my $i ( $weekday{$start} .. $weekday{$end} ) {
		$ret->[$i] = \1;
	}

	return $ret;
}

sub parse_weekdays {
	my ($text) = @_;

	if ( $text eq 'tgl.' ) {
		return [ ( \1 ) x 7 ];
	}

	my $ret = [ ( \0 ) x 7 ];

	for my $range ( split( qr{[+]}, $text ) ) {
		my $new_ret = parse_weekday_range($range);
		for my $i ( 0 .. 6 ) {
			if ( ${ $new_ret->[$i] } ) {
				$ret->[$i] = \1;
			}
		}
	}
	return $ret;
}

sub parse_dates {
	my ($text) = @_;

	my @valid_ranges;
	my @valid_dates;
	my @invalid_dates;

	my $mode = "range";
	for my $date ( split( qr{,\s*}, $text ) ) {
		if ( $date =~ m{^auch} ) {
			$mode = "extra";
		}
		elsif ( $date =~ m{^nicht} ) {
			$mode = "except";
		}
		elsif ( $date =~ m{^(ab|bis)} ) {
			$mode = "range";
		}

		if (    $mode eq 'range'
			and $date =~ m{ ^ ab \s* (?<day> \d{2} ) \. (?<month> [^.]* ) }x )
		{
			push( @valid_ranges, [ [ $+{month}, $+{day} ], undef ] );
		}
		elsif ( $mode eq 'range'
			and $date =~ m{ ^ bis \s* (?<day> \d{2} ) \. (?<month> [^.]* ) }x )
		{
			push( @valid_ranges, [ undef, [ $+{month}, $+{day} ] ] );
		}
		elsif ( $date
			=~ m{ ^ (?<fd> \d{2} ) \. (?<fm> [^-]+ )\. - (?<td> \d{2} ) \. (?<tm> [^.]* ) }x
		  )
		{
			push( @valid_ranges, [ [ $+{fm}, $+{fd} ], [ $+{tm}, $+{td} ] ] );
		}
	}

	my %ret = (
		valid   => [],
		invalid => [],
	);

	for my $range (@valid_ranges) {
		my $from_date    = $valid{from};
		my $through_date = $valid{through};
		if ( $range->[0] ) {
			$from_date = sprintf( '%04d-%02d-%02d',
				$year, $month{ $range->[0][0] },
				$range->[0][1] );
		}
		if ( $range->[1] ) {
			$through_date = sprintf( '%04d-%02d-%02d',
				$year, $month{ $range->[1][0] },
				$range->[1][1] );
		}
		push( @{ $ret{valid} }, "${from_date}/${through_date}" );
	}

	return %ret;
}

sub parse_condition {
	my ($line) = @_;
	my %ret;

	if ( $line
		=~ m{ ^ \s* (?<from> .*? ) \s - \s (?<to> [^,]+ ) , \s* (?<weekdays> (Mo|Di|Mi|Do|Fr|Sa|So|tgl[.]|[+]|-)+) \s* (?<dates> .* ) $ }x
	  )
	{
		$ret{from}     = $+{from};
		$ret{to}       = $+{to};
		$ret{weekdays} = parse_weekdays( $+{weekdays} );
		my %validity = parse_dates( $+{dates} );
		while ( my ( $k, $v ) = each %validity ) {
			$ret{$k} = $v;
		}
		$ret{raw} = $line;
	}
	else {
		$ret{raw} = $line;
	}

	return {%ret};
}

if ( @ARGV == 0 ) {
	show_usage(1);
}

run3( [ "which", "pdftotext" ], \undef, \undef, \undef );
if ($?) {
	say STDERR
"Error: db-wagenreihung-to-json depends on the pdftotext utility, but could not find it on this system.\n";
	say STDERR
"On Debian and derivatives, it should be available in the 'poppler-utils' package.";
	exit 2;
}

my $wr_pdf = $ARGV[0];
my $wr_txt;
my $command = [ "pdftotext", "-layout", "-nopgbrk", $wr_pdf, "/dev/stdout" ];

# stdin: /dev/null
# stdout: $wr_txt
# stderr: forwarded to terminal
run3( $command, \undef, \$wr_txt, undef,
	{ binmode_stdout => ':encoding(utf-8)' } );

if ($?) {
	my $exit_status = $? >> 8;
	say STDERR
	  "\nError: 'pdftotext $wr_pdf' failed (exit status: $exit_status)";
	exit 2;
}

my @lines = split( qr{\n}, $wr_txt );

my ( $type, $number );
my %map;
my %wagon_map;

my $state = "intro";

for my $line (@lines) {

	if ( $line
		=~ m{ ^ \s* Gültigkeit: \s* (?<fd> \d{2} ) \. (?<fm> \d{2} ) \. (?<fy> \d{4} ) \s* – \s* (?<td> \d{2} ) \. (?<tm> \d{2} ) \. (?<ty> \d{4} ) }x
	  )
	{
		%valid = (
			"from"    => sprintf( "%04d-%02d-%02d", $+{fy}, $+{fm}, $+{fd} ),
			"through" => sprintf( "%04d-%02d-%02d", $+{ty}, $+{tm}, $+{td} ),
		);
		$year = $+{ty};
	}

	if ( $state eq "intro" ) {
		if ( $line =~ m{ ^ [A-Z] [.] \s* Zugbildungen $ }x ) {
			$state = "";
		}
		else {
			next;
		}
	}

	if ( $line eq 'F. Anhang V: Beschilderungsplan' ) {
		last;
	}

	if ( $line
		=~ m{ ^ (?<type> [EINR][CJN][ A-Z-]{6} | TGV \s*+ ) (?<number> [ 0-9]{2,12}) }x
	  )
	{
		$type = $+{type};
		$type =~ tr{ }{}d;
		$number = $+{number};
		$number =~ tr{ }{}d;

		if ( $number !~ m{ ^ \d+ $ }x ) {
			next;
		}

		$state = "train_no";

		#say "$type $number";
		$map{$number} = {
			raw => $type,
		};

		if ( $type eq 'ICE-A' ) {
			$map{$number}{type} = 'ICE 1/2/4';
		}
		elsif ( $type eq 'ICE-W' ) {
			$map{$number}{type}  = 'ICE 3';
			$map{$number}{short} = '3';
		}
		elsif ( $type eq 'ICE' ) {
			$map{$number}{type} = 'ICE 1/2/4';
		}
		elsif ( $type eq 'ICE-T' ) {
			$map{$number}{type}  = 'ICE T';
			$map{$number}{short} = 'T';
		}
		elsif ( $type eq 'IC-A' ) {
			$map{$number}{type}  = 'Metropolitan';
			$map{$number}{short} = 'M';
		}
		elsif ( $type eq 'IC-D' ) {
			$map{$number}{type}  = 'IC2';
			$map{$number}{short} = '2';
		}
		elsif ( $type eq 'IC-E' ) {
			$map{$number}{type}  = 'IC2 KISS';
			$map{$number}{short} = '2';
		}
		else {
			$map{$number}{type} = $type;
		}
	}
	elsif ( $line
		=~ m{ ^ (?<type> L[NP]F T? (-[A-Z])? ) (?<number> [ 0-9]{2,16}) }x )
	{

		# Probably not in passenger service ("Leehrfahrt")
		# FWIW, LPF(T) can be further distinguished:
		# LPFT-A: ICE BR 401/402
		# LPFT-B: ICE BR 412
		# LPFT-T: ICE BR 411 / 415 / 605
		# LPFT-W: ICE BR 403 / 406
		# LNF seems to be related to couchettes / sleeper cars.
		#$number = undef;
		#$state  = "";

		$type = $+{type};
		$type =~ tr{ }{}d;
		$number = $+{number};
		$number =~ tr{ }{}d;

		if ( $number !~ m{ ^ \d+ $ }x ) {
			next;
		}

		$state = "train_no";

		$map{$number} = {
			raw   => $type,
			type  => $type,
			empty => \1,
		};
	}

	elsif ( $state eq "train_no" and $line =~ m{ ^ \s{2,6} \S }x ) {
		$state = "route";
		$map{$number}{route} = "";
	}

	# ICE types can be further distinguished by rolling stock. For instance,
	# ICE 4 is the only ICE-A variant with restaurant and first-class seats
	# in the same wagon ("ARmz").
	elsif ( $number and $line =~ m{^\d*\s+ARmz} ) {
		if ( $map{$number}{type} eq 'ICE 1/2/4' ) {
			$map{$number}{type}  = 'ICE 4';
			$map{$number}{short} = '4';
		}
	}
	elsif ( $number and $line =~ m{^\d*\s+WRmz} ) {

		#say "ICE 1 / ICE 3 Redesign (WRmz)";
		if ( $map{$number}{type} eq 'ICE 1/2/4' ) {
			$map{$number}{type}  = 'ICE 1';
			$map{$number}{short} = '1';
		}
		elsif ( $map{$number}{type} eq 'ICE 3' ) {
			$map{$number}{type}  = 'ICE 3 Redesign';
			$map{$number}{short} = '3R';
		}
	}
	elsif ( $number and $line =~ m{^\d*\s+WRmbsz} ) {

		#say "ICE 2 (WRmbsz)";
		if ( $map{$number}{type} eq 'ICE 1/2/4' ) {
			$map{$number}{type}  = 'ICE 2';
			$map{$number}{short} = '2';
		}
	}
	elsif ( $number and $line =~ m{^\d*\s+Bpmbsz} ) {

		#say "ICE 3 Velaro";
		if ( $map{$number}{type} eq 'ICE 3' ) {
			$map{$number}{type}  = 'ICE 3 Velaro';
			$map{$number}{short} = '3V';
		}
	}
	elsif ( $number and $line =~ m{^\d*\s+[AB][pv]m} ) {
		if ( $map{$number}{type} eq 'IC2' or $map{$number}{type} eq 'IC2 KISS' )
		{
        # Zugbildungsplan go home, you're drunk. Wagons of IC2 double decker
        # trains must start with D (indicating that they double decker
        # wagons). If an IC2 train has other wagons, it isn't really an IC2
        # train. Observed on IC 2006 Stuttgart – Dortmund, which is planned as
        # IC-D, but actually uses IC1 wagons.
			$map{$number}{type} = 'IC';
			delete $map{$number}{short};
		}
	}

	if ( $state eq "route" and $line =~ m{ ^ \s{2,6} (?<route> .* ) $ }x ) {
		if ( length( $map{$number}{route} ) ) {
			$map{$number}{route} .= ' ';
		}
		$map{$number}{route} .= $+{route};
	}
	elsif ( $state eq "route" and $line eq "" ) {
		$state = "conditions";
	}
	elsif ( $state eq "conditions" and $line =~ m{ Hg | Tfz | BrH }x ) {
		$state = "details";
	}

	if (    $number
		and $line
		=~ m{ ^ \d* \s{1,10} (?<type> ( D? (WL)? (A|AB|B|W) R? D? [acdehimpuvw] \S* ) | 40[1-9]([.][0-9])? | 14[67]([.][0-9])? ) \s* (?<number> \d+ )? (?: $ | \s{3} (?<rest> .* ) $ ) }x
	  )
	{
		$state = "wagonorder";
		my $wagon_type   = $+{type};
		my $wagon_number = $+{number};
		my $rest         = $+{rest};

		push( @{ $wagon_map{$number} }, [ $wagon_type, $wagon_number ] );

		if ( $rest and $rest =~ m{\S} ) {
			push( @{ $map{$number}{wagonorder_notes} }, $line );
		}

		$map{$number}{has_wagon}{$wagon_type} = \1;
	}

	if ( $state eq "conditions" and length($line) and $line =~ m{ \S }x ) {
		push( @{ $map{$number}{conditions} }, parse_condition($line) );
	}

	if (    $state eq "details"
		and length($line)
		and $line =~ m{ Hg | Tfz | BrH }x )
	{
		push( @{ $map{$number}{details} }, $line );
	}
}

for my $train ( values %map ) {
	if ( not $train->{route} ) {
		next;
	}

	my $route = $train->{route};
	my @parts = split( qr{ - }, $route );

	if ( @parts < 2 ) {
		$train->{route} = { raw => $route };
		next;
	}
	my ( $pre_start, $start, @middle, $end, $post_end );

	if ( $parts[0] =~ m{ ^ [(] }x ) {
		$pre_start = shift @parts;
		$pre_start =~ s{ ^ [(] (.*) [)] $ }{$1}x;
	}

	if ( $parts[-1] =~ m{ ^ [(] }x ) {
		$post_end = pop @parts;
		$post_end =~ s{ ^ [(] (.*) [)] $ }{$1}x;
	}

	$start = shift @parts;
	$end   = pop @parts;

	$train->{route} = {
		preStart => $pre_start,
		start    => $start,
		middle   => scalar @parts ? [@parts] : undef,
		end      => $end,
		postEnd  => $post_end,
	};
}

for my $train ( values %map ) {
	if ( @{ $train->{wagonorder_notes} // [] } == 0 ) {
		next;
	}
	my $first_line  = $train->{wagonorder_notes}[0];
	my $from_offset = 0;
	my $to_offset   = 0;

	# International trains (e.g. EC 8) are super special and may have another
	# identifier after the "cycle to" train number.
	if ( $first_line
		=~ m{ ^ (?<lead> .*? [^0-9] ) (?<from> \d+ ) (?<middle> [A-Z ]+ ) (?<to> \d+ ) (?: \s+ \d+ )? $ }x
	  )
	{
		$from_offset = length( $+{lead} );
		$to_offset   = $from_offset + length( $+{from} ) + length( $+{middle} );
	}
	if ( not $from_offset ) {
		next;
	}
	for my $i_line ( @{ $train->{wagonorder_notes} } ) {

       # $i_line is an lvalue, so changes in $i_line end up in wagonorder_notes.
       # We don't want that.
		my $line = substr( $i_line, 0 );

		# Some cycles do not have a "from" entry, but they do have an odd
		# F-identifier, which confuses the cycle detection code. remove it.
		$line =~ s{F[0-9] [0-9]{3}}{.. ...};
		if ( length($line) <= $from_offset ) {
			next;
		}
		my $umlauf = substr( $line, $from_offset );
		if ( $umlauf =~ m{ ^ (\d+) }x ) {
			push( @{ $train->{cycle}{from} }, $1 );
		}
		if ( length($line) > $to_offset ) {
			$umlauf = substr( $line, $to_offset );
			if ( $umlauf =~ m{ ^ (\d+) }x ) {
				push( @{ $train->{cycle}{to} }, $1 );
			}
		}
	}
	if ( exists $train->{cycle}{from} ) {
		@{ $train->{cycle}{from} } = uniq @{ $train->{cycle}{from} };
		@{ $train->{cycle}{to} }   = uniq @{ $train->{cycle}{to} };
	}
}

#for my $train_number ( keys %map ) {
#	my $tr = $map{$train_number}{route} // q{};
#	$tr =~ s{ - .* - }{ - };
#	my $this = sprintf("\"%d %s\"", $train_number, $tr);
#	for my $from ( @{ $map{$train_number}{umlauf}{from} // [] } ) {
#		$tr = $map{$from}{route} // q{};
#		$tr =~ s{ - .* - }{ - };
#		$from = sprintf("\"%d %s\"", $from, $tr);
#		say "$from -> $this;";
#	}
#	for my $to ( @{ $map{$train_number}{umlauf}{to} // [] } ) {
#		$tr = $map{$to}{route} // q{};
#		$tr =~ s{ -.*- }{ - };
#		$to = sprintf("\"%d %s\"", $to, $tr);
#		say "$this -> $to;";
#	}
#}

#exit 0;

for my $train_number ( keys %map ) {
	my $wagon_numbers_ok = 1;
	my $wagon_types_ok   = 1;
	my %bincount;
	my %type_by_wagon_number;
	for my $wagon ( @{ $wagon_map{$train_number} // [] } ) {
		my ( $wagon_type, $wagon_number ) = @{$wagon};
		if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) {

			# Locomotives do not have a wagon number. That's OK.
			next;
		}
		elsif ( not $wagon_number ) {
			$wagon_numbers_ok = 0;
		}
		else {
			$bincount{$wagon_number}++;
			push( @{ $type_by_wagon_number{$wagon_number} }, $wagon_type );
		}
	}
	if ($wagon_numbers_ok) {
		for my $wagon ( @{ $wagon_map{$train_number} // [] } ) {
			my ( $wagon_type, $wagon_number ) = @{$wagon};

			push( @{ $map{$train_number}{wagons} }, $wagon );

			if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) {
				next;
			}

			if ( $bincount{$wagon_number} > 1 ) {
				if ( $type_by_wagon_number{$wagon_number}[0] ne
					$type_by_wagon_number{$wagon_number}[1] )
				{
					$wagon_types_ok = 0;
				}

				# TODO else -> wagon_list_ok = 0 ?
			}
		}
	}
	if ( not $wagon_types_ok ) {

		# train type may differ depending on date of week or similar. We don't
		# parse/handle that yet.
		delete $map{$train_number}{type};
		delete $map{$train_number}{short};
		delete $map{$train_number}{wagons};
		delete $map{$train_number}{has_wagon};
	}
}

# broken umlauf (indentation changes on page break)
delete $map{104}{cycle};
delete $map{77689}{cycle};
delete $map{78112}{cycle};

# indentation change between consecutive lines. wat.
delete $map{939}{cycle};

# use canonical output (i.e., sort hash keys) to allow for easy diffing.
say JSON->new->utf8->canonical->encode(
	{
		valid => {%valid},
		train => {%map}
	}
);