do not silently ignore duplicate train entries (wip)

author: Daniel Friesel <derf@finalrewind.org> 2021-01-11 21:23:31 +0100
committer: Daniel Friesel <derf@finalrewind.org> 2021-01-11 21:23:31 +0100
commit: aaee1881495d0e5bae0e4027b48d2d62d6d5c0f1 (patch)
tree: 700a5857594eb8cd6018185bd2eab879934f0567
parent: c6270546445c1c3954636a1c92d882d1b4b379bd (diff)
3 files changed, 255 insertions, 206 deletions
diff --git a/bin/db-zugbildung-to-json b/bin/db-zugbildung-to-json
index b1f804d..4721d31 100755
--- a/bin/db-zugbildung-to-json
+++ b/bin/db-zugbildung-to-json
@@ -247,7 +247,6 @@ my @lines = split( qr{\n}, $wr_txt );
 
 my ( $type, $number );
 my %map;
-my %wagon_map;
 
 my $state = "intro";
 
@@ -294,9 +293,24 @@ for my $line (@lines) {
 		$state = "train_no";
 
 		#say "$type $number";
-		$map{$number} = {
-			rawType => $type,
-		};
+
+		if ( exists $map{$number} ) {
+
+			# train numbers are not unique (they may have several wagon
+			# orders with non-intersecting date ranges)
+			my @dup = @{ $map{$number}{dup} // [] };
+			delete $map{$number}{dup};
+			push( @dup, $map{$number} );
+			$map{$number} = {
+				rawType => $type,
+				dup     => [@dup],
+			};
+		}
+		else {
+			$map{$number} = {
+				rawType => $type,
+			};
+		}
 
 		if ($name) {
 			$map{$number}{name} = $name;
@@ -442,7 +456,7 @@ for my $line (@lines) {
 		my $wagon_number = $+{number};
 		my $rest         = $+{rest};
 
-		push( @{ $wagon_map{$number} }, [ $wagon_type, $wagon_number ] );
+		push( @{ $map{$number}{raw_wagons} }, [ $wagon_type, $wagon_number ] );
 
 		if ( $rest and $rest =~ m{\S} ) {
 			push( @{ $map{$number}{wagonorder_notes} }, $line );
@@ -463,237 +477,254 @@ for my $line (@lines) {
 	}
 }
 
-for my $train ( values %map ) {
-	if ( not $train->{route} ) {
-		next;
-	}
+for my $train_no ( keys %map ) {
+	my @entries = @{ $map{$train_no}{dup} // [] };
+	delete $map{$train_no}{dup};
+	push( @entries, $map{$train_no} );
+	$map{$train_no} = [@entries];
+}
+
+for my $trains ( values %map ) {
+	for my $train ( @{$trains} ) {
+		if ( not $train->{route} ) {
+			next;
+		}
 
-	my $route = $train->{route};
-	my @parts = split( qr{ - }, $route );
+		my $route = $train->{route};
+		my @parts = split( qr{ - }, $route );
 
-	if ( @parts < 2 ) {
-		$train->{route} = { raw => $route };
-		next;
-	}
-	my ( $pre_start, $start, @middle, $end, $post_end );
+		if ( @parts < 2 ) {
+			$train->{route} = { raw => $route };
+			next;
+		}
+		my ( $pre_start, $start, @middle, $end, $post_end );
 
-	if ( $parts[0] =~ m{ ^ [(] }x ) {
-		$pre_start = shift @parts;
-		$pre_start =~ s{ ^ [(] (.*) [)] $ }{$1}x;
-	}
+		if ( $parts[0] =~ m{ ^ [(] }x ) {
+			$pre_start = shift @parts;
+			$pre_start =~ s{ ^ [(] (.*) [)] $ }{$1}x;
+		}
 
-	if ( $parts[-1] =~ m{ ^ [(] }x ) {
-		$post_end = pop @parts;
-		$post_end =~ s{ ^ [(] (.*) [)] $ }{$1}x;
-	}
+		if ( $parts[-1] =~ m{ ^ [(] }x ) {
+			$post_end = pop @parts;
+			$post_end =~ s{ ^ [(] (.*) [)] $ }{$1}x;
+		}
 
-	$start = shift @parts;
-	$end   = pop @parts;
+		$start = shift @parts;
+		$end   = pop @parts;
 
-	$train->{route} = {
-		preStart => $pre_start,
-		start    => $start,
-		middle   => scalar @parts ? [@parts] : undef,
-		end      => $end,
-		postEnd  => $post_end,
-	};
+		$train->{route} = {
+			preStart => $pre_start,
+			start    => $start,
+			middle   => scalar @parts ? [@parts] : undef,
+			end      => $end,
+			postEnd  => $post_end,
+		};
 
-	for my $k ( keys %{ $train->{route} } ) {
-		if ( not defined $train->{route}{$k} ) {
+		for my $k ( keys %{ $train->{route} } ) {
+			if ( not defined $train->{route}{$k} ) {
 
-			# avoid null values (leave out the property instead)
-			delete $train->{route}{$k};
+				# avoid null values (leave out the property instead)
+				delete $train->{route}{$k};
+			}
 		}
 	}
 }
 
-for my $train ( values %map ) {
-	if ( @{ $train->{wagonorder_notes} // [] } == 0 ) {
-		next;
-	}
-	my $first_line  = $train->{wagonorder_notes}[0];
-	my $from_offset = 0;
-	my $to_offset   = 0;
+for my $trains ( values %map ) {
+	for my $train ( @{$trains} ) {
+		if ( @{ $train->{wagonorder_notes} // [] } == 0 ) {
+			next;
+		}
+		my $first_line  = $train->{wagonorder_notes}[0];
+		my $from_offset = 0;
+		my $to_offset   = 0;
 
-	# International trains (e.g. EC 8) are super special and may have another
-	# identifier after the "cycle to" train number.
-	if ( $first_line
-		=~ m{ ^ (?<lead> .*? [^0-9] ) (?<from> \d+ ) (?<middle> [A-Z ]+ ) (?<to> \d+ ) (?: \s+ \d+ )? $ }x
-	  )
-	{
-		$from_offset = length( $+{lead} );
-		$to_offset   = $from_offset + length( $+{from} ) + length( $+{middle} );
-	}
-	if ( not $from_offset ) {
-		next;
-	}
-	my $cycle_id = "unknown";
-	for my $i_line ( @{ $train->{wagonorder_notes} } ) {
+       # International trains (e.g. EC 8) are super special and may have another
+       # identifier after the "cycle to" train number.
+		if ( $first_line
+			=~ m{ ^ (?<lead> .*? [^0-9] ) (?<from> \d+ ) (?<middle> [A-Z ]+ ) (?<to> \d+ ) (?: \s+ \d+ )? $ }x
+		  )
+		{
+			$from_offset = length( $+{lead} );
+			$to_offset
+			  = $from_offset + length( $+{from} ) + length( $+{middle} );
+		}
+		if ( not $from_offset ) {
+			next;
+		}
+		my $cycle_id = "unknown";
+		for my $i_line ( @{ $train->{wagonorder_notes} } ) {
 
        # $i_line is an lvalue, so changes in $i_line end up in wagonorder_notes.
        # We don't want that.
-		my $line = substr( $i_line, 0 );
+			my $line = substr( $i_line, 0 );
 
-		# Fx abc appears to identify cycle groups.
-		# It also confuses the cycle detection code for cycles which do not have
-		# a "from" entry.
-		if ( $line =~ s{(F[0-9] [0-9]{3})}{.. ...} ) {
-			$cycle_id = $1;
-		}
-		if ( length($line) <= $from_offset ) {
-			next;
-		}
-		my $umlauf = substr( $line, $from_offset );
-		if ( $umlauf =~ m{ ^ (\d+) }x ) {
-			push( @{ $train->{cycle}{$cycle_id}{from} }, $1 );
-		}
-		if ( length($line) > $to_offset ) {
-			$umlauf = substr( $line, $to_offset );
+        # Fx abc appears to identify cycle groups.
+        # It also confuses the cycle detection code for cycles which do not have
+        # a "from" entry.
+			if ( $line =~ s{(F[0-9] [0-9]{3})}{.. ...} ) {
+				$cycle_id = $1;
+			}
+			if ( length($line) <= $from_offset ) {
+				next;
+			}
+			my $umlauf = substr( $line, $from_offset );
 			if ( $umlauf =~ m{ ^ (\d+) }x ) {
-				push( @{ $train->{cycle}{$cycle_id}{to} }, $1 );
+				push( @{ $train->{cycle}{$cycle_id}{from} }, $1 );
+			}
+			if ( length($line) > $to_offset ) {
+				$umlauf = substr( $line, $to_offset );
+				if ( $umlauf =~ m{ ^ (\d+) }x ) {
+					push( @{ $train->{cycle}{$cycle_id}{to} }, $1 );
+				}
 			}
 		}
-	}
-	for my $cycle ( values %{ $train->{cycle} // {} } ) {
-		@{ $cycle->{from} } = uniq @{ $cycle->{from} // [] };
-		@{ $cycle->{to} }   = uniq @{ $cycle->{to}   // [] };
+		for my $cycle ( values %{ $train->{cycle} // {} } ) {
+			@{ $cycle->{from} } = uniq @{ $cycle->{from} // [] };
+			@{ $cycle->{to} }   = uniq @{ $cycle->{to}   // [] };
+		}
 	}
 }
 
-for my $train ( values %map ) {
+for my $trains ( values %map ) {
+	for my $train ( @{$trains} ) {
 
-	if ( not $train->{details} ) {
-		next;
-	}
+		if ( not $train->{details} ) {
+			next;
+		}
 
-	my @details = @{ $train->{details} // [] };
-	my %common_setup;
-	my @setups;
+		my @details = @{ $train->{details} // [] };
+		my %common_setup;
+		my @setups;
 
-	for my $line (@details) {
-		my %setup;
-		if ( $line =~ m{ Tfz1:(\d+) }x ) {
-			$setup{series} = $1;
-		}
-		if ( $line =~ m{ Tfz2:(\d+) }x ) {
-			$setup{series2} = $1;
-		}
-		if ( $line =~ m{ Hg(\d+) }x ) {
-			$setup{vmax} = 0 + $1;
-		}
-		if ( $line =~ m{ BrH(\d+) }x ) {
-			$setup{brakingPercentage} = 0 + $1;
-		}
-		if ( $line =~ m{ (\d+)m }x ) {
-			$setup{length} = 0 + $1;
-		}
-		if ( %setup and $line =~ m{ ^ \s* ([+]? [A-Z ]{1,7}) }x ) {
-			my $station = $1;
-			$station =~ s{\s+$}{};
-			my $special = ( $station =~ s{^[+]}{} );
-			if ( $ds100_to_name{$station} ) {
-				$station = $ds100_to_name{$station};
+		for my $line (@details) {
+			my %setup;
+			if ( $line =~ m{ Tfz1:(\d+) }x ) {
+				$setup{series} = $1;
+			}
+			if ( $line =~ m{ Tfz2:(\d+) }x ) {
+				$setup{series2} = $1;
+			}
+			if ( $line =~ m{ Hg(\d+) }x ) {
+				$setup{vmax} = 0 + $1;
 			}
-			if ($special) {
-				$station = "+${station}";
+			if ( $line =~ m{ BrH(\d+) }x ) {
+				$setup{brakingPercentage} = 0 + $1;
 			}
-			$setup{station} = $station;
+			if ( $line =~ m{ (\d+)m }x ) {
+				$setup{length} = 0 + $1;
+			}
+			if ( %setup and $line =~ m{ ^ \s* ([+]? [A-Z ]{1,7}) }x ) {
+				my $station = $1;
+				$station =~ s{\s+$}{};
+				my $special = ( $station =~ s{^[+]}{} );
+				if ( $ds100_to_name{$station} ) {
+					$station = $ds100_to_name{$station};
+				}
+				if ($special) {
+					$station = "+${station}";
+				}
+				$setup{station} = $station;
+			}
+			push( @setups, {%setup} );
 		}
-		push( @setups, {%setup} );
-	}
 
-	for my $key (qw(brakingPercentage length series series2 vmax)) {
-		if ( ( my @uniq = uniq grep { $_ } map { $_->{$key} } @setups ) == 1 ) {
-			$common_setup{$key} = $key =~ m{series} ? $uniq[0] : 0 + $uniq[0];
+		for my $key (qw(brakingPercentage length series series2 vmax)) {
+			if ( ( my @uniq = uniq grep { $_ } map { $_->{$key} } @setups )
+				== 1 )
+			{
+				$common_setup{$key}
+				  = $key =~ m{series} ? $uniq[0] : 0 + $uniq[0];
+			}
 		}
-	}
 
-	delete $train->{details};
-	if (%common_setup) {
-		$train->{commonAttr} = {%common_setup};
+		delete $train->{details};
+		if (%common_setup) {
+			$train->{commonAttr} = {%common_setup};
+		}
+		$train->{attrVariants} = [@setups];
 	}
-	$train->{attrVariants} = [@setups];
 }
 
-for my $train_number ( keys %map ) {
-	my $wagon_numbers_ok = 1;
-	my $wagon_types_ok   = 1;
-	my $wagon_list_ok    = 1;
-	my %bincount;
-	my %type_by_wagon_number;
-	for my $wagon ( @{ $wagon_map{$train_number} // [] } ) {
-		my ( $wagon_type, $wagon_number ) = @{$wagon};
-		if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) {
-
-			# Locomotives do not have a wagon number. That's OK.
-			next;
-		}
-		elsif ( not $wagon_number ) {
-			$wagon_numbers_ok = 0;
-		}
-		else {
-			$bincount{$wagon_number}++;
-			push( @{ $type_by_wagon_number{$wagon_number} }, $wagon_type );
-		}
-	}
-	if ($wagon_numbers_ok) {
-		for my $wagon ( @{ $wagon_map{$train_number} // [] } ) {
+for my $trains ( values %map ) {
+	for my $train ( @{$trains} ) {
+		my $wagon_numbers_ok = 1;
+		my $wagon_types_ok   = 1;
+		my $wagon_list_ok    = 1;
+		my %bincount;
+		my %type_by_wagon_number;
+		for my $wagon ( @{ $train->{raw_wagons} // [] } ) {
 			my ( $wagon_type, $wagon_number ) = @{$wagon};
+			if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) {
 
-			my $json_wagon = { type => $wagon_type };
-			if ($wagon_number) {
-				$json_wagon->{number} = 0 + $wagon_number;
+				# Locomotives do not have a wagon number. That's OK.
+				next;
+			}
+			elsif ( not $wagon_number ) {
+				$wagon_numbers_ok = 0;
 			}
+			else {
+				$bincount{$wagon_number}++;
+				push( @{ $type_by_wagon_number{$wagon_number} }, $wagon_type );
+			}
+		}
+		if ($wagon_numbers_ok) {
+			for my $wagon ( @{ $train->{raw_wagons} // [] } ) {
+				my ( $wagon_type, $wagon_number ) = @{$wagon};
 
-			push( @{ $map{$train_number}{wagons} }, $json_wagon );
+				my $json_wagon = { type => $wagon_type };
+				if ($wagon_number) {
+					$json_wagon->{number} = 0 + $wagon_number;
+				}
 
-			if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) {
-				next;
-			}
+				push( @{ $train->{wagons} }, $json_wagon );
+
+				if ( $wagon_type =~ m{ 40[1-9] | 14[67] }x ) {
+					next;
+				}
 
-			if ( $bincount{$wagon_number} > 1 ) {
-				$wagon_list_ok = 0;
-				if ( $type_by_wagon_number{$wagon_number}[0] ne
-					$type_by_wagon_number{$wagon_number}[1] )
-				{
-					$wagon_types_ok = 0;
+				if ( $bincount{$wagon_number} > 1 ) {
+					$wagon_list_ok = 0;
+					if ( $type_by_wagon_number{$wagon_number}[0] ne
+						$type_by_wagon_number{$wagon_number}[1] )
+					{
+						$wagon_types_ok = 0;
+					}
 				}
 			}
 		}
-	}
-	if ( not $wagon_types_ok ) {
+		if ( not $wagon_types_ok ) {
 
-		# train type may differ depending on date of week or similar. We don't
-		# parse/handle that yet.
-		$map{$train_number}{type} = $map{$train_number}{rawType};
-		delete $map{$train_number}{shortType};
-		delete $map{$train_number}{wagons};
-		delete $map{$train_number}{hasWagon};
-	}
-	elsif ( not $wagon_list_ok ) {
+          # train type may differ depending on date of week or similar. We don't
+          # parse/handle that yet.
+			$train->{type} = $train->{rawType};
+			delete $train->{shortType};
+			delete $train->{wagons};
+			delete $train->{hasWagon};
+		}
+		elsif ( not $wagon_list_ok ) {
 
-		# train type appears to be correct, but the list contains duplicates
-		# (e.g. due to unhandled schedule variants)
-		delete $map{$train_number}{wagons};
+			# train type appears to be correct, but the list contains duplicates
+			# (e.g. due to unhandled schedule variants)
+			delete $train->{wagons};
+		}
 	}
 }
 
 # Cleanup
 
-for my $train ( values %map ) {
-	delete $train->{wagonorder_notes};
+for my $trains ( values %map ) {
+	for my $train ( @{$trains} ) {
+		delete $train->{raw_wagons};
+		delete $train->{wagonorder_notes};
+	}
 }
 
-# broken umlauf (indentation changes on page break)
-delete $map{104}{cycle};
-delete $map{1588}{cycle};
-delete $map{1700}{cycle};
-delete $map{77689}{cycle};
-delete $map{78112}{cycle};
-
-# indentation change between consecutive lines. wat.
-delete $map{939}{cycle};
-delete $map{2310}{cycle};
+for my $bork (qw(104 1588 1700 77689 78112 939 2310)) {
+	for my $train ( @{ $map{$bork} } ) {
+		delete $train->{cycle};
+	}
+}
 
 # use canonical output (i.e., sort hash keys) to allow for easy diffing.
 say JSON->new->utf8->canonical->encode(
diff --git a/bin/umlauf-to-dot b/bin/umlauf-to-dot
index 1ac2c2e..860c962 100755
--- a/bin/umlauf-to-dot
+++ b/bin/umlauf-to-dot
@@ -67,9 +67,11 @@ sub build_cycle {
 
 		my @candidates;
 
-		if ( my $c = $map->{$train_number}{cycle}{$cycle_id} ) {
-			push( @candidates, @{ $c->{from} // [] } );
-			push( @candidates, @{ $c->{to}   // [] } );
+		for my $train ( @{ $map->{$train_number} } ) {
+			if ( my $c = $train->{cycle}{$cycle_id} ) {
+				push( @candidates, @{ $c->{from} // [] } );
+				push( @candidates, @{ $c->{to}   // [] } );
+			}
 		}
 
 		@candidates = uniq @candidates;
@@ -87,29 +89,39 @@ sub build_cycle {
 	my @output;
 
 	for my $train_number (@train_numbers) {
-		if ( my $c = $map->{$train_number}{cycle}{$cycle_id} ) {
-			for my $from ( @{ $c->{from} // [] } ) {
-				push(
-					@output,
-					sprintf( "%s -> %s;",
-						format_train( $from,         $map->{$from} ),
-						format_train( $train_number, $map->{$train_number} ) )
-				);
-			}
-			for my $to ( @{ $c->{to} // [] } ) {
-				push(
-					@output,
-					sprintf( "%s -> %s;",
-						format_train( $train_number, $map->{$train_number} ),
-						format_train( $to,           $map->{$to} ) )
-				);
+		for my $train ( @{ $map->{$train_number} } ) {
+			if ( my $c = $train->{cycle}{$cycle_id} ) {
+				for my $from ( @{ $c->{from} // [] } ) {
+					push(
+						@output,
+						sprintf(
+							"%s -> %s;",
+							format_train( $from, $map->{$from}[0] ),
+							format_train(
+								$train_number, $map->{$train_number}[0]
+							)
+						)
+					);
+				}
+				for my $to ( @{ $c->{to} // [] } ) {
+					push(
+						@output,
+						sprintf(
+							"%s -> %s;",
+							format_train(
+								$train_number, $map->{$train_number}[0]
+							),
+							format_train( $to, $map->{$to}[0] )
+						)
+					);
+				}
 			}
 		}
 		if ( $train_number != $line ) {
 			push(
 				@output,
 				sprintf( "%s [shape=box];",
-					format_train( $train_number, $map->{$train_number} ) )
+					format_train( $train_number, $map->{$train_number}[0] ) )
 			);
 		}
 	}
@@ -117,7 +129,11 @@ sub build_cycle {
 	return @output;
 }
 
-my @cycle_ids = keys %{ $map->{$line}{cycle} // {} };
+my @cycle_ids;
+
+for my $train ( @{ $map->{$line} } ) {
+	push( @cycle_ids, keys %{ $train->{cycle} // {} } );
+}
 
 say "digraph Umlauf {";
 
@@ -125,6 +141,6 @@ for my $cycle_id (@cycle_ids) {
 	say join( "\n", uniq build_cycle( $line, $cycle_id ) );
 }
 
-printf( "%s [style=bold];\n", format_train( $line, $map->{$line} ) );
+printf( "%s [style=bold];\n", format_train( $line, $map->{$line}[0] ) );
 
 say "}"
diff --git a/schema.yml b/schema.yml
index 60f5b15..0cbc85d 100644
--- a/schema.yml
+++ b/schema.yml
@@ -40,7 +40,9 @@ components:
           type: object
           description: dict mapping train numbers to train objects
           additionalProperties:
-            $ref: '#/components/schemas/train'
+            type: array
+            items:
+              $ref: '#/components/schemas/train'
     train:
       type: object
       properties:
author	Daniel Friesel <derf@finalrewind.org>	2021-01-11 21:23:31 +0100
committer	Daniel Friesel <derf@finalrewind.org>	2021-01-11 21:23:31 +0100
commit	aaee1881495d0e5bae0e4027b48d2d62d6d5c0f1 (patch)
tree	700a5857594eb8cd6018185bd2eab879934f0567
parent	c6270546445c1c3954636a1c92d882d1b4b379bd (diff)