diff options
| author | Daniel Friesel <derf@derf.homelinux.org> | 2010-06-05 17:56:21 +0200 | 
|---|---|---|
| committer | Daniel Friesel <derf@derf.homelinux.org> | 2010-06-05 17:56:21 +0200 | 
| commit | b71529f3b7fed44b337027a623252db346acd1a0 (patch) | |
| tree | 637a99ce1418d7b9ac738b8f2ade3ab5de51d83b | |
| parent | 6328b9cf2477049b377329f1059ef31111d054b0 (diff) | |
Use HTML::TreeBuilder::XPath for parsing. Todo: Lots of code cleanup.
| -rw-r--r-- | Changelog | 4 | ||||
| -rwxr-xr-x | bin/efa | 264 | 
2 files changed, 80 insertions, 188 deletions
| @@ -1,3 +1,7 @@ +git HEAD + +    * Rewrite efa parser using HTML::TreeBuilder::XPath +  efa 1.1.2 - Wed May 12 2010      * Fix -v @@ -6,8 +6,10 @@ use strict;  use warnings;  use encoding 'utf8';  use 5.010; +  use Encode;  use Getopt::Long qw/:config no_ignore_case/; +use HTML::TreeBuilder::XPath;  use WWW::Mechanize;  my $firsturl = 'http://efa.vrr.de/vrr/XSLT_TRIP_REQUEST2?language=de&itdLPxx_transpCompany=vrr'; @@ -19,7 +21,6 @@ my %post;  my $www = WWW::Mechanize->new(  	autocheck => 1,  ); -my $raw;  my (@from, @to, @via);  my ($from_type, $to_type, $via_type) = ('stop') x 3;  my ($time, $time_depart, $time_arrive); @@ -31,183 +32,16 @@ my $prefer;  my $proximity;  my $walk_speed;  my $with_bike; -my $debug = 0;  my $timeout = 60;  my $ignore_info = 'Fahrradmitnahme';  my ($test_dump, $test_parse); -sub check_ambiguous { -	my $html = shift; -	my $choose_re = qr{ -		<span \s class="errorTextBold"> -		Bitte \s auswählen -		</span> -	}x; -	my $select_re = qr{ -		<select \s name=" -		(?<what> -			( place | type | name ) -			_ -			( origin | destination ) -		) " -	}x; -	my $option_re = qr{ -		<option \s value=" \d+ ( : \d+ )* " -		( \s selected )? > -		(?<choice> [^<]+ ) -		</option> -	}x; - -	if ($html =~ /$choose_re/s) { -		foreach (split(/$choose_re/s, $html)) { -			if (/$select_re/) { -				print "Ambiguous input for $+{what}\n"; -			} -			while (/$option_re/gs) { -				print "\t$+{choice}\n"; -			} -		} -		return 1; -	} -	return 0; -} - -sub parse_content { -	my $input = shift; -	my $groupsize = 8; -	my $return; -	my $time_re = qr{ \d+ : \d+ }x; -	my $ext_time_re = qr{ -		^ ( -			$time_re -			| -			ab \s -			| -		) $ -	}x; -	my $anschluss_re = qr{ -		^ ( -			Fußweg -			| -			Anschluss \s wird .* abgewartet -		) -	}x; - -	for my $offer (0 .. $#{$input}) { -		foreach (@{$input->[$offer]}) { -			s/\s* <br> \s*/, /gx; -			s/< [^>]+ >//gx; -		} - -		for (my $i = 0; @{$input->[$offer]} >= (($i+1) * $groupsize) - 1; $i++) { -			my $offset = $i * $groupsize; -			my @extra; -			if ( -			     $input->[$offer]->[$offset+2] =~ $anschluss_re -			  or $input->[$offer]->[$offset+3] =~ / ^ Fußweg /x -			) { -				# These are generic and usually lack both the time and the last element -				if ($input->[$offer]->[$offset  ] !~ $time_re) {splice(@{$input->[$offer]}, $offset  , 0, '')} -				if ($input->[$offer]->[$offset+4] !~ $time_re) {splice(@{$input->[$offer]}, $offset+4, 0, '')} -				splice(@{$input->[$offer]}, $offset+7, 0, ''); -			} - -			for my $j (0, 4, 8) { -				while ( -					exists $input->[$offer]->[$offset+$j] -					and $input->[$offer]->[$offset+$j] !~ $ext_time_re -					and $input->[$offer]->[$offset+$j] ne 'Verspätungen sind berücksichtigt' -				) { -					if ($input->[$offer]->[$offset+$j] =~ /^ \s* $/x) { -						splice(@{$input->[$offer]}, $offset+$j, 1); -					} -					else { -						push(@extra, splice(@{$input->[$offer]}, $offset+$j, 1)); -					} -				} -			} - -			$return->[$offer]->[$i] = { -				deptime  => $input->[$offer]->[$offset], -				dep      => $input->[$offer]->[$offset+1], -				depstop  => $input->[$offer]->[$offset+2], -				deptrain => $input->[$offer]->[$offset+3], -				depdest  => $input->[$offer]->[$offset+7], -				arrtime  => $input->[$offer]->[$offset+4], -				arr      => $input->[$offer]->[$offset+5], -				arrstop  => $input->[$offer]->[$offset+6], -			}; -			@{$return->[$offer]->[$i]->{extra}} = @extra; -		} -	} -	return $return; -} - -sub prepare_content { -	my $html = shift; -	my $offer = 0; -	my $return; -	my $split_re = qr{ -		<span \s class="labelTextBold"> -		\s \d+ \. \s Fahrt -		</span> -	}x; -	my $content_re = qr{ -		<span \s class="labelText" ( \s valign="center" )? > -		(?<content> .+ ) -		</span> </td> -	}x; - -	foreach my $chunk (split($split_re, $html)) { -		if ($offer == 0) { -			$offer++; -			next; -		} -		foreach my $line (split(/\n/, $chunk)) { -			if ($line =~ $content_re) { -				push(@{$return->[$offer-1]}, $+{content}); -			} -		} -		$offer++; -	} -	return $return; -} - -sub show_content { -	my $connections = shift; -	my $first = 0; - -	foreach my $connection (@{$connections}) { -		if ($first) { -			print "------\n\n"; -		} -		else { -			$first = 1; -		} - -		foreach my $part (@{$connection}) { -			foreach (@{$part->{extra}}) { -				if (not (length($ignore_info) and $_ =~ /$ignore_info/i)) { -					print "# $_\n"; -				} -			} - -			printf( -				"%-5s %-2s %-30s %-20s %s\n%-5s %-2s %-30s\n\n", -				$part->{deptime}, $part->{dep}, $part->{depstop}, $part->{deptrain}, -				$part->{depdest}, $part->{arrtime}, $part->{arr}, $part->{arrstop} -			); - -		} -	} -	return; -} +my $xp_ambiguous = '//select';  GetOptions(  	'a|arrive=s'     => \$time_arrive,  	'b|bike'         => \$with_bike,  	'd|date=s'       => \$date, -	'D|debug'        => \$debug,  	'depart=s'       => \$time_depart,  	'e|exclude=s'    => \@exclude,  	'from=s{2}'      => \@from, @@ -389,29 +223,88 @@ if ($test_dump) {  	exit 0  } -if (check_ambiguous($content)) { +my $tree = HTML::TreeBuilder::XPath->new_from_content($content); + +if ($tree->exists($xp_ambiguous)) { +	foreach my $select (@{$tree->findnodes($xp_ambiguous)}) { +		printf( +			"Ambiguous input: %s\n", +			$select->attr('name'), +		); +		foreach my $val ($select->findnodes_as_strings('./option')) { +			say "\t$val"; +		} +	}  	exit 1;  } -$raw = prepare_content($content); - -if ($debug) { -	print STDERR "custom post values used in query:\n"; -	foreach (keys(%post)) { -		print STDERR "\t$_ => $post{$_}\n"; +my @chunk; +my $con_part = 0; +my $no = 0; +my $connections; + +foreach my $row (@{$tree->findnodes('//table//table/tr')}) { +	foreach (@{$row->findnodes( +		'./td[@class="bgColor"] | '. +		'./td[@class="bgColor2"] | '. +		'./td[@colspan="8"]')}) +	{ +		if (defined $_->attr('colspan') and $_->attr('colspan') == 8) { +			if ($_->as_text() =~ / (?<no> \d+ ) \. .+ Fahrt /x) { +				$no = $+{'no'} - 1; +				$con_part = 0; +				next; +			} +		} +		if (defined $_->attr('class') and $_->attr('class') =~ /^bgColor2?$/) { +			if ($_->attr('class') eq 'bgColor' and ($con_part % 2) == 1) { +				$con_part++; +			} +			elsif ($_->attr('class') eq 'bgColor2' and ($con_part % 2) == 0) { +				$con_part++; +			} +		} +		if (not $_->exists('./img') and $_->as_text() !~ /^\s*$/) { +			push(@{$connections->[$no]->[$con_part]}, $_->as_text()); +		}  	} +} -	print STDERR "\nraw response:\n"; -	foreach (@{$raw}) { -		print STDERR "---\n"; -		foreach (@{$_}) { -			print STDERR "$_\n"; +if (@{$connections} == 0) { +	die("Got no connections, parse error?\n"); +} + +for my $i (0 .. $#{$connections}) { +	for my $j (0 .. $#{$connections->[$i]}) { + +		if ($connections->[$i]->[$j]->[0] !~ / \d{2} : \d{2} /ox) { +			splice(@{$connections->[$i]->[$j]}, 0, 0, q{}); +			splice(@{$connections->[$i]->[$j]}, 4, 0, q{}); +			$connections->[$i]->[$j]->[7] = q{}; +		} +		elsif ($connections->[$i]->[$j]->[4] =~ / Plan: \s ab /ox) { +			printf( +				"# %s\n", +				splice(@{$connections->[$i]->[$j]}, 4, 1), +			);  		} + +		foreach my $extra (splice(@{$connections->[$i]->[$j]}, 8, -1)) { +			if (not (length($ignore_info) and $extra =~ /$ignore_info/i)) { +				say "# $extra"; +			} +		} + +		printf( +			"%-5s %-2s %-30s %-20s %s\n%-5s %-2s %-30s\n\n", +			@{$connections->[$i]->[$j]}[0, 1, 2, 3, 7, 4, 5, 6], +		) +	} +	if ($i != $#{$connections}) { +		print "------\n\n";  	}  } -show_content(parse_content($raw)); -  __END__  =head1 NAME @@ -544,11 +437,6 @@ If I<regex> is not supplied, removes the default regex (-E<gt> nothing will be i  Set timeout for HTTP requests. Default: 60 seconds. -=item B<-D>|B<--debug> - -Display debug information (additional post requests sent to the site, -raw items received from the site) -  =item B<--post> I<key>=I<value>  Add I<key> with I<value> to the HTTP POST request sent to the EFA server. | 
