summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Friesel <derf@derf.homelinux.org>2010-06-05 17:56:21 +0200
committerDaniel Friesel <derf@derf.homelinux.org>2010-06-05 17:56:21 +0200
commitb71529f3b7fed44b337027a623252db346acd1a0 (patch)
tree637a99ce1418d7b9ac738b8f2ade3ab5de51d83b
parent6328b9cf2477049b377329f1059ef31111d054b0 (diff)
Use HTML::TreeBuilder::XPath for parsing. Todo: Lots of code cleanup.
-rw-r--r--Changelog4
-rwxr-xr-xbin/efa264
2 files changed, 80 insertions, 188 deletions
diff --git a/Changelog b/Changelog
index c6f6de0..8ea6c13 100644
--- a/Changelog
+++ b/Changelog
@@ -1,3 +1,7 @@
+git HEAD
+
+ * Rewrite efa parser using HTML::TreeBuilder::XPath
+
efa 1.1.2 - Wed May 12 2010
* Fix -v
diff --git a/bin/efa b/bin/efa
index 92893a5..c9cdb36 100755
--- a/bin/efa
+++ b/bin/efa
@@ -6,8 +6,10 @@ use strict;
use warnings;
use encoding 'utf8';
use 5.010;
+
use Encode;
use Getopt::Long qw/:config no_ignore_case/;
+use HTML::TreeBuilder::XPath;
use WWW::Mechanize;
my $firsturl = 'http://efa.vrr.de/vrr/XSLT_TRIP_REQUEST2?language=de&itdLPxx_transpCompany=vrr';
@@ -19,7 +21,6 @@ my %post;
my $www = WWW::Mechanize->new(
autocheck => 1,
);
-my $raw;
my (@from, @to, @via);
my ($from_type, $to_type, $via_type) = ('stop') x 3;
my ($time, $time_depart, $time_arrive);
@@ -31,183 +32,16 @@ my $prefer;
my $proximity;
my $walk_speed;
my $with_bike;
-my $debug = 0;
my $timeout = 60;
my $ignore_info = 'Fahrradmitnahme';
my ($test_dump, $test_parse);
-sub check_ambiguous {
- my $html = shift;
- my $choose_re = qr{
- <span \s class="errorTextBold">
- Bitte \s auswählen
- </span>
- }x;
- my $select_re = qr{
- <select \s name="
- (?<what>
- ( place | type | name )
- _
- ( origin | destination )
- ) "
- }x;
- my $option_re = qr{
- <option \s value=" \d+ ( : \d+ )* "
- ( \s selected )? >
- (?<choice> [^<]+ )
- </option>
- }x;
-
- if ($html =~ /$choose_re/s) {
- foreach (split(/$choose_re/s, $html)) {
- if (/$select_re/) {
- print "Ambiguous input for $+{what}\n";
- }
- while (/$option_re/gs) {
- print "\t$+{choice}\n";
- }
- }
- return 1;
- }
- return 0;
-}
-
-sub parse_content {
- my $input = shift;
- my $groupsize = 8;
- my $return;
- my $time_re = qr{ \d+ : \d+ }x;
- my $ext_time_re = qr{
- ^ (
- $time_re
- |
- ab \s
- |
- ) $
- }x;
- my $anschluss_re = qr{
- ^ (
- Fußweg
- |
- Anschluss \s wird .* abgewartet
- )
- }x;
-
- for my $offer (0 .. $#{$input}) {
- foreach (@{$input->[$offer]}) {
- s/\s* <br> \s*/, /gx;
- s/< [^>]+ >//gx;
- }
-
- for (my $i = 0; @{$input->[$offer]} >= (($i+1) * $groupsize) - 1; $i++) {
- my $offset = $i * $groupsize;
- my @extra;
- if (
- $input->[$offer]->[$offset+2] =~ $anschluss_re
- or $input->[$offer]->[$offset+3] =~ / ^ Fußweg /x
- ) {
- # These are generic and usually lack both the time and the last element
- if ($input->[$offer]->[$offset ] !~ $time_re) {splice(@{$input->[$offer]}, $offset , 0, '')}
- if ($input->[$offer]->[$offset+4] !~ $time_re) {splice(@{$input->[$offer]}, $offset+4, 0, '')}
- splice(@{$input->[$offer]}, $offset+7, 0, '');
- }
-
- for my $j (0, 4, 8) {
- while (
- exists $input->[$offer]->[$offset+$j]
- and $input->[$offer]->[$offset+$j] !~ $ext_time_re
- and $input->[$offer]->[$offset+$j] ne 'Verspätungen sind berücksichtigt'
- ) {
- if ($input->[$offer]->[$offset+$j] =~ /^ \s* $/x) {
- splice(@{$input->[$offer]}, $offset+$j, 1);
- }
- else {
- push(@extra, splice(@{$input->[$offer]}, $offset+$j, 1));
- }
- }
- }
-
- $return->[$offer]->[$i] = {
- deptime => $input->[$offer]->[$offset],
- dep => $input->[$offer]->[$offset+1],
- depstop => $input->[$offer]->[$offset+2],
- deptrain => $input->[$offer]->[$offset+3],
- depdest => $input->[$offer]->[$offset+7],
- arrtime => $input->[$offer]->[$offset+4],
- arr => $input->[$offer]->[$offset+5],
- arrstop => $input->[$offer]->[$offset+6],
- };
- @{$return->[$offer]->[$i]->{extra}} = @extra;
- }
- }
- return $return;
-}
-
-sub prepare_content {
- my $html = shift;
- my $offer = 0;
- my $return;
- my $split_re = qr{
- <span \s class="labelTextBold">
- \s \d+ \. \s Fahrt
- </span>
- }x;
- my $content_re = qr{
- <span \s class="labelText" ( \s valign="center" )? >
- (?<content> .+ )
- </span> </td>
- }x;
-
- foreach my $chunk (split($split_re, $html)) {
- if ($offer == 0) {
- $offer++;
- next;
- }
- foreach my $line (split(/\n/, $chunk)) {
- if ($line =~ $content_re) {
- push(@{$return->[$offer-1]}, $+{content});
- }
- }
- $offer++;
- }
- return $return;
-}
-
-sub show_content {
- my $connections = shift;
- my $first = 0;
-
- foreach my $connection (@{$connections}) {
- if ($first) {
- print "------\n\n";
- }
- else {
- $first = 1;
- }
-
- foreach my $part (@{$connection}) {
- foreach (@{$part->{extra}}) {
- if (not (length($ignore_info) and $_ =~ /$ignore_info/i)) {
- print "# $_\n";
- }
- }
-
- printf(
- "%-5s %-2s %-30s %-20s %s\n%-5s %-2s %-30s\n\n",
- $part->{deptime}, $part->{dep}, $part->{depstop}, $part->{deptrain},
- $part->{depdest}, $part->{arrtime}, $part->{arr}, $part->{arrstop}
- );
-
- }
- }
- return;
-}
+my $xp_ambiguous = '//select';
GetOptions(
'a|arrive=s' => \$time_arrive,
'b|bike' => \$with_bike,
'd|date=s' => \$date,
- 'D|debug' => \$debug,
'depart=s' => \$time_depart,
'e|exclude=s' => \@exclude,
'from=s{2}' => \@from,
@@ -389,29 +223,88 @@ if ($test_dump) {
exit 0
}
-if (check_ambiguous($content)) {
+my $tree = HTML::TreeBuilder::XPath->new_from_content($content);
+
+if ($tree->exists($xp_ambiguous)) {
+ foreach my $select (@{$tree->findnodes($xp_ambiguous)}) {
+ printf(
+ "Ambiguous input: %s\n",
+ $select->attr('name'),
+ );
+ foreach my $val ($select->findnodes_as_strings('./option')) {
+ say "\t$val";
+ }
+ }
exit 1;
}
-$raw = prepare_content($content);
-
-if ($debug) {
- print STDERR "custom post values used in query:\n";
- foreach (keys(%post)) {
- print STDERR "\t$_ => $post{$_}\n";
+my @chunk;
+my $con_part = 0;
+my $no = 0;
+my $connections;
+
+foreach my $row (@{$tree->findnodes('//table//table/tr')}) {
+ foreach (@{$row->findnodes(
+ './td[@class="bgColor"] | '.
+ './td[@class="bgColor2"] | '.
+ './td[@colspan="8"]')})
+ {
+ if (defined $_->attr('colspan') and $_->attr('colspan') == 8) {
+ if ($_->as_text() =~ / (?<no> \d+ ) \. .+ Fahrt /x) {
+ $no = $+{'no'} - 1;
+ $con_part = 0;
+ next;
+ }
+ }
+ if (defined $_->attr('class') and $_->attr('class') =~ /^bgColor2?$/) {
+ if ($_->attr('class') eq 'bgColor' and ($con_part % 2) == 1) {
+ $con_part++;
+ }
+ elsif ($_->attr('class') eq 'bgColor2' and ($con_part % 2) == 0) {
+ $con_part++;
+ }
+ }
+ if (not $_->exists('./img') and $_->as_text() !~ /^\s*$/) {
+ push(@{$connections->[$no]->[$con_part]}, $_->as_text());
+ }
}
+}
- print STDERR "\nraw response:\n";
- foreach (@{$raw}) {
- print STDERR "---\n";
- foreach (@{$_}) {
- print STDERR "$_\n";
+if (@{$connections} == 0) {
+ die("Got no connections, parse error?\n");
+}
+
+for my $i (0 .. $#{$connections}) {
+ for my $j (0 .. $#{$connections->[$i]}) {
+
+ if ($connections->[$i]->[$j]->[0] !~ / \d{2} : \d{2} /ox) {
+ splice(@{$connections->[$i]->[$j]}, 0, 0, q{});
+ splice(@{$connections->[$i]->[$j]}, 4, 0, q{});
+ $connections->[$i]->[$j]->[7] = q{};
+ }
+ elsif ($connections->[$i]->[$j]->[4] =~ / Plan: \s ab /ox) {
+ printf(
+ "# %s\n",
+ splice(@{$connections->[$i]->[$j]}, 4, 1),
+ );
}
+
+ foreach my $extra (splice(@{$connections->[$i]->[$j]}, 8, -1)) {
+ if (not (length($ignore_info) and $extra =~ /$ignore_info/i)) {
+ say "# $extra";
+ }
+ }
+
+ printf(
+ "%-5s %-2s %-30s %-20s %s\n%-5s %-2s %-30s\n\n",
+ @{$connections->[$i]->[$j]}[0, 1, 2, 3, 7, 4, 5, 6],
+ )
+ }
+ if ($i != $#{$connections}) {
+ print "------\n\n";
}
}
-show_content(parse_content($raw));
-
__END__
=head1 NAME
@@ -544,11 +437,6 @@ If I<regex> is not supplied, removes the default regex (-E<gt> nothing will be i
Set timeout for HTTP requests. Default: 60 seconds.
-=item B<-D>|B<--debug>
-
-Display debug information (additional post requests sent to the site,
-raw items received from the site)
-
=item B<--post> I<key>=I<value>
Add I<key> with I<value> to the HTTP POST request sent to the EFA server.