#!/usr/bin/env perl
# comirror - crawl a webcomic page by page, following its "next" link and
# saving every image whose URL matches the configured regular expression
# into the current working directory.
use strict;
use warnings;
use 5.010;

use WWW::Mechanize;

# A short page history is enough: save_state() only steps back one page.
my $mech = WWW::Mechanize->new(
    stack_depth => 2,
);

my %conf  = file_to_hash('comirror.conf');
my %state = file_to_hash('comirror.state');

# Start page: first command line argument, else the URI saved by a
# previous run.
my $uri      = shift(@ARGV) || $state{'uri'};
my $image_re = $conf{'image_re'};

# Exit status; stays 1 until get_image() has written at least one new file.
my $exit    = 1;
my $version = '0.0-git';

# Explicit string comparison instead of the deprecated smartmatch operator.
if (defined $uri and ($uri eq '-v' or $uri eq '--version')) {
    say "comirror version $version";
    exit 0;
}

if (not defined $uri or not defined $image_re) {
    die("last_uri or image_re not found / specified\n");
}

$image_re = qr{$image_re};

# find_next_link()
#
# Returns the link (as returned by $mech->find_link) leading to the next
# comic page. If 'next_link' is configured, only a link with exactly that
# text is accepted; otherwise a link whose text is "next" is preferred over
# one that merely contains "next" (both case-insensitive).
#
# Does not return if no link is found: the state is saved, a message is
# printed and the program exits with $exit.
sub find_next_link {
    if (defined $conf{'next_link'}) {
        my $link = $mech->find_link(text => $conf{'next_link'});

        if ($link) {
            return $link;
        }
    }
    else {
        foreach my $re (
            qr{ ^ next $ }ix,
            qr{ next }ix,
        ) {
            my $link = $mech->find_link(text_regex => $re);

            if ($link) {
                return $link;
            }
        }
    }

    save_state();
    say "Cannot find next link. We might have reached the end of the comic.";
    exit $exit;
}

# find_image()
#
# Looks on the current page for something whose absolute URL matches
# $image_re: first a link (only if 'link_to_image' is set to a true value),
# then an image. On a match, the target is fetched with a separate
# WWW::Mechanize object, so that the history of $mech is left alone, and
# that object is returned. Returns nothing if the page has no match.
sub find_image {
    my $image;

    if ($conf{'link_to_image'}) {
        $image = $mech->find_link(url_abs_regex => $image_re);
    }

    $image //= $mech->find_image(url_abs_regex => $image_re);

    if ($image) {
        my $tmpmech = WWW::Mechanize->new();
        $tmpmech->get($image->url_abs);
        return $tmpmech;
    }

    return;
}

# get_image()
#
# Saves the image of the current page (if any) to the current working
# directory, named after the last path segment of the URI it was fetched
# from. Files which already exist are left untouched. Sets $exit to 0 once
# a new file is written. Dies if the file cannot be opened or closed.
sub get_image {
    my $tmpmech = find_image() or return;

    my $filename = (split(qr{/}, $tmpmech->uri->as_string))[-1];

    if (-e $filename) {
        say "img: $filename (skipped)";
    }
    else {
        $exit = 0;
        say "img: $filename";

        open(my $fh, '>', $filename) or die("Cannot open $filename: $!");

        # Image data is binary; keep newline translation away from it.
        binmode($fh);
        print {$fh} $tmpmech->content();
        close($fh) or die("Cannot close $filename: $!");
    }

    return;
}

# file_to_hash($file)
#
# Reads "key<whitespace>value" lines from $file (decoded as UTF-8) and
# returns them as a key/value list. Lines not matching that form are
# skipped. Returns an empty list if $file does not exist; dies if it
# exists but cannot be opened.
sub file_to_hash {
    my ($file) = @_;
    my %return;

    if (not -e $file) {
        return;
    }

    open(my $fh, '<', $file) or die("Cannot read $file: $!\n");
    binmode($fh, ':utf8');

    while (my $line = <$fh>) {
        $line =~ / ^ (?<key> \S+ ) [[:space:]]+ (?<value> .*) $ /x or next;
        $return{$+{key}} = $+{value};
    }

    close($fh);

    return %return;
}

# save_state()
#
# Steps $mech back one page, records that page's URI as the 'uri' state and
# writes all of %state to comirror.state, one "key<TAB>value" pair per
# line. Dies if the file cannot be opened or closed.
sub save_state {

    # Some webcomics have a non-regular page for the last (as in, latest)
    # image. Work around this.
    $mech->back();
    $state{'uri'} = $mech->uri->as_string;

    open(my $fh, '>', 'comirror.state')
        or die("Cannot open comirror.state: $!");

    # file_to_hash() reads this file as UTF-8, so write it the same way.
    binmode($fh, ':utf8');

    while (my ($key, $value) = each(%state)) {
        print {$fh} "$key\t$value\n";
    }

    close($fh) or die("Cannot close comirror.state: $!");

    return;
}

# Save the current position when interrupted, so that the next run can
# resume the crawl.
local $SIG{INT} = sub {
    save_state();
    exit $exit;
};

while (
        $mech->get($uri)
    and $mech->success()
    and $mech->status() == 200
) {
    say "URI: $uri";

    get_image();
    $uri = find_next_link()->URI->abs->as_string;

    # A "next" link pointing at the page we are already on means there is
    # nowhere left to go.
    if ($uri eq $mech->uri->as_string) {
        save_state();
        say "The 'next' link lead us to a loop.";
        say "This is probably because we reached the end of the comic.";
        exit $exit;
    }

    print "\n";

    # Avoid accidentally DoSing webservers.
    sleep(1);
}

__END__

=head1 NAME

B<comirror> - Generic webcomic mirrorer

=head1 SYNOPSIS

B<comirror> [I<uri>]

=head1 DESCRIPTION

B<comirror> "reads" a webcomic while saving the comic images to the current
working directory.

=head1 OPTIONS

Apart from B<-v> / B<--version>, which prints the version and exits,
B<comirror> takes no options.

=head1 EXIT STATUS

Zero if at least one new comic image was downloaded, one if either no images
were found or all found images already existed in the current directory.

Any other non-zero return value indicates grave errors.

=head1 CONFIGURATION

B<comirror> is designed to operate in the current working directory. Images
are saved to it; the configuration is read from F<comirror.conf> and the last
state (if any) is read from F<comirror.state>.

Both files are formatted in the form

    key   value

with one key-value pair per line. Comments or empty lines are not supported.

F<comirror.state> is automatically written when B<comirror> terminates.

=head2 COMIRROR.CONF

A little explanation of the F<comirror.conf> keys. Note that
comirror-setup(1) will automatically create this file for you, you only need
to edit it if comirror-setup(1) didn't work properly or you don't want to use
it at all.

=over

=item image_re

A regular expression matching the URL of the webcomic image to be saved.

=item next_link

The text on the link to the next image. Can be left out if it contains "next".

=item link_to_image

If this is set to a true value, B<comirror> will first try to find a link
pointing to image_re and only if that fails look directly for images.
This is useful for comics with small-ish preview images which link to larger
ones.

The future of this option is unclear, such behaviour may become the default.

=back

=head2 COMIRROR.STATE

You should not need to edit this.

=over

=item uri

Absolute URI to the last but one comic page B<comirror> was inspecting. Can be
overridden by the commandline argument. Exists so that B<comirror> will resume
its comic crawl from the right point when it's started again.

=back

=head1 DEPENDENCIES

B<comirror> requires the perl module WWW::Mechanize.

=head1 BUGS AND LIMITATIONS

This script has no brain. It has very limited knowledge about the usual
layout of a webcomic and makes a few guesses which happen to work in a lot of
cases. However, there may well be webcomics which (combined with an
unrestrictive image_re) lead B<comirror> to crawling lots of non-comic images.

So of course, use at your own risk.

=head1 SEE ALSO

comirror-setup(1)

=head1 AUTHOR

Copyright (C) 2010 by Daniel Friesel E<lt>derf@chaosdorf.deE<gt>

=head1 LICENSE

    0. You just DO WHAT THE FUCK YOU WANT TO.