#!/usr/bin/env perl
use strict;
use warnings;
use 5.010;

use WWW::Mechanize;

my $mech = WWW::Mechanize->new(
	stack_depth => 2,
);

my $uri      = shift || first_line('last_uri');
my $image_re = first_line('image_re');

# Exit status: 1 until at least one new image has been downloaded.
my $exit = 1;

if ( not defined $uri or not defined $image_re ) {
	die("last_uri or image_re not found / specified\n");
}

$image_re = qr{$image_re};

# Find the link leading to the next comic page. Prefers a link whose text is
# exactly "next" (case-insensitive), falling back to any link merely
# containing "next". If no such link exists we assume the end of the comic
# has been reached: save the current position and terminate.
sub find_next_link {
	foreach my $re ( qr{ ^ next $ }ix, qr{ next }ix, ) {
		my $link = $mech->find_link( text_regex => $re );
		if ($link) {
			return $link;
		}
	}
	save_lasturi();
	say "Cannot find next link. We might have reached the end of the comic.";
	exit $exit;
}

# Look for an image on the current page whose absolute URL matches
# $image_re and fetch it with a throwaway Mechanize object, so the main
# object's page history is left untouched. Returns the temporary Mechanize
# object holding the image data, or nothing if no image matched.
sub find_image {
	my $image = $mech->find_image( url_abs_regex => $image_re );
	if ($image) {
		my $tmpmech = WWW::Mechanize->new();
		$tmpmech->get( $image->url_abs );
		return $tmpmech;
	}
	return;
}

# Save the current page's comic image to the working directory, using the
# last path component of its URL as file name. Existing files are skipped;
# a successful download sets the exit status to 0.
sub get_image {
	my $tmpmech = find_image() or return;

	my $filename = ( split( qr{/}, $tmpmech->uri->as_string ) )[-1];

	if ( -e $filename ) {
		say "img: $filename (skipped)";
	}
	else {
		$exit = 0;
		say "img: $filename";
		open( my $fh, '>', $filename )
		  or die("Cannot open $filename: $!");

		# Image data is binary - keep PerlIO from mangling it.
		binmode($fh);
		print {$fh} $tmpmech->content();
		close($fh)
		  or die("Cannot close $filename: $!");
	}
	return;
}

# Return the first line (chomped) of the given file, or nothing (plus a
# warning) if the file cannot be opened.
sub first_line {
	my ($filename) = @_;
	my ( $line, $fh );

	if ( not open( $fh, '<', $filename ) ) {
		warn("Cannot open $filename: $!\n");
		return;
	}

	$line = <$fh>;
	close($fh)
	  or warn("Cannot close $filename: $!\n");

	# An empty file yields undef here; chomping undef would warn.
	if ( defined $line ) {
		chomp $line;
	}
	return $line;
}

# Persist the URI of the previous page to the last_uri file so a later
# invocation can resume crawling from that point.
sub save_lasturi {

	# Some webcomics have a non-regular page for the last (as in, latest)
	# image. Work around this.
	$mech->back();

	open( my $fh, '>', 'last_uri' )
	  or die("Cannot open last_uri: $!");
	print {$fh} $mech->uri->as_string;
	close($fh)
	  or die("Cannot close last_uri: $!");
	return;
}

# On Ctrl+C, remember where we were before exiting.
local $SIG{INT} = sub {
	save_lasturi();
	exit $exit;
};

while ( $mech->get($uri) and $mech->success() and $mech->status() == 200 ) {
	say "URI: $uri";
	get_image();

	$uri = find_next_link()->URI->abs->as_string;

	# A "next" link pointing at the page it is on means we are stuck.
	if ( $uri eq $mech->uri->as_string ) {
		save_lasturi();
		say "The 'next' link led us to a loop.";
		say "This is probably because we reached the end of the comic.";
		exit $exit;
	}
	print "\n";

	# Avoid accidently DoSing webservers.
	sleep(1);
}

__END__

=head1 NAME

B<comirror> - Generic webcomic mirrorer

=head1 SYNOPSIS

B<comirror> [I<URI>]

=head1 DESCRIPTION

B<comirror> "reads" a webcomic while saving the comic images to the current
working directory.

=head1 OPTIONS

B<comirror> takes no options.

=head1 EXIT STATUS

Zero if at least one new comic image was downloaded, one if either no images
were found or all found images already existed in the current directory.
Any other non-zero means indicates grave errors.

=head1 CONFIGURATION

B<comirror> is designed to operate in the current working directory. Images
are saved to it and a few files are read by B<comirror> as configuration
parameters.

=over

=item F<image_re>

A regular expression matching the URL of the webcomic image to be saved. You
can either create the file manually or let it be created by
comirror-setup(1).

=item F<last_uri>

The URI to the last but one comic site before B<comirror> exited is
automatically written to this file. If this file exists and B<comirror> is
called without arguments, it will automatically resume crawling the webcomic
from that point on.

=back

=head1 DEPENDENCIES

B<comirror> requires the perl module WWW::Mechanize

=head1 BUGS AND LIMITATIONS

This script has no brain. It has very limited knowledge about the usual
layout of a webcomic and makes a few guesses which happen to work in a lot
of cases. However, there may well be webcomics which (combined with a
unrestrictive image_re) lead B<comirror> to crawling lots of non-comic
images.
So of course, use at your own risk.

=head1 SEE ALSO

comirror-setup(1)

=head1 AUTHOR

Copyright (C) 2010 by Daniel Friesel E<lt>derf@chaosdorf.deE<gt>

=head1 LICENSE

0. You just DO WHAT THE FUCK YOU WANT TO.