#!/usr/bin/env perl
use strict;
use warnings;
use 5.010;

use WWW::Mechanize;

my $mech = WWW::Mechanize->new(
	stack_depth => 2,
);

my $uri      = shift || first_line('last_uri');
my $image_re = first_line('image_re');

# Exit status: 1 until at least one new image has been downloaded.
my $exit = 1;

if ( not defined $uri or not defined $image_re ) {
	die("last_uri or image_re not found / specified\n");
}

$image_re = qr{$image_re};

# Find the link leading to the next comic page. Prefers a link whose text is
# exactly "next" (case-insensitive), falling back to any link merely
# containing "next". If no such link exists we assume the end of the comic
# has been reached: save the current position and terminate.
sub find_next_link {
	foreach my $re ( qr{ ^ next $ }ix, qr{ next }ix, ) {
		my $link = $mech->find_link( text_regex => $re );
		if ($link) {
			return $link;
		}
	}
	save_lasturi();
	say "Cannot find next link. We might have reached the end of the comic.";
	exit $exit;
}

# Look for an image on the current page whose absolute URL matches
# $image_re and fetch it with a throwaway Mechanize object, so the main
# object's page history is left untouched. Returns the temporary Mechanize
# object holding the image data, or nothing if no image matched.
sub find_image {
	my $image = $mech->find_image( url_abs_regex => $image_re );
	if ($image) {
		my $tmpmech = WWW::Mechanize->new();
		$tmpmech->get( $image->url_abs );
		return $tmpmech;
	}
	return;
}

# Save the current page's comic image to the working directory, using the
# last path component of its URL as file name. Existing files are skipped;
# a successful download sets the exit status to 0.
sub get_image {
	my $tmpmech = find_image() or return;

	my $filename = ( split( qr{/}, $tmpmech->uri->as_string ) )[-1];

	if ( -e $filename ) {
		say "img: $filename (skipped)";
	}
	else {
		$exit = 0;
		say "img: $filename";
		open( my $fh, '>', $filename )
		  or die("Cannot open $filename: $!");

		# Image data is binary - keep PerlIO from mangling it.
		binmode($fh);
		print {$fh} $tmpmech->content();
		close($fh)
		  or die("Cannot close $filename: $!");
	}
	return;
}

# Return the first line (chomped) of the given file, or nothing (plus a
# warning) if the file cannot be opened.
sub first_line {
	my ($filename) = @_;
	my ( $line, $fh );

	if ( not open( $fh, '<', $filename ) ) {
		warn("Cannot open $filename: $!\n");
		return;
	}

	$line = <$fh>;
	close($fh)
	  or warn("Cannot close $filename: $!\n");

	# An empty file yields undef here; chomping undef would warn.
	if ( defined $line ) {
		chomp $line;
	}
	return $line;
}

# Persist the URI of the previous page to the last_uri file so a later
# invocation can resume crawling from that point.
sub save_lasturi {

	# Some webcomics have a non-regular page for the last (as in, latest)
	# image. Work around this.
	$mech->back();

	open( my $fh, '>', 'last_uri' )
	  or die("Cannot open last_uri: $!");
	print {$fh} $mech->uri->as_string;
	close($fh)
	  or die("Cannot close last_uri: $!");
	return;
}

# On Ctrl+C, remember where we were before exiting.
local $SIG{INT} = sub {
	save_lasturi();
	exit $exit;
};

while ( $mech->get($uri) and $mech->success() and $mech->status() == 200 ) {
	say "URI: $uri";
	get_image();

	$uri = find_next_link()->URI->abs->as_string;

	# A "next" link pointing at the page it is on means we are stuck.
	if ( $uri eq $mech->uri->as_string ) {
		save_lasturi();
		say "The 'next' link led us to a loop.";
		say "This is probably because we reached the end of the comic.";
		exit $exit;
	}
	print "\n";

	# Avoid accidently DoSing webservers.
	sleep(1);
}

__END__

=head1 NAME

B<comirror> - Generic webcomic mirrorer

=head1 SYNOPSIS

B<comirror> [I<URI>]

=head1 DESCRIPTION

B<comirror> "reads" a webcomic while saving the comic images to the current
working directory.

=head1 OPTIONS

B<comirror> takes no options.

=head1 EXIT STATUS

Zero if at least one new comic image was downloaded, one if either no images
were found or all found images already existed in the current directory.
Any other non-zero means indicates grave errors.

=head1 CONFIGURATION

B<comirror> is designed to operate in the current working directory. Images
are saved to it and a few files are read by B<comirror> as configuration
parameters.

=over

=item F<image_re>

A regular expression matching the URL of the webcomic image to be saved. You
can either create the file manually or let it be created by
comirror-setup(1).

=item F<last_uri>

The URI to the last but one comic site before B<comirror> exited is
automatically written to this file. If this file exists and B<comirror> is
called without arguments, it will automatically resume crawling the webcomic
from that point on.

=back

=head1 DEPENDENCIES

B<comirror> requires the perl module WWW::Mechanize

=head1 BUGS AND LIMITATIONS

This script has no brain. It has very limited knowledge about the usual
layout of a webcomic and makes a few guesses which happen to work in a lot
of cases. However, there may well be webcomics which (combined with a
unrestrictive image_re) lead B<comirror> to crawling lots of non-comic
images.
So of course, use at your own risk.

=head1 SEE ALSO

comirror-setup(1)

=head1 AUTHOR

Copyright (C) 2010 by Daniel Friesel E<lt>derf@chaosdorf.deE<gt>

=head1 LICENSE

0. You just DO WHAT THE FUCK YOU WANT TO.