#!/usr/bin/env perl
use strict;
use warnings;
use 5.010;

use WWW::Mechanize;

my $mech = WWW::Mechanize->new(
	stack_depth => 2,
);
$mech->agent_alias('Linux Mozilla');

our $VERSION = '0.1';

my %conf     = file_to_hash('comirror.conf');
my %state    = file_to_hash('comirror.state');
my $uri      = shift || $state{'uri'};
my $image_re = $conf{'image_re'};
my $exit     = 1;

if (defined $uri and ($uri eq '-v' or $uri eq '--version')) {
	say "comirror version $VERSION";
	exit 0;
}

if (not defined $uri or not defined $image_re) {
	die("last_uri or image_re not found / specified\n");
}

$image_re = qr{$image_re};

sub find_next_link {
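	# Return the WWW::Mechanize::Link leading to the next comic page.
	# If next_link is configured, only an exact text match is accepted;
	# otherwise a few case-insensitive "next" heuristics are tried.
	# If no candidate is found, save the state and exit.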

	if (defined $conf{'next_link'}) {
		my $link = $mech->find_link(text => $conf{'next_link'});
		if ($link) {
			return $link;
		}
	}
	else {

		foreach my $re (
			qr{ ^ next $ }ix,
			qr{   next   }ix,
		)
		{
			my $link = $mech->find_link(text_regex => $re);
			if ($link) {
				return $link;
			}
		}
	}
	save_state();
	say "Cannot find next link. We might have reached the end of the comic.";
	exit $exit;
}

sub find_image {
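	# Locate the comic image on the current page. With link_to_image set,
	# a link whose target matches image_re is preferred over an <img> tag.
	# Returns a WWW::Mechanize object holding the downloaded image data,
	# or nothing if no matching image was found.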
	my $image;

	if (
		$conf{'link_to_image'}
		and $mech->find_link(url_abs_regex => $image_re)
	   )
	{
		$image = $mech->find_link(url_abs_regex => $image_re);
	}
	else {
		$image = $mech->find_image(url_abs_regex => $image_re);
	}

	if ($image) {
		my $tmpmech = WWW::Mechanize->new();
		$tmpmech->get($image->url_abs);
		return $tmpmech;
	}
	return;
}

sub get_image {
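	# Save the current page's comic image to the working directory, using
	# the last component of the image URL as filename. Existing files are
	# skipped; a successful download sets the exit status to 0.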
	my $tmpmech = find_image() or return;
	my $filename = (split(qr{/}o, $tmpmech->uri->as_string))[-1];

	if (-e $filename) {
		say "img: $filename (skipped)";
	}
	else {
		$exit = 0;
		say "img: $filename";
		open(my $fh, '>', $filename) or die("Cannot open $filename: $!");
		print {$fh} $tmpmech->content();
		close($fh) or die("Cannot close $filename: $!");
	}

	return;
}

sub file_to_hash {
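	# Parse a "key <whitespace> value" file (comirror.conf / comirror.state)
	# into a hash. Returns an empty list if the file does not exist.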
	my ($file) = @_;
	my %return;

	if (not -e $file) {
		return;
	}

	open(my $fh, '<', $file) or die("Cannot read $file: $!\n");
	binmode($fh, ':utf8');
	while(my $line = <$fh>) {
		$line =~ / ^ (?<key> \S+ ) [[:space:]]+ (?<value> .*) $ /x or next;
		$return{$+{key}} = $+{value};
	}
	close($fh);
	return %return;
}

sub save_state {
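	# Write the crawler state (currently just the URI to resume from)
	# to comirror.state.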
	# Some webcomics have a non-regular page for the last (as in, latest)
	# image. Work around this.
	$mech->back();

	$state{'uri'} = $mech->uri->as_string;

	open(my $fh, '>', 'comirror.state') or die("Cannot open comirror.state: $!");
	while (my ($key, $value) = each(%state)) {
		print {$fh} "$key\t$value\n";
	}
	close($fh) or die("Cannot close comirror.state: $!");
	return;
}

local $SIG{INT} = sub {
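	# On Ctrl+C, remember where we stopped before exiting.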
	save_state();
	exit $exit;
};

while (
	$mech->get($uri)
	and $mech->success()
	and $mech->status() == 200
      )
{
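	# Main crawl loop: fetch each page, save its comic image and follow
	# the "next" link until none is found or it loops back to this page.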
	say "URI: $uri";

	get_image;

	$uri = find_next_link->URI->abs->as_string;

	if ($uri eq $mech->uri->as_string) {
		save_state();
		say "The 'next' link lead us to a loop.";
		say "This is probably because we reached the end of the comic.";
		exit $exit;
	}

	print "\n";
	# Avoid accidentally DoSing web servers.
	sleep(1);
}




__END__

=head1 NAME

B<comirror> - Generic webcomic mirrorer

=head1 SYNOPSIS

B<comirror> [I<comic url>]

=head1 DESCRIPTION

B<comirror> "reads" a webcomic while saving the comic images to the current
working directory.
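
For example, to mirror a comic into the current directory (URL and paths are
purely illustrative):

  $ cd ~/comics/somecomic
  $ comirror http://www.example.com/comic/1/

For each page, B<comirror> prints the page URI and the name of each saved
image file.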

=head1 OPTIONS

Apart from B<-v> / B<--version>, which print the version and exit,
B<comirror> takes no options.

=head1 EXIT STATUS

Zero if at least one new comic image was downloaded; one if no images were
found or all found images already existed in the current directory.  Any
other non-zero value indicates a fatal error, such as a missing
configuration.
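
The exit status allows scripted use, for instance to trigger an action only
when something new was actually downloaded (the command after C<&&> is just
an example):

  comirror && echo "new comic images downloaded"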

=head1 CONFIGURATION

B<comirror> is designed to operate in the current working directory.  Images
are saved to it; the configuration is read from F<comirror.conf> and the last
state (if any) is read from F<comirror.state>.

Both files contain one key-value pair per line, with key and value separated
by a tab.  Neither comments nor empty lines are supported.  F<comirror.state>
is automatically written when B<comirror> terminates.
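
A single line therefore looks like this (the value is illustrative; the
separator is a literal tab):

  next_link	Next >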

=head2 COMIRROR.CONF

A short explanation of the F<comirror.conf> keys; an example file is shown
below the list.  Note that comirror-setup(1) will create this file for you
automatically; you only need to edit it if comirror-setup(1) did not work
properly or you do not want to use it at all.

=over

=item image_re

A regular expression matching the URL of the webcomic image to be saved.

=item next_link

The link text of the "next page" link.  Can be left out if the link text
contains "next" (matched case-insensitively).

=item link_to_image

If this is set to a true value, B<comirror> will first try to find a link
whose target matches image_re, and only fall back to searching images
directly if that fails.  This is useful for comics with small-ish preview
images that link to larger versions.

The future of this option is unclear; such behaviour may become the default.

=back
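
A complete F<comirror.conf> might look like this; the regular expression and
link text are purely illustrative and need to be adapted to the comic in
question (literal tabs between keys and values):

  image_re	/comics/\d{4}-\d{2}-\d{2}\.(png|jpg)$
  next_link	Next >
  link_to_image	1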

=head2 COMIRROR.STATE

You should not need to edit this.

=over

=item uri

Absolute URI of the last but one comic page B<comirror> was inspecting.  Can
be overridden by the command-line argument.  It exists so that B<comirror>
resumes its crawl from the right point when it is started again.

=back
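
A typical F<comirror.state> therefore contains a single line such as (the URI
is illustrative):

  uri	http://www.example.com/comic/42/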

=head1 DEPENDENCIES

B<comirror> requires the Perl module WWW::Mechanize.
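
It can be installed from CPAN, for example via

  cpan WWW::Mechanize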

=head1 BUGS AND LIMITATIONS

This script has no brain.  It has very limited knowledge about the usual
layout of a webcomic and makes a few guesses which happen to work in a lot of
cases.  However, there may well be webcomics which (combined with an
unrestrictive image_re) cause B<comirror> to crawl lots of non-comic images.
So, of course, use at your own risk.

=head1 SEE ALSO

comirror-setup(1)

=head1 AUTHOR

Copyright (C) 2010 by Daniel Friesel E<lt>derf@chaosdorf.deE<gt>

=head1 LICENSE

  0. You just DO WHAT THE FUCK YOU WANT TO.