#!/usr/bin/env perl ## Copyright © 2010 by Daniel Friesel ## License: WTFPL ## 0. You just DO WHAT THE FUCK YOU WANT TO. use strict; use warnings; use 5.010; use autodie; use Cwd; use Digest::SHA qw(sha1_hex); use File::Find; use Getopt::Long; use Storable qw(nstore retrieve); use Time::Progress; my $base = getcwd(); my $rel_paths = 1; my $read_size = (2 ** 20) * 4; # 4 MiB my $db_file = '.hashl.db'; my $total = 0; my $cur = 0; my $timer; my $VERSION = '0.1'; my $db; GetOptions( 'd|database=s' => \$db_file, 's|read-size=i' => sub { $read_size = $_[1] * 1024 }, ); if (substr($db_file, 0, 1) ne '/') { $db_file = "${base}/${db_file}"; } my $action = $ARGV[0]; if (not defined $action) { die("Usage: $0 \n"); } if (-r $db_file) { $db = retrieve($db_file); $read_size = $db->{'config'}->{'read_size'}; } else { $db->{'config'} = { read_size => $read_size, } } sub get_total { my $file = $File::Find::name; if (-f $file and not -l $file and $file ne $db_file) { $total++; } } sub drop_deleted { for my $file (keys %{$db->{'files'}}) { if (! -e $file) { delete $db->{'files'}->{$file}; } } } sub hash_file { my ($file) = @_; my ($fh, $data); open($fh, '<', $file); binmode($fh); read($fh, $data, $read_size); close($fh); return sha1_hex($data); } sub hash_in_db { my ($hash) = @_; while (my ($name, $file) = each(%{$db->{'files'}})) { if ($file->{'hash'} eq $hash) { return $name; } } return undef; } sub file_in_db { my ($file) = @_; return hash_in_db(hash_file($file)); } sub db_info { printf( "Read size: %d bytes (%.f KiB)\n", $db->{'config'}->{'read_size'}, $db->{'config'}->{'read_size'} / 1024, ); } sub process_file { my $file = $File::Find::name; my $path = $file; my ($size, $mtime) = (stat($file))[7,9]; local $| = 1; if (not -f $file or -l $file or $file eq $db_file) { return; } $cur++; if ($rel_paths) { $file = substr($file, length($base) + 1); } print $timer->report("\r\e[2KUpdating: %p done, %L elapsed, %E remaining", $cur); if (exists($db->{'files'}->{$file}) and $db->{'files'}->{$file}->{'mtime'} == $mtime and $db->{'files'}->{$file}->{'size'} == $size ) { return; } $db->{'files'}->{$file} = { hash => hash_file($path), mtime => $mtime, size => $size, }; if (($cur % 100) == 0) { nstore($db, $db_file); } } if ($action eq 'update') { drop_deleted(); find(\&get_total, $base); $timer = Time::Progress->new(); $timer->attr( min => 1, max => $total ); find(\&process_file, $base); print "\n"; nstore($db, $db_file); } elsif ($action eq 'list') { for my $name (sort keys %{$db->{'files'}}) { my $file = $db->{'files'}->{$name}; printf("%s %s\n", $file->{'hash'}, $name); } } elsif ($action ~~ [qw[know-file know-hash new-file new-hash]]) { if ($ARGV[1]) { given ($action) { when ('know-file') { exit (not defined file_in_db($ARGV[1])) } when ('know-hash') { exit (not defined hash_in_db($ARGV[1])) } when ('new-file') { exit (defined file_in_db($ARGV[1])) } when ('new-hash') { exit (defined hash_in_db($ARGV[1])) } } } else { while (my $line = ) { chomp $line; if ( ($action eq 'know-file' and file_in_db($line)) or ($action eq 'know-hash' and hash_in_db($line)) or ($action eq 'new-file' and not file_in_db($line)) or ($action eq 'new-hash' and not hash_in_db($line))) { say $line; } } } } elsif ($action eq 'info') { db_info(); } __END__ =head1 NAME B - Create database with partial file hashes, check if other files are in it =head1 SYNOPSIS B [B<-d> I] I [I] =head1 DESCRIPTION Actions: =over =item B [I] Checks if I is in the database. Returns 0 if it is, 1 otherwise. If I is not specified, B reads filenames from stdin and returns those which are not already in the database. =item B Update or create hash database. Iterates over all files below the current directory. =back =head1 OPTIONS =over =item B<-d> I Use I instead of F<.hashl.db> =back =head1 CONFIGURATION None, so far =head1 DEPENDENCIES =over =item * autodie (included with perl E= 5.10.1) =item * Digest::SHA =item * Time::Progress =back =head1 BUGS AND LIMITATIONS Unknown. This is alpha software. =head1 EXAMPLES =head2 LEECHING First, create a database of your local files: C<< cd /media/videos; hashl update >> Now, assume you have a (possibly slow) external share mounted at F. You do not want to copy all files to your disk and then use B or similar to weed out the duplicates. Since you just used B to create a database with the hashes of the first 4MB of all your files, you can now use it to check if you (very probably) already have any remote file. For that, you only need to leech the first 4MB of every file on the share, and not the whole file. For example: C<< for f (/tmp/mnt/ext/**/*(.)); hashl in-list $f || cp -i $f /media/videos/incoming/ >> =head2 EXTERNAL HARD DISK Personally, I have all my videos on an external hard disk, which I usually do not carry with me. So, when I get new videos, I put them into F<~/lib/videos> on my netboo, and then later copy them to the external disk. Of course, it can always happen that I get a movie I already have, or forget to move something from F<~/lib/videos> to the external disk, especially since I also always have some stuff from the disk in F<~/lib/videos>. However, I can use B to conveniently solve this issue. Run periodically: C<< cd /media/argon; hashl -d ~/lib/video/.argon update >> Now, I always have a list of files on the external disk with me. When I get a new file: C<< hashl -d ~/lib/video/.argon in-list $file >> And to find out which files are not on the external disk: C<< cd ~/lib/video; print -l **/*(.) | hashl -d .argon in-list >> =head1 AUTHOR Copyright (C) 2010 by Daniel Friesel Ederf@finalrewind.orgE =head1 LICENSE 0. You just DO WHAT THE FUCK YOU WANT TO.