#!/usr/bin/perl -w

# $Id: dedup,v 1.18 2010-01-27 10:20:08-05 roderick Exp $
#
# Roderick Schertler <roderick@argon.org>

# Copyright (C) 2001 Roderick Schertler
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# For a copy of the GNU General Public License write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

# XXX switch to show all things which would be removed, prompt, then
# removed them
# XXX maybe switch to prompt item by item

use strict;
use RS::Handy		qw (:stat $Me xdie getopt);
use POSIX		qw (SIGINT WEXITSTATUS WIFSIGNALED);
use Proc::WaitStat	qw (waitstat);
use String::ShellQuote	qw (shell_quote shell_comment_quote);

my $Batch	= 0;
my $Debug	= 0;
my $Do_link	= 0;
my $Exit	= 0;
my $No		= 0;
my $Stdin	= undef;
my $Verbose	= 0;
my $Version	= q$Revision: 1.18 $ =~ /(\d\S+)/ ? $1 : '?';

my @Option_spec = (
    'batch'	=> \$Batch,
    'debug!'	=> \$Debug,
    'help'	=> sub { usage() },
    'link'	=> \$Do_link,
    'no|n'	=> \$No,
    'stdin'	=> sub { $Stdin = "\n" },
    'stdin0'	=> sub { $Stdin = "\0" },
    'verbose|v+' => \$Verbose,
    'version'	=> sub { print "$Me version $Version\n"; exit },
);

my $Usage = <<EOF;
usage: $Me [switch]... file file...
switches:
     --batch		exit silently if given less than 2 files
     --debug		turn debugging on
     --help		show this and then die
     --link		instead of removing duplicates, link them together
 -n, --no		output info but don't actually do anything
     --stdin		read newline-terminated file list from stdin
     --stdin0		read null-terminated file list from stdin
 -v, --verbose		be verbose (multiple -v -> more verbose yet)
     --version	show the version ($Version) and exit
Use `perldoc $Me' to see the full documenation.
EOF

sub xwarn {
    RS::Handy::xwarn @_;
    $Exit ||= 1;
}

sub debug {
    print STDERR "debug: ", @_, "\n" if $Debug;
}

sub usage {
    xwarn @_ if @_;
    # Use exit() rather than die(), as Getopt::Long does eval().
    print STDERR $Usage;
    exit 1;
}

sub init {
    getopt -bundle, @Option_spec or usage if @ARGV;

    if (defined $Stdin) {
	local $/ = $Stdin;
	my @list = <STDIN>;
	chomp @list;
	push @ARGV, @list;
    }
}

# $info points to a hash of lists of lists.  Keys are file sizes.  Values
# are anon lists, element 0 is the file path, element 1 is the mtime.

sub F_PATH	{ 0 };
sub F_MTIME	{ 1 };

sub get_file_list {
    my @file = @_;
    my $info = {};
    my ($file, %seen);
    my $tot_files = 0;

    if (@file < 2) {
	exit if $Batch;
	usage "at least 2 files must be specified\n";
    }

    foreach $file (@file) {
	# NB lstat so symlinks are skipped.
	my @stat = lstat $file or do {
	    xwarn "can't lstat $file:";
	    next;
	};
	next unless -f _;

	my $seen = $seen{$stat[ST_DEV]}{$stat[ST_INO]};
	if (!defined $seen) {
	    $seen{$stat[ST_DEV]}{$stat[ST_INO]} = $file;
	}
	else {
	    if ($Verbose > 1) {
	    	printf "# skipping additional link (%s) to %s\n",
		    shell_comment_quote($file), shell_comment_quote($seen);
	    }
	    next;
	}

	my ($size, $mtime) = @stat[ST_SIZE, ST_MTIME];
	next unless $size;
	$tot_files++;
	push @{$info->{$size}}, [$file, $mtime];
    }

    if ($tot_files < 2) {
	exit if $Batch;
	xdie "didn't get at least 2 valid files\n";
    }
    return $info;
}

sub handle_dup {
    my ($i, $j, $rinfo) = @_;

    my ($ti, $tj) = ($rinfo->[$i][F_MTIME], $rinfo->[$j][F_MTIME]);
    my @tab       = ([$j, $j, $j], [$j, $j, $i], [$i, $i, $i]);
    my $goner     = $tab[1+($ti <=> $tj)][1+($i <=> $j)];
    my $pgoner    = $rinfo->[$goner][F_PATH];
    my $keeper    = ($goner == $i) ? $j : $i;
    my $pkeeper   = $rinfo->[$keeper][F_PATH];

    if ($Verbose or $No) {
	my $q_pgoner  = shell_quote $pgoner;
	my $q_pkeeper = shell_quote $pkeeper;
	if ($Do_link) {
	    printf "rm %s;\tln %s %s\n",
	    	$q_pgoner, $q_pkeeper, $q_pgoner;
	}
	else {
	    printf "rm %s\t# keeping %s\n",
		$q_pgoner, shell_comment_quote $q_pkeeper;
	}
    }


    if ($No) {
	# do nothing
    }
    elsif (!unlink $pgoner) {
	xwarn "can't unlink $pgoner:";
    }
    elsif ($Do_link && ! link $pkeeper, $pgoner) {
    	xwarn "can't link $pkeeper to $pgoner:";
    }

    undef $rinfo->[$goner];
}

sub process_files {
    my $info = shift;
    my $size;
    foreach $size (keys %$info) {
	my @info = @{$info->{$size}};
	next unless @info > 1;
	debug "checking ", scalar @info, " elements of size $size";
	my ($i, $j);
	for ($i = 0; $i < @info; $i++) {
	    for ($j = $i+1; $j < @info; $j++) {
		next unless $info[$i] and $info[$j];
		my ($pi, $pj) = ($info[$i][F_PATH],  $info[$j][F_PATH]);
		my @cmd = ('cmp', '-s', $pi, $pj);
		debug join ' ', @cmd;
		my $return = system @cmd;
		if ($return == 0) {
		    handle_dup $i, $j, \@info;
		} elsif (WEXITSTATUS($return) == 1) {
		    print "# ", shell_comment_quote($pi), " and ",
			    shell_comment_quote($pj), " differ\n"
			if $Verbose > 1;
		} elsif (WIFSIGNALED $return) {
		    # give up so I'm interruptible
		    xdie "cmp killed (", waitstat ($return),
			    ") comparing $pi and $pj\n";
		} else {
		    xwarn "bad exit (", waitstat ($return),
			    ") from cmp comparing $pi and $pj\n";
		}
	    }
	}
    }
}

sub main {
    init;
    process_files get_file_list @ARGV;
    return 0;
}

$Exit = eval { main } || $Exit;
die $@ if $@;
exit $Exit;

__END__

=head1 NAME

dedup - remove duplicate files

=head1 SYNOPSIS

B<dedup>
[B<--batch>]
[B<--debug>]
[B<--link>]
[B<--no>|B<-n>]
[B<--stdin>]
[B<--stdin0>]
[B<--verbose>|B<-v>]
I<file> I<file>...

=head1 DESCRIPTION

I<dedup> scans all the given I<file> arguments and removes any
duplicates.  Files of 0 length are silently skipped.  When choosing
which I<file> to retain older files are preferred over newer ones, and
files given earlier on the command line are preferred over those given
later.

=head1 OPTIONS

=over 4

=item B<--batch>

Don't complain if fewer than 2 files are given or if fewer than 2 remain
after screening out non-files and the like.

=item B<--debug>

Turn debugging on.

=item B<--link>

Rather than removing duplicates, (hard) link them together so they share
storage.

=item B<--help>

Show the usage message and die.

=item B<--no>, B<-n>

Print what would have been done, but don't actually do anything.

=item B<--stdin>

Read stdin for a list of files and add these to those given on the
command line.

=item B<--stdin0>

Same, but the files on stdin are separated by null characters rather
than newlines.

=item B<--verbose>, B<-v>

Be more verbose.  A single B<--verbose> causes B<dedup> to display which
files are being removed.  Adding another causes it to additionally output
the names of files which are the same length but which differ.

=item B<--version>

Show the version number and exit.

=back

=head1 BUGS

B<dedup> doesn't try to deal with files which have multiple links, it
just ignores them.  It doesn't even allow symlinks.

=head1 AVAILABILITY

The code is licensed under the GNU GPL.  Check
http://www.argon.org/~roderick/ for updated versions.

=head1 AUTHOR

Roderick Schertler <roderick@argon.org>

=cut
