#!/usr/bin/perl -w # $Id: dedup,v 1.16 2005-03-12 20:19:15-05 roderick Exp roderick $ # # Roderick Schertler # Copyright (C) 2001 Roderick Schertler # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or (at # your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # For a copy of the GNU General Public License write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # XXX switch to show all things which would be removed, prompt, then # removed them # XXX maybe switch to prompt item by item use strict; use RS::Handy qw ($Me xdie getopt); use POSIX qw (SIGINT WEXITSTATUS WIFSIGNALED); use Proc::WaitStat qw (waitstat); use String::ShellQuote qw (shell_quote); my $Batch = 0; my $Debug = 0; my $Exit = 0; my $No = 0; my $Stdin = undef; my $Verbose = 0; my $Version = q$Revision: 1.16 $ =~ /(\d\S+)/ ? $1 : '?'; my @Option_spec = ( 'batch' => \$Batch, 'debug!' => \$Debug, 'help' => sub { usage() }, 'no|n' => \$No, 'stdin' => sub { $Stdin = "\n" }, 'stdin0' => sub { $Stdin = "\0" }, 'verbose|v+' => \$Verbose, 'version' => sub { print "$Me version $Version\n"; exit }, ); my $Usage = < more verbose yet) --version show the version ($Version) and exit Use `perldoc $Me' to see the full documenation. EOF sub xwarn { RS::Handy::xwarn @_; $Exit ||= 1; } sub debug { print STDERR "debug: ", @_, "\n" if $Debug; } sub usage { xwarn @_ if @_; # Use exit() rather than die(), as Getopt::Long does eval(). print STDERR $Usage; exit 1; } sub init { getopt -bundle, @Option_spec or usage if @ARGV; if (defined $Stdin) { local $/ = $Stdin; my @list = ; chomp @list; push @ARGV, @list; } } # $info points to a hash of lists of lists. Keys are file sizes. Values # are anon lists, element 0 is the file path, element 1 is the mtime. sub F_PATH { 0 }; sub F_MTIME { 1 }; sub get_file_list { my @file = @_; my $info = {}; my ($file, %seen); my $tot_files = 0; if (@file < 2) { exit if $Batch; usage "at least 2 files must be specified\n"; } foreach $file (@file) { next if $seen{$file}++; # NB lstat so symlinks are skipped. my @stat = lstat $file or do { xwarn "can't lstat $file:"; next; }; next unless -f _; my ($nlink, $size, $mtime) = @stat[3, 7, 9]; next unless $size; if ($nlink != 1) { xwarn "$file has $nlink links, skipping it\n"; next; } $tot_files++; push @{$info->{$size}}, [$file, $mtime]; } if ($tot_files < 2) { exit if $Batch; xdie "didn't get at least 2 valid files\n"; } return $info; } sub remove_dups { my $info = shift; my $size; foreach $size (keys %$info) { my @info = @{$info->{$size}}; next unless @info > 1; debug "checking ", scalar @info, " elements of size $size"; my ($i, $j); for ($i = 0; $i < @info; $i++) { for ($j = $i+1; $j < @info; $j++) { next unless $info[$i] and $info[$j]; my ($pi, $pj) = ($info[$i][F_PATH], $info[$j][F_PATH]); my ($ti, $tj) = ($info[$i][F_MTIME], $info[$j][F_MTIME]); my @cmd = ('cmp', '-s', $pi, $pj); debug join ' ', @cmd; my $return = system @cmd; if ($return == 0) { my @tab = ([$j, $j, $j], [$j, $j, $i], [$i, $i, $i]); my $goner = $tab[1+($ti <=> $tj)][1+($i <=> $j)]; my $pgoner = $info[$goner][F_PATH]; if ($Verbose or $No) { my $pgoner = shell_quote $pgoner; my $keeper = ($goner == $i) ? $j : $i; my $pkeeper = shell_quote $info[$keeper][F_PATH]; $pkeeper =~ s/\n/\n\t# /g; print "rm $pgoner\t# keeping $pkeeper\n" } unlink $pgoner or xwarn "can't unlink $pgoner:" unless $No; undef $info[$goner]; } elsif (WEXITSTATUS($return) == 1) { print "# ", shell_quote($pi), " and ", shell_quote($pj), " differ\n" if $Verbose > 1; } elsif (WIFSIGNALED $return) { # give up so I'm interruptible xdie "cmp killed (", waitstat ($return), ") comparing $pi and $pj\n"; } else { xwarn "bad exit (", waitstat ($return), ") from cmp comparing $pi and $pj\n"; } } } } } sub main { init; remove_dups get_file_list @ARGV; return 0; } $Exit = eval { main } || $Exit; die $@ if $@; exit $Exit; __END__ =head1 NAME dedup - remove duplicate files =head1 SYNOPSIS B [B<--batch>] [B<--debug>] [B<--no>|B<-n>] [B<--stdin>] [B<--stdin0>] [B<--verbose>|B<-v>] I I... =head1 DESCRIPTION I scans all the given I arguments and removes any duplicates. Files of 0 length are silently skipped. When choosing which I to retain older files are preferred over newer ones, and files given earlier on the command line are preferred over those given later. =head1 OPTIONS =over 4 =item B<--batch> Don't complain if fewer than 2 files are given or if fewer than 2 remain after screening out non-files and the like. =item B<--debug> Turn debugging on. =item B<--help> Show the usage message and die. =item B<--no>, B<-n> Print what would have been done, but don't actually do anything. =item B<--stdin> Read stdin for a list of files and add these to those given on the command line. =item B<--stdin0> Same, but the files on stdin are separated by null characters rather than newlines. =item B<--verbose>, B<-v> Be more verbose. A single B<--verbose> causes B to display which files are being removed. Adding another causes it to additionally output the names of files which are the same length but which differ. =item B<--version> Show the version number and exit. =back =head1 BUGS B doesn't try to deal with files which have multiple links, it just ignores them. It doesn't even allow symlinks. =head1 AVAILABILITY The code is licensed under the GNU GPL. Check http://www.argon.org/~roderick/ for updated versions. =head1 AUTHOR Roderick Schertler =cut