#!/usr/bin/perl -w
use strict;

# $Id: gzip-links,v 1.4 2005-03-25 10:00:39-04 roderick Exp $

use Proc::SafePipe	qw(backtick_noshell popen_noshell);
use Proc::WaitStat	qw(waitstat);
use RS::Handy		qw(:stat $Me data_dump getopt xdie);

my @Gzip_arg_0 = (
    map({ "-$_" } 1..9),
    qw(
	-N --name
	-n --no-name
	-q --quiet
	-v --verbose
	   --fast
	   --best
	   --rsyncable
    ),
);

# first is preferred
my @Gzip_suffix	= qw(.gz .z .Z .taz .tgz -gz -z _z);

my $Allow_no_files	= 0;
my $Debug		= 0;
my $Exit		= 0;
my $Gzip_suffix_pat	= undef;
my $Gzip_suffix		= $Gzip_suffix[0];
my $No			= 0;
my @Real_gzip_arg	= ();
my $Quiet		= 0;
my $Stdin0		= 0;
my $Verbose		= 0;
my $Version		= q$Revision: 1.4 $ =~ /(\d\S+)/ ? $1 : '?';

my @Option_spec = (
    'allow-no-files' => \$Allow_no_files,
    'debug+'	=> \$Debug,
    'help'	=> sub { usage() },
    'no'	=> \$No,
    'stdin0'	=> \$Stdin0,
    'version'	=> sub { print "$Me version $Version\n"; exit },

    map({
	    my $full = $_; # need lexical for closure
	    (my $plain = $full) =~ s/^-+// or die;
	    $plain => sub { push @Real_gzip_arg, $full }
	} @Gzip_arg_0),
);

my $Usage = <<EOF;
usage: $Me [switch]... file...
switches:
     --allow-no-files	don't complain if no files were specified
     --debug		turn debugging on
     --help		show this and then die
     --no		don't change anything
     --stdin0		add null-terminated file names from stdin to arg list
     --version		show the version ($Version) and exit
switches passed to gzip:
     @Gzip_arg_0

Use \`perldoc $Me\' to see the full documentation.
EOF

sub xwarn {
    RS::Handy::xwarn_caller 1, @_
    	unless $Quiet;
    $Exit ||= 1;
}

sub debug {
    print STDERR "$Me: debug: ", @_, "\n" if $Debug;
}

sub verbose {
    print "$Me: ", @_, "\n" if $Verbose;
}

sub usage {
    xwarn @_ if @_;
    # Use exit() rather than die(), as Getopt::Long does eval().
    print STDERR $Usage;
    exit 1;
}

sub init {
    $| = 1;
    getopt -bundle, @Option_spec or usage if @ARGV;

    $Quiet	= grep { $_ eq '-q' || $_ eq '--quiet'   } @Real_gzip_arg;
    $Verbose	= grep { $_ eq '-v' || $_ eq '--verbose' } @Real_gzip_arg;

    my $p = join '|', map { quotemeta } @Gzip_suffix;
    $Gzip_suffix_pat = qr/(?:$p)/;

    if ($Stdin0) {
    	-t STDIN
	    and usage "--stdin0 specified but stdin is a tty\n";
	my $s = do { local $/; <STDIN> };
	# strip 1 trailing null, not all, so an empty name will be an
	# error later
	$s =~ s/\0\z//
	    or $s eq ''
	    or xdie "--stdin0 input didn't end with null\n";
	push @ARGV, split /\0/, $s, -1;
    }

}

# Group paths by device/inode.  Strip out those which can't be statted.
# Return a reference to %inode:
#
#   $inode{$dev}{$inode} = [$nlinks, $path...]

sub group_by_inode {
    my @path = @_;
    my (%seen, %inode);

    for my $p (@path) {
	if ($seen{$p}++) {
	    xwarn "$p specified multiple times\n";
	    next;
    	}
    	my @st = stat $p;
	if (!@st) {
	    xwarn "can't stat $p:";
	    next;
	}
	my $r = $inode{$st[ST_DEV]}{$st[ST_INO]} ||= [$st[ST_NLINK]];
	push @$r, $p;
    }
    print data_dump "inode:", \%inode if $Debug > 1;

    return \%inode;
}

# Given \%inode, do preprocessing.
#
# For multiply-linked files, remove all but the first name, saving the
# other names for later re-linking.
#
# Return references to:
#    $single[$i] = $singly_linked_path
#    $mult{$path_1} = [$path_2...]

sub preprocess {
    my ($rall) = @_;
    my (@single, %mult);

    for my $dev (keys %$rall) {
      Inode:
    	for my $inode (keys %{ $rall->{$dev} }) {
	    my ($nlink, @p) = @{ $rall->{$dev}{$inode} };
	    debug "preprocess $dev/$inode nlink=$nlink names=@p";

	    if ($nlink != @p) {
    	    	xwarn "skipping multiply-linked inode, got ", 0+@p,
		    " of $nlink links with: @p\n";
    	    	next Inode;
	    }

    	    # Skip inodes with compressed-looking names, as gzip would
    	    # do the same, and I don't want to remove links 2+ in that
    	    # case (especially if it's those names which are bad).

	    if (grep { /$Gzip_suffix_pat\z/ } @p) {
	    	for (@p) {
		    xwarn "skipping $_, ",
			/($Gzip_suffix_pat)\z/
			    ? "already has $1 suffix\n"
			    : "other name for this inode has a"
    	    	    	    	." compressed suffix\n";
    	    	}
		next Inode;
	    }

	    # Similarly if the .gz version exists.

    	    {
	    	my @exists = grep { lstat } map { "$_$Gzip_suffix" } @p;
		if (@exists) {
    	    	    xwarn "skipping @p due to existing @exists\n";
		    next Inode;
		}
	    }

	    if (@p > 1) {
	    	my ($keep, @rm) = @p;
		@p = ($keep);
	    	verbose "compressing $keep for @rm";
		for (@rm) {
		    if (!$No && !unlink $_) {
			xwarn "error unlinking $_:";
			# put it in the %mult list anyway, I can create the
			# $_.gz link.
		    }
		}
		$mult{$keep} = \@rm;
	    }

	    push @single, $p[0];
	}
    }

    return \@single, \%mult;
}

# Compress the listed files, which are all singly-linked.

sub compress {
    my ($rsingle) = @_;

    print data_dump "rsingle:", $rsingle if $Debug > 1;

    my @cmd = (qw(xargs -0 --no-run-if-empty gzip), @Real_gzip_arg);
    debug "running @cmd";
    my $fh = eval { popen_noshell 'w', @cmd };
    if (!$fh) {
    	chomp $@;
    	xwarn "error running xargs/gzip: $@\n";
	return;
    }

    for (@$rsingle) {
	debug "to-gzip $_";
	print $fh "$_\0"
	    or xwarn "error writing to xargs/gzip:";
    }

    if (!close $fh) {
	xwarn "error closing xargs/gzip: ",
	    ($!+0 ? "$!" : 'non-zero exit (' . waitstat($?) . ')'),
	    "\n";
    }

}

# Put links back that I removed, but with a .gz attached if appropriate.

sub postprocess {
    my ($rmult) = @_;

    print data_dump "rmult:", $rmult if $Debug > 1;

    my $suff = $Gzip_suffix;

    for my $base (keys %$rmult) {
	my @sub		= @{ $rmult->{$base} };
    	my $have_plain	= -f $base;
	my $have_gz	= -f "$base$suff";

	if ($have_gz) {
	    if ($have_plain) {
		xwarn "both $base and $base$suff now exist,",
		    	" using $base$suff\n";
		$have_plain = 0;
	    }
	}
	elsif ($have_plain) {
	    xwarn "$base didn't get compressed\n";
	}
	else {
	    xwarn "neither $base nor $base$suff now exists,",
		    " skipping link restore of @sub\n";
    	    next;
	}

	my $this_suff	= $have_gz ? $suff : '';
	my $base_path	= "$base$this_suff";

	verbose "relinking $base_path to ",
	    join ' ', map { "$_$this_suff" } @sub;

	for my $sub (@sub) {
	    my $sub_path = "$sub$this_suff";
	    if (!link $base_path, $sub_path) {
	    	xwarn "error linking $base_path to $sub_path:";
	    }
	}
    }
}

sub work {
    my @path = @_;
    if ($Debug) {
	debug "input $_" for @path;
    }

    if (!@path) {
    	return if $Allow_no_files;
	usage "no files specified\n";
    }

    my $rinode = group_by_inode @path;
    %$rinode
    	or return;

    my ($rsingle, $rmult) = preprocess $rinode;
    @$rsingle
    	or return;

    # After this point warn but continue after errors as possible so I
    # put the links back for the multiply-linked inodes which were
    # removed.

    return if $No;
    compress $rsingle;
    postprocess $rmult;
}

sub main {
    init;
    work @ARGV;
    return 0;
}

$Exit = main || $Exit;
$Exit = 1 if $Exit && !($Exit % 256);
exit $Exit;

__END__

=head1 NAME

gzip-links - run gzip but deal with files which have multiple (hard) links

=head1 SYNOPSIS

B<gzip-links>
[B<--allow-no-files>]
[B<--debug>]
[B<--help>]
[B<--no>]
[B<--stdin0>]
[B<--version>]
[B<-1>] [B<-2>] [B<-3>] [B<-4>] [B<-5>] [B<-6>] [B<-7>] [B<-8>] [B<-9>]
[B<-N>] [B<--name>]
[B<-n>] [B<--no-name>]
[B<-q>] [B<--quiet>]
[B<-v>] [B<--verbose>]
[B<--fast>]
[B<--best>]
[B<--rsyncable>]
[I<switch>]...
file...

=head1 DESCRIPTION

B<gzip-links> is like gzip but it allows you to compress multiply-linked
files.  It does this by removing all but one of the links, compressing
the data under the remaining name, then linking to the removed names
(plus .gz).

=head1 OPTIONS

=over 4

=item B<--allow-no-files>

Don't complain if no files are given.  This is particularly useful with
B<--stdin0>, such as when feeding in B<find> output when you don't know
if there will be any matches.

=item B<--debug>

Turn debugging on.   Use multiple times for more detail.

=item B<--help>

Show the usage message and die.

=item B<--no>

Go through the motions as possible, but don't change anything.

=item B<--stdin0>

Read null-terminated file names (such as from C<find -print0>) from
stdin and treat them like the files specified as arguments.  This is
necessary because all of the names for a file have to be seen by the
same B<gzip-links> invocation, and with C<xargs -0> this can't be
guaranteed.

=item B<--version>

Show the version number and exit.

=back

=head1 OPTIONS FOR GZIP

These switches are passed along to B<gzip>.

=over 4

=item B<-1> .. B<-9>

=item B<-N>, B<--name>

B<--name> doesn't make much sense in the multiply-linked case, since
only 1 name (the first given on the command line) is saved.

=item B<-n>, B<--no-name>

=item B<-q>, B<--quiet>

B<--quiet> affect B<gzip-links> as well.

=item B<-v>, B<--verbose>

B<--verbose> affect B<gzip-links> as well.

=item        B<--fast>

=item        B<--best>

=item        B<--rsyncable>

=back

=head1 EXAMPLES

    $ ls -liG
    total 3
    3499474 -rw-rw-r--    2 roderick      623 Feb 17 14:48 mult.1
    3499474 -rw-rw-r--    2 roderick      623 Feb 17 14:48 mult.2
    3499476 -rw-rw-r--    1 roderick      316 Feb 17 14:49 single
    $ gzip-links -v *
    gzip-links: compressing mult.1 for mult.2
    mult.1:                  40.2% -- replaced with mult.1.gz
    single:                  36.3% -- replaced with single.gz
    gzip-links: relinking mult.1.gz to mult.2.gz
    $ ls -liG
    total 2
    3499477 -rw-rw-r--    2 roderick      397 Feb 17 14:48 mult.1.gz
    3499477 -rw-rw-r--    2 roderick      397 Feb 17 14:48 mult.2.gz
    3499474 -rw-rw-r--    1 roderick      226 Feb 17 14:49 single.gz
    $ _

=head1 AVAILABILITY

The code is licensed under the GNU GPL.  Check
http://www.argon.org/~roderick/ for updated versions.

=head1 AUTHOR

Roderick Schertler <roderick@argon.org>

=cut
