#!/usr/bin/perl -w
use strict;

# $Id: rss-download,v 1.12 2006-10-27 18:46:11-04 roderick Exp roderick $

# XXX
# - new name, too many hits for rss-download on Google
# - document
# - per-feed refetch intervals, with default from <ttl>
# - for --download-command, put info into environment variables (eg, rss
#   url, item url, file name, title)
# - improve interface
#     - different storage directories
#     - episode history
#     - way to select with Perl code (--eval)
#     - way to configure with Perl code from a file?
#     - REs to match against title, description, link, file, any
#     - way to restrict REs to some URLs
#     - different ways to determine uniqueness
# - periodic cleanup of %persist db
# - config file?

use sigtrap qw(die untrapped normal-signals);

use Fcntl		qw(/^O_/);
use File::Basename	qw(basename);
use LWP::UserAgent	  ();
use RS::Handy		qw($Me
			    badinvo chompexpr_fileline create_index_subs
			    data_dump dstr exclusive_create getopt iso_dt
			    safe_tmp url_decode xdie);
use URI			  ();
use XML::RAI		  ();
use XML::RAI::Enclosure	  ();

my $Cookie_file		= undef;
my $Download_cmd	= undef;
my $Exit_after_seconds	= undef;
my $Sleep		= 60 * 60;
my $Persist		= undef;
my $Persist_module	= 'AnyDBM_File';
my $Verbose		= 0;

my $Debug	= undef;
my $Debug_save	= undef;
my $Exit	= 0;
my $Init_done	= 0;
my $No		= 0;
my @Re		= ();
my @Url		= ();
my %Url		= ();
my $Version	= q$Revision: 1.12 $ =~ /(\d\S+)/ ? $1 : '?';

my @Option_spec = (
    'cookies=s'		=> \$Cookie_file,
    'debug:i'		=> \$Debug,
    'debug-save=s'	=> \$Debug_save,
    'download-cmd=s'	=> \$Download_cmd,
    'exit-after=i'	=> \$Exit_after_seconds,
    'help'		=> sub { usage() },
    'no|n'		=> \$No,
    'once'		=> sub { $Exit_after_seconds = -1 },
    'persist=s'		=> \$Persist,
    'persist-module=s'	=> \$Persist_module,
    're=s'		=> \@Re,
    'url=s'		=> \@Url,
    'sleep=i'		=> \$Sleep,
    'verbose|v'		=> \$Verbose,
    'version'		=> sub { print "$Me version $Version\n"; exit },
);

my $Usage = <<EOF;
usage: $Me [switch]...
required switches:
    --url u		RSS URL (multiples ok)
    --re r		regex matching items to download (multiples ok)
switches:
    --cookies f		store cookies in file f
    --debug		turn debugging on
    --debug-save f	save downloaded RSS data to f.*
    --download-cmd c	use c to download links rather than doing it myself
    --exit-after s	exit some time after s seconds (I use this because
    	    	    	there's a memory leak I haven't spotted)
    --help		show this and then die
    --once		just download one time and exit, don't sleep and loop
    --persist p		save info about previously downloaded links in p
    --persist-module m	use m for storing --persist database ($Persist_module)
    --sleep s		seconds to sleep between fetches ($Sleep)
    --version		show the version ($Version) and exit
Use \`perldoc $Me\' to see the full documentation.
EOF

use subs qw(url_last_modified);

sub xwarn {
    unshift @_, iso_dt, " " if $Init_done;
    RS::Handy::xwarn_caller 1, @_;
    $Exit ||= 1;
}

sub chompexpr {
    @_ == 1 || badinvo;
    local $_ = shift;
    chomp;
    return $_;
}

sub debug_nonl {
    print STDERR "$Me: debug: ", @_ if $Debug;
}

sub debug {
    debug_nonl @_, "\n" if $Debug;
}

sub info {
    print iso_dt, " ", @_, "\n"
}

sub verbose {
    info @_
	if $Verbose;
}

sub usage {
    xwarn @_ if @_;
    # Use exit() rather than die(), as Getopt::Long does eval().
    print STDERR $Usage;
    exit 1;
}

sub init {
    $| = 1;

    getopt -bundle, @Option_spec or usage if @ARGV;

    # Getopt::Long's optional numeric args default to zero if specified
    # without a number, so bump that by one.

    if (!defined $Debug) {
	$Debug = 0;
    }
    elsif ($Debug == 0) {
	$Debug = 1;
    }
    $Verbose = 1 if $Debug;

    @Url
    	or usage "no --url switches given\n";
    for (@Url) {
    	if (exists $Url{$_}) {
    	    xdie "--url $_ specified multiple times\n";
    	}
	$Url{$_} = [];
    }

    @Re
    	or usage "no --re switches given\n";
    for (@Re) {
    	my $compiled = eval { qr/$_/i };
	if ($@) {
	    $@ = chompexpr_fileline $@;
	    xwarn "invalid --re ", dstr $_, ": $@\n";
	}
	$_ = $compiled;
    }
    if ($Exit != 0) {
	xdie "exiting due to errors\n"
    }

    @ARGV and usage "unknown non-switch args: @ARGV\n";

    if ($Debug) {
	require LWP::Debug;
	LWP::Debug::level('+');
    }

    db_init();

    $Init_done = 1;
}

sub NOT_MODIFIED () { \"not modified" }

{

my $ua;
sub _init_ua {
    return if $ua;

    $ua = LWP::UserAgent->new
	or xdie "can't initialize LWP user agent\n";
    $ua->env_proxy;
    $ua->agent("$Me/$Version");
    if (defined $Cookie_file) {
	$ua->cookie_jar({ file => $Cookie_file });
	$ua->cookie_jar
	    or xdie "can't initialize cookie jar\n";
	debug "cookie jar: ", chompexpr data_dump $ua->cookie_jar
	    if $Debug > 2;
    }
}

sub get_backend {
    @_ == 2 || badinvo;
    my ($use_last_modified, $url) = @_;

    _init_ua;

    my $req = HTTP::Request->new(GET => $url);
    if (my $jar = $ua->cookie_jar) {
	$jar->add_cookie_header($req);
    }
    if ($use_last_modified && defined(my $d = url_last_modified $url)) {
	$req->header('If-Modified-Since' => $d);
    }
    debug "request: ", $req->as_string
	if $Debug > 2;
    debug "request object: ", chompexpr data_dump $req
	if $Debug > 3;

    # XXX I'm not doing any redirects because unauthorized responses
    # redirect instead of saying unauthorized, and I don't want to loop
    # on those because the web site ops are touchy.
    #
    # This is not a good solution because some redirects are valid.
    # This needs touching up.  Perhaps test if the any redirect you get
    # puts you back to a page you already got?

    my $r = $ua->simple_request($req);
    debug "response: ", $r->as_string
	if $Debug > 2;
    debug "response object: ", data_dump $r
	if $Debug > 3;
    if ($r->code == 304) {
    	return NOT_MODIFIED;
    }
    if (!$r->is_success) {
    	xwarn "error downloading $url: ", $r->status_line, "\n";
	return;
    }

    if ($use_last_modified) {
	url_last_modified $url, $r->header('Last-Modified');
    }
    return $r->content;
}

}

sub get {
    return get_backend 0, @_;
}

sub get_with_last_modified {
    return get_backend 1, @_;
}

sub link_to_file {
    @_ == 1 || badinvo;
    my ($link_url) = @_;

    my $lu = URI->new($link_url);
    if (!$lu) {
	xwarn "invalid link url $link_url\n";
	return;
    }

    return basename url_decode $lu;
}

{

my %persist;
my $do_sync = 0;
my $db_version = 1;
my $db_version_key = ':rss_download_persist_version';

sub db_init {
    return unless $Persist;

    @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File ODBM_File);

    require MLDBM;
    $MLDBM::UseDB = # squelch warning
    $MLDBM::UseDB = $Persist_module;

    if (!tie %persist, 'MLDBM', $Persist, O_RDWR, 0666) {
	tie %persist, 'MLDBM', $Persist, O_RDWR | O_CREAT, 0666
	    or xdie "can't create --persist $Persist with $Persist_module: $!";
	$persist{$db_version_key} = $db_version;
    }

    if (tied(%persist)->UseDB->can('sync')) {
    	debug "db can sync";
    	$do_sync = 1;
    }

    my $v = $persist{$db_version_key};
    if (!defined $v) {
	xdie "--persist $Persist isn't in my format\n";
    }
    elsif ($v != $db_version) {
	xdie "--persist $Persist doesn't use my version",
		" ($v != $db_version)\n";
    }
}

sub db_put {
    @_ >= 2 || badinvo;
    my ($k, @v) = @_;

    debug "db_put ", dstr $k, " => ",
	@v > 1 ? chompexpr data_dump @v : dstr $v[0];
    return if $No;
    $persist{$k} = [time, @v];
    tied(%persist)->UseDB->sync if $do_sync;
}

sub db_get {
    @_ == 1 || badinvo;
    my ($k) = @_;

    my $r = $persist{$k};
    debug "db_get ", dstr $k, " => ", chompexpr data_dump $r;
    if (!$r) {
	return;
    }
    my @v = @$r;
    shift @v; # time
    return wantarray
	    ? @v
	    : @v <= 1
		? $v[0]
		: xdie 0+@v, " elements to return for ", dstr $k;
}

sub db_delete {
    @_ == 1 || badinvo;
    my ($k) = @_;

    debug "db_delete ", dstr $k;
    delete $persist{$k};
}

END {
    # XXX way to check for errors
    untie %persist;
}

}

sub db_key {
    @_ >= 2 || badinvo;
    my ($tag, @rest) = @_;
    return join "\x00", $tag, @rest;
}

sub seen_link {
    @_ == 2 || @_ == 3 || badinvo;
    my $have_new = @_ == 3;
    my ($rss_url, $link_url, $new) = @_;

    my $k = link_to_file $link_url;
    my $old = defined db_get $k;
    if ($have_new) {
    	if ($new) {
	    db_put $k, $link_url;
	}
	else {
	    db_delete $k;
	}
    }
    return $old;
}

sub url_last_modified {
    @_ == 1 || @_ == 2 || badinvo;
    my $have_new = @_ == 2;
    my ($url, $new) = @_;

    my $k = db_key 'url-last-modified', $url;
    my $old = db_get $k;
    if ($have_new) {
    	if (defined $new) {
	    db_put $k, $new;
	}
	else {
	    db_delete $k;
	}
    }
    return $old;
}

sub show_item {
    @_ == 2 || badinvo;
    my ($item, $file) = @_;

    my $fmt = "%-10s %s\n";
    print "\n";
    for my $meth (qw(title)) {
	my $v = $item->$meth;
	next unless defined $v;
	printf $fmt, $meth, $v;
    }
    printf $fmt, "file", $file;
    printf $fmt, "time", scalar localtime;
}

sub download_link {
    @_ == 2 || badinvo;
    my ($url, $file) = @_;

    if ($No) {
	debug "not downloading $url due to --no";
	return 1;
    }

    if ($Download_cmd) {
	system "$Download_cmd \Q$url\E";
	if ($?) {
	    xwarn "non-zero exit ($?) from $Download_cmd for $url\n";
	    return 0;
	}
	return 1;
    }

    # XXX why not get_with_last_modified() here?
    my $s = get $url
	or return 0;
    if (ref $s) {
    	if ($s == NOT_MODIFIED) {
	    verbose "skipping $url, not modified";
	    return 1;
	}
	xdie "invalid get response $$s\n";
    }

    my $fh = exclusive_create $file;
    if (!$fh) {
	xwarn "can't exclusively create $file:";
	return 0;
    }
    if (!print $fh $s or !close $fh) {
	xwarn "error writing to $file:";
	return 0;
    }

    return 1;
}

sub do_url {
    @_ == 1 || badinvo;
    my ($rss_url) = @_;

    my $s = get_with_last_modified $rss_url
	or return;
    if (ref $s) {
    	if ($s == NOT_MODIFIED) {
	    verbose sprintf "unmod.   $rss_url";
	    return;
	}
	print "s=$s not_mod=", NOT_MODIFIED, "\n";
	xdie "invalid get response ", dstr $$s, "\n";
    }

    if ($Debug_save) {
    	my ($file, $fh) = safe_tmp
			    dir		=> ".",
			    prefix	=> "$Debug_save.",
			    postfix	=> ".rss";
	info "saving $rss_url to $file";
	if (!print $fh $s or !close $fh) {
	    xdie "error writing to $file:";
	}
    }

    my $rai = eval { XML::RAI->parse($s) };
    if ($@) {
	chomp $@;
	$@ =~ s/^\n+//; # bogus
	xwarn "failure parsing data from $rss_url: $@\n";
	return;
    }
    elsif (!defined $rai) {
	# XXX error reporting
	xwarn "error parsing data from $rss_url\n";
	return;
    }

    my @item = @{ $rai->items };
    my $item_count = @item;
    verbose sprintf "%2d item%s %s",
	$item_count,
	$item_count == 1 ? ' ' : 's',
	$rss_url;

    my $shown_any = 0;
    for my $item (@item) {
    	my @url;

	if (my @enc = XML::RAI::Enclosure->load($item)) {
	    @url = grep { defined } map { $_->url } @enc;
	    if (!@url) {
		debug "no urls in enclosures in ", $item->title;
		next;
	    }
	}
	elsif (defined(my $url = $item->link)) {
	    @url = ($url);
	}
	else {
	    debug "neither enclosures nor link in ", $item->title;
	    next;
	}
	debug "url list: @url"
	    if $Debug > 1;

	for my $link_url (@url) {
	    if (!defined $link_url) {
		# can't happen
		debug "no link url for ", $item->title;
		next;
	    }
	    my $file = link_to_file $link_url;
	    debug "test $file";

	    my $match = 0;
	    for my $re (@Re) {
		if ($file =~ /$re/) {
		    $match = 1;
		    last;
		}
	    }
	    if (!$match) {
		debug "no match";
		next;
	    }

	    if (seen_link $rss_url, $link_url) {
		debug "already saw $link_url";
		next;
	    }

	    $shown_any = 1;
	    show_item $item, $file;
	    if (download_link $link_url, $file) {
		seen_link $rss_url, $link_url, 1;
	    }
	    else {
		# Forget the last-modified time for this feed so I'll
		# process it again so I'll re-try this fetch.
		url_last_modified $rss_url, undef;
	    }
	}
    }

    print "\n" if $shown_any && $Verbose;
}

sub main {
    init;

    my $start_t = time;
    while (1) {
	do_url $_ for @Url;

	if (defined $Exit_after_seconds) {
	    last if time - $start_t >= $Exit_after_seconds;
	}

	debug "sleep $Sleep";
	sleep $Sleep;
    }

    return 0;
}

$Exit = main || $Exit;
$Exit = 1 if $Exit && !($Exit % 256);
exit $Exit;

__END__

=head1 NAME

rss-download - monitor an RSS feed and download links based on patterns

=head1 SYNOPSIS

XXX incomplete
B<prog> [I<switch>]...

=head1 DESCRIPTION

XXX incomplete
B<prog> does foo

=head1 OPTIONS

=over 4

=item B<--debug>

Turn debugging on.

=item B<--help>

Show the usage message and die.

=item B<--version>

Show the version number and exit.

=back

=head1 AVAILABILITY

The code is licensed under the GNU GPL.  Check
http://www.argon.org/~roderick/ for updated versions.

=head1 AUTHOR

Roderick Schertler <roderick@argon.org>

=cut
