#!/usr/bin/perl -w use strict; # $Id: rss-download,v 1.12 2006-10-27 18:46:11-04 roderick Exp roderick $ # XXX # - new name, too many hits for rss-download on Google # - document # - per-feed refetch intervals, with default from # - for --download-command, put info into environment variables (eg, rss # url, item url, file name, title) # - improve interface # - different storage directories # - episode history # - way to select with Perl code (--eval) # - way to configure with Perl code from a file? # - REs to match against title, description, link, file, any # - way to restrict REs to some URLs # - different ways to determine uniqueness # - periodic cleanup of %persist db # - config file? use sigtrap qw(die untrapped normal-signals); use Fcntl qw(/^O_/); use File::Basename qw(basename); use LWP::UserAgent (); use RS::Handy qw($Me badinvo chompexpr_fileline create_index_subs data_dump dstr exclusive_create getopt iso_dt safe_tmp url_decode xdie); use URI (); use XML::RAI (); use XML::RAI::Enclosure (); my $Cookie_file = undef; my $Download_cmd = undef; my $Exit_after_seconds = undef; my $Sleep = 60 * 60; my $Persist = undef; my $Persist_module = 'AnyDBM_File'; my $Verbose = 0; my $Debug = undef; my $Debug_save = undef; my $Exit = 0; my $Init_done = 0; my $No = 0; my @Re = (); my @Url = (); my %Url = (); my $Version = q$Revision: 1.12 $ =~ /(\d\S+)/ ? $1 : '?'; my @Option_spec = ( 'cookies=s' => \$Cookie_file, 'debug:i' => \$Debug, 'debug-save=s' => \$Debug_save, 'download-cmd=s' => \$Download_cmd, 'exit-after=i' => \$Exit_after_seconds, 'help' => sub { usage() }, 'no|n' => \$No, 'once' => sub { $Exit_after_seconds = -1 }, 'persist=s' => \$Persist, 'persist-module=s' => \$Persist_module, 're=s' => \@Re, 'url=s' => \@Url, 'sleep=i' => \$Sleep, 'verbose|v' => \$Verbose, 'version' => sub { print "$Me version $Version\n"; exit }, ); my $Usage = <new or xdie "can't initialize LWP user agent\n"; $ua->env_proxy; $ua->agent("$Me/$Version"); if (defined $Cookie_file) { $ua->cookie_jar({ file => $Cookie_file }); $ua->cookie_jar or xdie "can't initialize cookie jar\n"; debug "cookie jar: ", chompexpr data_dump $ua->cookie_jar if $Debug > 2; } } sub get_backend { @_ == 2 || badinvo; my ($use_last_modified, $url) = @_; _init_ua; my $req = HTTP::Request->new(GET => $url); if (my $jar = $ua->cookie_jar) { $jar->add_cookie_header($req); } if ($use_last_modified && defined(my $d = url_last_modified $url)) { $req->header('If-Modified-Since' => $d); } debug "request: ", $req->as_string if $Debug > 2; debug "request object: ", chompexpr data_dump $req if $Debug > 3; # XXX I'm not doing any redirects because unauthorized responses # redirect instead of saying unauthorized, and I don't want to loop # on those because the web site ops are touchy. # # This is not a good solution because some redirects are valid. # This needs touching up. Perhaps test if the any redirect you get # puts you back to a page you already got? my $r = $ua->simple_request($req); debug "response: ", $r->as_string if $Debug > 2; debug "response object: ", data_dump $r if $Debug > 3; if ($r->code == 304) { return NOT_MODIFIED; } if (!$r->is_success) { xwarn "error downloading $url: ", $r->status_line, "\n"; return; } if ($use_last_modified) { url_last_modified $url, $r->header('Last-Modified'); } return $r->content; } } sub get { return get_backend 0, @_; } sub get_with_last_modified { return get_backend 1, @_; } sub link_to_file { @_ == 1 || badinvo; my ($link_url) = @_; my $lu = URI->new($link_url); if (!$lu) { xwarn "invalid link url $link_url\n"; return; } return basename url_decode $lu; } { my %persist; my $do_sync = 0; my $db_version = 1; my $db_version_key = ':rss_download_persist_version'; sub db_init { return unless $Persist; @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File ODBM_File); require MLDBM; $MLDBM::UseDB = # squelch warning $MLDBM::UseDB = $Persist_module; if (!tie %persist, 'MLDBM', $Persist, O_RDWR, 0666) { tie %persist, 'MLDBM', $Persist, O_RDWR | O_CREAT, 0666 or xdie "can't create --persist $Persist with $Persist_module: $!"; $persist{$db_version_key} = $db_version; } if (tied(%persist)->UseDB->can('sync')) { debug "db can sync"; $do_sync = 1; } my $v = $persist{$db_version_key}; if (!defined $v) { xdie "--persist $Persist isn't in my format\n"; } elsif ($v != $db_version) { xdie "--persist $Persist doesn't use my version", " ($v != $db_version)\n"; } } sub db_put { @_ >= 2 || badinvo; my ($k, @v) = @_; debug "db_put ", dstr $k, " => ", @v > 1 ? chompexpr data_dump @v : dstr $v[0]; return if $No; $persist{$k} = [time, @v]; tied(%persist)->UseDB->sync if $do_sync; } sub db_get { @_ == 1 || badinvo; my ($k) = @_; my $r = $persist{$k}; debug "db_get ", dstr $k, " => ", chompexpr data_dump $r; if (!$r) { return; } my @v = @$r; shift @v; # time return wantarray ? @v : @v <= 1 ? $v[0] : xdie 0+@v, " elements to return for ", dstr $k; } sub db_delete { @_ == 1 || badinvo; my ($k) = @_; debug "db_delete ", dstr $k; delete $persist{$k}; } END { # XXX way to check for errors untie %persist; } } sub db_key { @_ >= 2 || badinvo; my ($tag, @rest) = @_; return join "\x00", $tag, @rest; } sub seen_link { @_ == 2 || @_ == 3 || badinvo; my $have_new = @_ == 3; my ($rss_url, $link_url, $new) = @_; my $k = link_to_file $link_url; my $old = defined db_get $k; if ($have_new) { if ($new) { db_put $k, $link_url; } else { db_delete $k; } } return $old; } sub url_last_modified { @_ == 1 || @_ == 2 || badinvo; my $have_new = @_ == 2; my ($url, $new) = @_; my $k = db_key 'url-last-modified', $url; my $old = db_get $k; if ($have_new) { if (defined $new) { db_put $k, $new; } else { db_delete $k; } } return $old; } sub show_item { @_ == 2 || badinvo; my ($item, $file) = @_; my $fmt = "%-10s %s\n"; print "\n"; for my $meth (qw(title)) { my $v = $item->$meth; next unless defined $v; printf $fmt, $meth, $v; } printf $fmt, "file", $file; printf $fmt, "time", scalar localtime; } sub download_link { @_ == 2 || badinvo; my ($url, $file) = @_; if ($No) { debug "not downloading $url due to --no"; return 1; } if ($Download_cmd) { system "$Download_cmd \Q$url\E"; if ($?) { xwarn "non-zero exit ($?) from $Download_cmd for $url\n"; return 0; } return 1; } # XXX why not get_with_last_modified() here? my $s = get $url or return 0; if (ref $s) { if ($s == NOT_MODIFIED) { verbose "skipping $url, not modified"; return 1; } xdie "invalid get response $$s\n"; } my $fh = exclusive_create $file; if (!$fh) { xwarn "can't exclusively create $file:"; return 0; } if (!print $fh $s or !close $fh) { xwarn "error writing to $file:"; return 0; } return 1; } sub do_url { @_ == 1 || badinvo; my ($rss_url) = @_; my $s = get_with_last_modified $rss_url or return; if (ref $s) { if ($s == NOT_MODIFIED) { verbose sprintf "unmod. $rss_url"; return; } print "s=$s not_mod=", NOT_MODIFIED, "\n"; xdie "invalid get response ", dstr $$s, "\n"; } if ($Debug_save) { my ($file, $fh) = safe_tmp dir => ".", prefix => "$Debug_save.", postfix => ".rss"; info "saving $rss_url to $file"; if (!print $fh $s or !close $fh) { xdie "error writing to $file:"; } } my $rai = eval { XML::RAI->parse($s) }; if ($@) { chomp $@; $@ =~ s/^\n+//; # bogus xwarn "failure parsing data from $rss_url: $@\n"; return; } elsif (!defined $rai) { # XXX error reporting xwarn "error parsing data from $rss_url\n"; return; } my @item = @{ $rai->items }; my $item_count = @item; verbose sprintf "%2d item%s %s", $item_count, $item_count == 1 ? ' ' : 's', $rss_url; my $shown_any = 0; for my $item (@item) { my @url; if (my @enc = XML::RAI::Enclosure->load($item)) { @url = grep { defined } map { $_->url } @enc; if (!@url) { debug "no urls in enclosures in ", $item->title; next; } } elsif (defined(my $url = $item->link)) { @url = ($url); } else { debug "neither enclosures nor link in ", $item->title; next; } debug "url list: @url" if $Debug > 1; for my $link_url (@url) { if (!defined $link_url) { # can't happen debug "no link url for ", $item->title; next; } my $file = link_to_file $link_url; debug "test $file"; my $match = 0; for my $re (@Re) { if ($file =~ /$re/) { $match = 1; last; } } if (!$match) { debug "no match"; next; } if (seen_link $rss_url, $link_url) { debug "already saw $link_url"; next; } $shown_any = 1; show_item $item, $file; if (download_link $link_url, $file) { seen_link $rss_url, $link_url, 1; } else { # Forget the last-modified time for this feed so I'll # process it again so I'll re-try this fetch. url_last_modified $rss_url, undef; } } } print "\n" if $shown_any && $Verbose; } sub main { init; my $start_t = time; while (1) { do_url $_ for @Url; if (defined $Exit_after_seconds) { last if time - $start_t >= $Exit_after_seconds; } debug "sleep $Sleep"; sleep $Sleep; } return 0; } $Exit = main || $Exit; $Exit = 1 if $Exit && !($Exit % 256); exit $Exit; __END__ =head1 NAME rss-download - monitor an RSS feed and download links based on patterns =head1 SYNOPSIS XXX incomplete B [I]... =head1 DESCRIPTION XXX incomplete B does foo =head1 OPTIONS =over 4 =item B<--debug> Turn debugging on. =item B<--help> Show the usage message and die. =item B<--version> Show the version number and exit. =back =head1 AVAILABILITY The code is licensed under the GNU GPL. Check http://www.argon.org/~roderick/ for updated versions. =head1 AUTHOR Roderick Schertler =cut