head	1.12;
access;
symbols;
locks
	roderick:1.12; strict;
comment	@# @;


1.12
date	2006.10.27.22.46.11;	author roderick;	state Exp;
branches;
next	1.11;

1.11
date	2006.10.22.18.05.28;	author roderick;	state Exp;
branches;
next	1.10;

1.10
date	2006.10.21.17.52.30;	author roderick;	state Exp;
branches;
next	1.9;

1.9
date	2005.12.20.14.22.53;	author roderick;	state Exp;
branches;
next	1.8;

1.8
date	2005.12.09.14.46.03;	author roderick;	state Exp;
branches;
next	1.7;

1.7
date	2005.11.16.19.19.50;	author roderick;	state Exp;
branches;
next	1.6;

1.6
date	2005.06.01.14.18.31;	author roderick;	state Exp;
branches;
next	1.5;

1.5
date	2005.04.07.15.57.45;	author roderick;	state Exp;
branches;
next	1.4;

1.4
date	2005.02.09.15.19.08;	author roderick;	state Exp;
branches;
next	1.3;

1.3
date	2005.01.03.12.19.26;	author roderick;	state Exp;
branches;
next	1.2;

1.2
date	2004.12.02.19.06.02;	author roderick;	state Exp;
branches;
next	1.1;

1.1
date	2004.11.10.12.18.01;	author roderick;	state Exp;
branches;
next	;


desc
@#!/usr/bin/perl -w
@


1.12
log
@Give nice error message for bad --re.

Oops, db_get() was shifting the time out of the underlying array.

Add untie().
@
text
@#!/usr/bin/perl -w
use strict;

# $Id: rss-download,v 1.11 2006-10-22 14:05:28-04 roderick Exp $

# XXX
# - new name, too many hits for rss-download on Google
# - document
# - per-feed refetch intervals, with default from <ttl>
# - for --download-command, put info into environment variables (eg, rss
#   url, item url, file name, title)
# - improve interface
#     - episode history
#     - way to select with Perl code (--eval)
#     - way to configure with Perl code from a file?
#     - REs to match against title, description, link, file, any
#     - way to restrict REs to some URLs
#     - different ways to determine uniqueness
# - periodic cleanup of %persist db
# - config file?

use sigtrap qw(die untrapped normal-signals);

use Fcntl		qw(/^O_/);
use File::Basename	qw(basename);
use LWP::UserAgent	  ();
use RS::Handy		qw($Me
			    badinvo chompexpr_fileline create_index_subs
			    data_dump dstr exclusive_create getopt iso_dt
			    safe_tmp url_decode xdie);
use URI			  ();
use XML::RAI		  ();
use XML::RAI::Enclosure	  ();

my $Cookie_file		= undef;
my $Download_cmd	= undef;
my $Exit_after_seconds	= undef;
my $Sleep		= 60 * 60;
my $Persist		= undef;
my $Persist_module	= 'AnyDBM_File';
my $Verbose		= 0;

my $Debug	= undef;
my $Debug_save	= undef;
my $Exit	= 0;
my $Init_done	= 0;
my $No		= 0;
my @@Re		= ();
my @@Url		= ();
my %Url		= ();
my $Version	= q$Revision: 1.11 $ =~ /(\d\S+)/ ? $1 : '?';

my @@Option_spec = (
    'cookies=s'		=> \$Cookie_file,
    'debug:i'		=> \$Debug,
    'debug-save=s'	=> \$Debug_save,
    'download-cmd=s'	=> \$Download_cmd,
    'exit-after=i'	=> \$Exit_after_seconds,
    'help'		=> sub { usage() },
    'no|n'		=> \$No,
    'once'		=> sub { $Exit_after_seconds = -1 },
    'persist=s'		=> \$Persist,
    'persist-module=s'	=> \$Persist_module,
    're=s'		=> \@@Re,
    'url=s'		=> \@@Url,
    'sleep=i'		=> \$Sleep,
    'verbose|v'		=> \$Verbose,
    'version'		=> sub { print "$Me version $Version\n"; exit },
);

my $Usage = <<EOF;
usage: $Me [switch]...
required switches:
    --url u		RSS URL (multiples ok)
    --re r		regex matching items to download (multiples ok)
switches:
    --cookies f		store cookies in file f
    --debug		turn debugging on
    --debug-save f	save downloaded RSS data to f.*
    --download-cmd c	use c to download links rather than doing it myself
    --exit-after s	exit some time after s seconds (I use this because
    	    	    	there's a memory leak I haven't spotted)
    --help		show this and then die
    --once		just download one time and exit, don't sleep and loop
    --persist p		save info about previously downloaded links in p
    --persist-module m	use m for storing --persist database ($Persist_module)
    --sleep s		seconds to sleep between fetches ($Sleep)
    --version		show the version ($Version) and exit
Use \`perldoc $Me\' to see the full documentation.
EOF

use subs qw(url_last_modified);

sub xwarn {
    unshift @@_, iso_dt, " " if $Init_done;
    RS::Handy::xwarn_caller 1, @@_;
    $Exit ||= 1;
}

sub chompexpr {
    @@_ == 1 || badinvo;
    local $_ = shift;
    chomp;
    return $_;
}

sub debug_nonl {
    print STDERR "$Me: debug: ", @@_ if $Debug;
}

sub debug {
    debug_nonl @@_, "\n" if $Debug;
}

sub info {
    print iso_dt, " ", @@_, "\n"
}

sub verbose {
    info @@_
	if $Verbose;
}

sub usage {
    xwarn @@_ if @@_;
    # Use exit() rather than die(), as Getopt::Long does eval().
    print STDERR $Usage;
    exit 1;
}

sub init {
    $| = 1;

    getopt -bundle, @@Option_spec or usage if @@ARGV;

    # Getopt::Long's optional numeric args default to zero if specified
    # without a number, so bump that by one.

    if (!defined $Debug) {
	$Debug = 0;
    }
    elsif ($Debug == 0) {
	$Debug = 1;
    }
    $Verbose = 1 if $Debug;

    @@Url
    	or usage "no --url switches given\n";
    for (@@Url) {
    	if (exists $Url{$_}) {
    	    xdie "--url $_ specified multiple times\n";
    	}
	$Url{$_} = [];
    }

    @@Re
    	or usage "no --re switches given\n";
    for (@@Re) {
    	my $compiled = eval { qr/$_/i };
	if ($@@) {
	    $@@ = chompexpr_fileline $@@;
	    xwarn "invalid --re ", dstr $_, ": $@@\n";
	}
	$_ = $compiled;
    }
    if ($Exit != 0) {
	xdie "exiting due to errors\n"
    }

    @@ARGV and usage "unknown non-switch args: @@ARGV\n";

    if ($Debug) {
	require LWP::Debug;
	LWP::Debug::level('+');
    }

    db_init();

    $Init_done = 1;
}

sub NOT_MODIFIED () { \"not modified" }

{

my $ua;
sub _init_ua {
    return if $ua;

    $ua = LWP::UserAgent->new
	or xdie "can't initialize LWP user agent\n";
    $ua->env_proxy;
    $ua->agent("$Me/$Version");
    if (defined $Cookie_file) {
	$ua->cookie_jar({ file => $Cookie_file });
	$ua->cookie_jar
	    or xdie "can't initialize cookie jar\n";
	debug "cookie jar: ", chompexpr data_dump $ua->cookie_jar
	    if $Debug > 2;
    }
}

sub get_backend {
    @@_ == 2 || badinvo;
    my ($use_last_modified, $url) = @@_;

    _init_ua;

    my $req = HTTP::Request->new(GET => $url);
    if (my $jar = $ua->cookie_jar) {
	$jar->add_cookie_header($req);
    }
    if ($use_last_modified && defined(my $d = url_last_modified $url)) {
	$req->header('If-Modified-Since' => $d);
    }
    debug "request: ", $req->as_string
	if $Debug > 2;
    debug "request object: ", chompexpr data_dump $req
	if $Debug > 3;

    # XXX I'm not doing any redirects because unauthorized responses
    # redirect instead of saying unauthorized, and I don't want to loop
    # on those because the web site ops are touchy.
    #
    # This is not a good solution because some redirects are valid.
    # This needs touching up.  Perhaps test if the any redirect you get
    # puts you back to a page you already got?

    my $r = $ua->simple_request($req);
    debug "response: ", $r->as_string
	if $Debug > 2;
    debug "response object: ", data_dump $r
	if $Debug > 3;
    if ($r->code == 304) {
    	return NOT_MODIFIED;
    }
    if (!$r->is_success) {
    	xwarn "error downloading $url: ", $r->status_line, "\n";
	return;
    }

    if ($use_last_modified) {
	url_last_modified $url, $r->header('Last-Modified');
    }
    return $r->content;
}

}

sub get {
    return get_backend 0, @@_;
}

sub get_with_last_modified {
    return get_backend 1, @@_;
}

sub link_to_file {
    @@_ == 1 || badinvo;
    my ($link_url) = @@_;

    my $lu = URI->new($link_url);
    if (!$lu) {
	xwarn "invalid link url $link_url\n";
	return;
    }

    return basename url_decode $lu;
}

{

my %persist;
my $do_sync = 0;
my $db_version = 1;
my $db_version_key = ':rss_download_persist_version';

sub db_init {
    return unless $Persist;

    @@AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File ODBM_File);

    require MLDBM;
    $MLDBM::UseDB = # squelch warning
    $MLDBM::UseDB = $Persist_module;

    if (!tie %persist, 'MLDBM', $Persist, O_RDWR, 0666) {
	tie %persist, 'MLDBM', $Persist, O_RDWR | O_CREAT, 0666
	    or xdie "can't create --persist $Persist with $Persist_module: $!";
	$persist{$db_version_key} = $db_version;
    }

    if (tied(%persist)->UseDB->can('sync')) {
    	debug "db can sync";
    	$do_sync = 1;
    }

    my $v = $persist{$db_version_key};
    if (!defined $v) {
	xdie "--persist $Persist isn't in my format\n";
    }
    elsif ($v != $db_version) {
	xdie "--persist $Persist doesn't use my version",
		" ($v != $db_version)\n";
    }
}

sub db_put {
    @@_ >= 2 || badinvo;
    my ($k, @@v) = @@_;

    debug "db_put ", dstr $k, " => ",
	@@v > 1 ? chompexpr data_dump @@v : dstr $v[0];
    return if $No;
    $persist{$k} = [time, @@v];
    tied(%persist)->UseDB->sync if $do_sync;
}

sub db_get {
    @@_ == 1 || badinvo;
    my ($k) = @@_;

    my $r = $persist{$k};
    debug "db_get ", dstr $k, " => ", chompexpr data_dump $r;
    if (!$r) {
	return;
    }
    my @@v = @@$r;
    shift @@v; # time
    return wantarray
	    ? @@v
	    : @@v <= 1
		? $v[0]
		: xdie 0+@@v, " elements to return for ", dstr $k;
}

sub db_delete {
    @@_ == 1 || badinvo;
    my ($k) = @@_;

    debug "db_delete ", dstr $k;
    delete $persist{$k};
}

END {
    # XXX way to check for errors
    untie %persist;
}

}

sub db_key {
    @@_ >= 2 || badinvo;
    my ($tag, @@rest) = @@_;
    return join "\x00", $tag, @@rest;
}

sub seen_link {
    @@_ == 2 || @@_ == 3 || badinvo;
    my $have_new = @@_ == 3;
    my ($rss_url, $link_url, $new) = @@_;

    my $k = link_to_file $link_url;
    my $old = defined db_get $k;
    if ($have_new) {
    	if ($new) {
	    db_put $k, $link_url;
	}
	else {
	    db_delete $k;
	}
    }
    return $old;
}

sub url_last_modified {
    @@_ == 1 || @@_ == 2 || badinvo;
    my $have_new = @@_ == 2;
    my ($url, $new) = @@_;

    my $k = db_key 'url-last-modified', $url;
    my $old = db_get $k;
    if ($have_new) {
    	if (defined $new) {
	    db_put $k, $new;
	}
	else {
	    db_delete $k;
	}
    }
    return $old;
}

sub show_item {
    @@_ == 2 || badinvo;
    my ($item, $file) = @@_;

    my $fmt = "%-10s %s\n";
    print "\n";
    for my $meth (qw(title)) {
	my $v = $item->$meth;
	next unless defined $v;
	printf $fmt, $meth, $v;
    }
    printf $fmt, "file", $file;
    printf $fmt, "time", scalar localtime;
}

sub download_link {
    @@_ == 2 || badinvo;
    my ($url, $file) = @@_;

    if ($No) {
	debug "not downloading $url due to --no";
	return 1;
    }

    if ($Download_cmd) {
	system "$Download_cmd \Q$url\E";
	if ($?) {
	    xwarn "non-zero exit ($?) from $Download_cmd for $url\n";
	    return 0;
	}
	return 1;
    }

    # XXX why not get_with_last_modified() here?
    my $s = get $url
	or return 0;
    if (ref $s) {
    	if ($s == NOT_MODIFIED) {
	    verbose "skipping $url, not modified";
	    return 1;
	}
	xdie "invalid get response $$s\n";
    }

    my $fh = exclusive_create $file;
    if (!$fh) {
	xwarn "can't exclusively create $file:";
	return 0;
    }
    if (!print $fh $s or !close $fh) {
	xwarn "error writing to $file:";
	return 0;
    }

    return 1;
}

sub do_url {
    @@_ == 1 || badinvo;
    my ($rss_url) = @@_;

    my $s = get_with_last_modified $rss_url
	or return;
    if (ref $s) {
    	if ($s == NOT_MODIFIED) {
	    verbose sprintf "unmod.   $rss_url";
	    return;
	}
	print "s=$s not_mod=", NOT_MODIFIED, "\n";
	xdie "invalid get response ", dstr $$s, "\n";
    }

    if ($Debug_save) {
    	my ($file, $fh) = safe_tmp
			    dir		=> ".",
			    prefix	=> "$Debug_save.",
			    postfix	=> ".rss";
	info "saving $rss_url to $file";
	if (!print $fh $s or !close $fh) {
	    xdie "error writing to $file:";
	}
    }

    my $rai = eval { XML::RAI->parse($s) };
    if ($@@) {
	chomp $@@;
	$@@ =~ s/^\n+//; # bogus
	xwarn "failure parsing data from $rss_url: $@@\n";
	return;
    }
    elsif (!defined $rai) {
	# XXX error reporting
	xwarn "error parsing data from $rss_url\n";
	return;
    }

    my @@item = @@{ $rai->items };
    my $item_count = @@item;
    verbose sprintf "%2d item%s %s",
	$item_count,
	$item_count == 1 ? ' ' : 's',
	$rss_url;

    my $shown_any = 0;
    for my $item (@@item) {
    	my @@url;

	if (my @@enc = XML::RAI::Enclosure->load($item)) {
	    @@url = grep { defined } map { $_->url } @@enc;
	    if (!@@url) {
		debug "no urls in enclosures in ", $item->title;
		next;
	    }
	}
	elsif (defined(my $url = $item->link)) {
	    @@url = ($url);
	}
	else {
	    debug "neither enclosures nor link in ", $item->title;
	    next;
	}
	debug "url list: @@url"
	    if $Debug > 1;

	for my $link_url (@@url) {
	    if (!defined $link_url) {
		# can't happen
		debug "no link url for ", $item->title;
		next;
	    }
	    my $file = link_to_file $link_url;
	    debug "test $file";

	    my $match = 0;
	    for my $re (@@Re) {
		if ($file =~ /$re/) {
		    $match = 1;
		    last;
		}
	    }
	    if (!$match) {
		debug "no match";
		next;
	    }

	    if (seen_link $rss_url, $link_url) {
		debug "already saw $link_url";
		next;
	    }

	    $shown_any = 1;
	    show_item $item, $file;
	    if (download_link $link_url, $file) {
		seen_link $rss_url, $link_url, 1;
	    }
	    else {
		# Forget the last-modified time for this feed so I'll
		# process it again so I'll re-try this fetch.
		url_last_modified $rss_url, undef;
	    }
	}
    }

    print "\n" if $shown_any && $Verbose;
}

sub main {
    init;

    my $start_t = time;
    while (1) {
	do_url $_ for @@Url;

	if (defined $Exit_after_seconds) {
	    last if time - $start_t >= $Exit_after_seconds;
	}

	debug "sleep $Sleep";
	sleep $Sleep;
    }

    return 0;
}

$Exit = main || $Exit;
$Exit = 1 if $Exit && !($Exit % 256);
exit $Exit;

__END__

=head1 NAME

rss-download - monitor an RSS feed and download links based on patterns

=head1 SYNOPSIS

XXX incomplete
B<prog> [I<switch>]...

=head1 DESCRIPTION

XXX incomplete
B<prog> does foo

=head1 OPTIONS

=over 4

=item B<--debug>

Turn debugging on.

=item B<--help>

Show the usage message and die.

=item B<--version>

Show the version number and exit.

=back

=head1 AVAILABILITY

The code is licensed under the GNU GPL.  Check
http://www.argon.org/~roderick/ for updated versions.

=head1 AUTHOR

Roderick Schertler <roderick@@argon.org>

=cut
@


1.11
log
@Overhaul the handling of the persistence DB a bit.

Store feed Last-Modified times in the DB.

Don't set an URL as seen until you successfully download it.  If you
fail to download it lose the the Last-Modified time for the feed so
you'll try again next time.
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.10 2006-10-21 13:52:30-04 roderick Exp $
d7 1
d9 11
a19 10
# - episode history
# - different fetch frequencies for different URLs
# - fetch based on expire time in RSS by default
# - for --download-command, put rss url/item url/file name/title/etc
#   into environment variables
# - REs to match against title, description, link, file, any
# - way to select with Perl code (--eval)
# - way to restrict REs to some URLs
# - different ways to determine uniqueness
# - periodic cleanup of %persist
d28 3
a30 3
			    badinvo create_index_subs data_dump dstr
			    exclusive_create getopt iso_dt safe_tmp
			    url_decode xdie);
d51 1
a51 1
my $Version	= q$Revision: 1.10 $ =~ /(\d\S+)/ ? $1 : '?';
d159 5
a163 2
	# XXX nice message for bad pattern
    	my $compiled = qr/$_/i;
d166 3
d328 2
a329 1
    shift @@$r;
d331 4
a334 4
	    ? @@$r
	    : @@$r <= 1
		? $r->[0]
		: xdie 0+@@$r, " elements to return for ", dstr $k;
d345 5
a393 3
# XXX aging for seen db
# XXX untie/flush at end?

a418 1
    	# XXX set up much info in environment vars
d427 1
@


1.10
log
@Oops, --no didn't do anything.

Add --cookies, --debug-save.

Have --debug take an optional integer.

Add If-Modified-Since handling.

Decode URL basename when creating file names.

If there are no enclosures fall back on using the <link> element.

Oops, the /i on the --re match wasn't working.
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.9 2005-12-20 09:22:53-05 roderick Exp roderick $
d14 1
a14 1
# - way to select with Perl code (-eval)
d17 1
a17 4
# - persistence: think about key used, what data is stored
# - periodic cleanup of %seen
# - if-modified-since, including caching it in the persist file
# - ditch RS::Handy?
a32 4
BEGIN {
    create_index_subs 'URL', undef, qw(last_modified);
}

d49 1
a49 1
my $Version	= q$Revision: 1.9 $ =~ /(\d\S+)/ ? $1 : '?';
d90 2
d98 11
d110 1
a110 1
    print STDERR "$Me: debug: ", @@_, "\n" if $Debug;
d169 1
a169 1
    seen_init();
d190 1
a190 1
	debug "cookie jar: ", data_dump $ua->cookie_jar
d195 3
a197 3
sub get {
    @@_ == 1 || badinvo;
    my ($url) = @@_;
d205 1
a205 1
    if (defined(my $d = $Url{$url}[URL_LAST_MODIFIED])) {
d210 2
a211 2
    debug "request: ", data_dump $req
	if $Debug > 2;
d224 2
d234 3
a236 1
    $Url{$url}[URL_LAST_MODIFIED] = $r->header('Last-Modified');
d242 8
d260 1
a260 1
    return url_decode basename $lu;
d265 1
a265 1
my %seen;
d267 2
a268 2
my $persist_version = 1;
my $persist_version_key = ':rss_download_persist_version';
d270 1
a270 1
sub seen_init {
d279 2
a280 2
    if (!tie %seen, 'MLDBM', $Persist, O_RDWR, 0666) {
	tie %seen, 'MLDBM', $Persist, O_RDWR | O_CREAT, 0666
d282 1
a282 1
	$seen{$persist_version_key} = $persist_version;
d285 1
a285 1
    if (tied(%seen)->UseDB->can('sync')) {
d290 1
a290 1
    my $v = $seen{$persist_version_key};
d294 1
a294 1
    elsif ($v != $persist_version) {
d296 23
a318 1
		" ($v != $persist_version)\n";
d320 6
d328 3
a330 3
sub seen_link {
    @@_ == 2 || badinvo;
    my ($rss_url, $link_url) = @@_;
d332 3
a334 1
    my $file = link_to_file $link_url;
d336 1
a336 2
    my $o = $seen{$file};
    return 1 if $o;
d338 5
a342 1
    return 0 if $No;
d344 17
a360 1
    # XXX don't set this unless you successfully download, or fail X times?
d362 16
a377 3
    $seen{$file} = [time, $link_url];
    tied(%seen)->UseDB->sync if $do_sync;
    return 0;
a382 2
}

d404 1
a404 1
	return;
d412 1
d414 1
a414 1
	return;
d418 1
a418 1
	or return;
d422 1
a422 1
	    return;
d430 1
a430 1
	return;
d434 1
a434 1
	return;
d436 2
d444 1
a444 1
    my $s = get $rss_url
d504 2
a505 1
	debug "url list: @@url";
d535 8
a542 1
	    download_link $link_url, $file;
@


1.9
log
@Use LWP::UserAgent rather than LWP::Simple, to get better error
reporting and to set the user agent name.
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.8 2005-12-09 09:46:03-05 roderick Exp roderick $
d8 1
d28 4
a31 2
use RS::Handy		qw($Me badinvo data_dump exclusive_create
			    getopt iso_dt xdie);
d36 5
d48 2
a49 1
my $Debug	= 0;
d55 2
a56 1
my $Version	= q$Revision: 1.8 $ =~ /(\d\S+)/ ? $1 : '?';
d59 3
a61 1
    'debug!'		=> \$Debug,
d82 1
d84 1
d107 4
d112 1
a112 1
    print iso_dt, " ", @@_, "\n"
d124 2
d128 11
d141 7
d152 1
a152 1
    	my $compiled = qr/$_/;
d168 2
d180 7
d195 26
a220 1
    my $r = $ua->get($url);
d226 1
d242 1
a242 1
    return basename $lu;
d291 2
d324 5
d340 7
d365 19
d407 14
a420 3
	my @@enc = XML::RAI::Enclosure->load($item);
	if (!@@enc) {
	    debug "no enclosures in ", $item->title;
d423 1
d425 1
a425 3
	for my $enc (@@enc) {
	    my $link_url = $enc->url;

d427 1
d436 1
a436 1
		if ($file =~ /$re/i) {
@


1.8
log
@Add --exit-after.
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.7 2005-11-16 14:19:50-05 roderick Exp roderick $
d10 2
a11 3
# - switch to set download command, example using wget -c limit 1 dir
#   convert links, put rss url/item url/file name/title/etc into
#   environment variables
d14 1
a14 1
# - way to restrict REs to some URLs?
a18 1
# - LWP for error checking
d26 1
a26 1
use LWP::Simple		qw(get);
d46 1
a46 1
my $Version	= q$Revision: 1.7 $ =~ /(\d\S+)/ ? $1 : '?';
d130 29
d258 2
a259 6
    my $s = get $url;
    if (!defined $s) {
	# XXX error reporting
	xwarn "error downloading $url\n";
	return;
    }
d276 2
a277 6
    my $s = get $rss_url;
    if (!defined $s) {
    	# XXX switch to LWP::UserAgent for error reporting
	xwarn "failure fetching $rss_url\n";
	return;
    }
@


1.7
log
@Handle enclosures.
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.6 2005-06-01 10:18:31-04 roderick Exp roderick $
d35 6
a40 6
my $Download_cmd = undef;
my $Once	= 0;
my $Sleep	= 60 * 60;
my $Persist	= undef;
my $Persist_module = 'AnyDBM_File';
my $Verbose	= 0;
d48 1
a48 1
my $Version	= q$Revision: 1.6 $ =~ /(\d\S+)/ ? $1 : '?';
d53 1
d56 1
a56 1
    'once'		=> \$Once,
d74 2
d327 1
d330 5
a334 1
	last if $Once;
@


1.6
log
@->sync() the persistent DB if possible.
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.5 2005-04-07 11:57:45-04 roderick Exp roderick $
d33 1
d48 1
a48 1
my $Version	= q$Revision: 1.5 $ =~ /(\d\S+)/ ? $1 : '?';
d206 1
a206 1
    for my $meth  (qw(title)) {
d278 4
a281 4
    foreach my $item (@@item) {
	my $link_url = $item->link;
	if (!defined $link_url) {
	    debug "no link url for ", $item->title;
a283 2
	my $file = link_to_file $link_url;
	debug "test $file";
d285 25
a309 5
	my $match = 0;
	for my $re (@@Re) {
	    if ($file =~ /$re/i) {
		$match = 1;
		last;
a310 5
	}
	if (!$match) {
	    debug "no match";
	    next;
	}
d312 3
a314 3
    	if (seen_link $rss_url, $link_url) {
	    debug "already saw $link_url";
	    next;
a315 4

	$shown_any = 1;
	show_item $item, $file;
	download_link $link_url, $file;
@


1.5
log
@Squelch warning for $MLDBM::UseDB.
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.4 2005-02-09 10:19:08-05 roderick Exp roderick $
d47 1
a47 1
my $Version	= q$Revision: 1.4 $ =~ /(\d\S+)/ ? $1 : '?';
d144 1
d163 5
d187 2
d190 1
@


1.4
log
@Add --download-cmd, --persist, --persist-module, --no
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.3 2005-01-03 07:19:26-05 roderick Exp roderick $
d9 1
d14 1
d47 1
a47 1
my $Version	= q$Revision: 1.3 $ =~ /(\d\S+)/ ? $1 : '?';
d75 1
a75 1
    --persist-module m	use m for storiing --persist database ($Persist_module)
d153 1
@


1.3
log
@Add --verbose.

Run XML::RAI in eval.
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.2 2004-12-02 14:06:02-05 roderick Exp roderick $
d8 1
d13 1
d15 1
a15 1
# - persistence via AnyDBM_File, or module given with switch
d22 3
d32 1
d35 2
d42 1
d45 1
a45 1
my $Version	= q$Revision: 1.2 $ =~ /(\d\S+)/ ? $1 : '?';
d48 12
a59 8
    'debug!'	=> \$Debug,
    'help'	=> sub { usage() },
    'once'	=> \$Once,
    're=s'	=> \@@Re,
    'url=s'	=> \@@Url,
    'sleep=i'	=> \$Sleep,
    'verbose|v'	=> \$Verbose,
    'version'	=> sub { print "$Me version $Version\n"; exit },
d65 2
a66 2
    --url u	RSS URL (multiples ok)
    --re r	regex matching items to download (multiples ok)
d68 8
a75 5
    --debug	turn debugging on
    --help	show this and then die
    --once	just download one time and exit, don't sleep and loop
    --sleep s	seconds to sleep between fetches ($Sleep)
    --version	show the version ($Version) and exit
d121 2
d139 30
a168 1
{ my %seen; # XXX persistence
a173 2
    return $seen{$file}++;
} }
d175 11
d202 31
d293 1
a293 16
	my $s = get $link_url;
	if (!defined $s) {
	    # XXX error reporting
    	    xwarn "error downloading $link_url\n";
	    next;
	}

	my $fh = exclusive_create $file;
	if (!$fh) {
	    xwarn "can't exclusively create $file:";
	    next;
	}
	if (!print $fh $s or !close $fh) {
	    xwarn "error writing to $file:";
	    next;
	}
@


1.2
log
@Add timestamp to warnings which happen after initialization.

Put the newline before a show, not after, so errors about it appear next
to it.
@
text
@d4 1
a4 1
# $Id: rss-download,v 1.1 2004-11-10 07:18:01-05 roderick Exp roderick $
d8 3
d29 1
d36 1
a36 1
my $Version	= q$Revision: 1.1 $ =~ /(\d\S+)/ ? $1 : '?';
d45 1
d73 5
d157 8
a164 2
    my $rai = XML::RAI->parse($s);
    if (!defined $rai) {
d170 9
a178 1
    foreach my $item ( @@{$rai->items} ) {
d204 1
d223 2
@


1.1
log
@entered into RCS
@
text
@d4 1
a4 1
# $Id$
d19 2
a20 1
use RS::Handy		qw($Me badinvo data_dump exclusive_create getopt xdie);
d29 1
d32 1
a32 1
my $Version	= q$Revision$ =~ /(\d\S+)/ ? $1 : '?';
d59 1
d83 1
a83 1
	# XXX nice message for bat pattern
d88 2
d94 2
d126 1
a133 1
    print "\n";
a201 1
    @@ARGV and usage "unknown non-switch args: @@ARGV\n";
d221 1
a221 1
prog - prog
d225 1
d230 1
@
