#!/usr/bin/perl -w
use strict;

# $Id: randline,v 1.7 2005-03-25 10:22:19-05 roderick Exp $
#
# Roderick Schertler

# Copyright (C) 2001 Roderick Schertler
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# For a copy of the GNU General Public License write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

use Getopt::Long ();
use RS::Handy qw($Me dstr getopt xdie xsrand);

# Process exit status; main()'s return value is folded into it at the end.
my $Exit           = 0;
# Number of lines to pick (-n / --pick).
my $N              = 1;
# Tri-state: undef = decide from the number of input files, 0 = never
# print the file name, 1 = always print it.
my $Force_filename = undef;
# Percentage of lines to pick (-p / --percent); undef selects --pick mode.
my $Percent        = undef;
# Explicit random number seed (--seed), undef if none was given.
my $Seed           = undef;
# Number of trials for the --test distribution check (0 = normal run).
my $Test           = 0;
my $Version        = q$Revision: 1.7 $ =~ /(\d\S+)/ ? $1 : '?';

my @Option_spec = (
    'h|no-filename'               => sub { $Force_filename = 0 },
    'H|with-filename'             => sub { $Force_filename = 1 },
    'help!'                       => sub { usage() },
    'input-record-separator|irs=s' => \$/,
    'percent|p=f'                 => \$Percent,
    'n|pick=i'                    => \$N,
    'seed=i'                      => \$Seed,
    'test=i'                      => \$Test,
    'version'                     => sub { print "$Me version $Version\n"; exit },
);

# NOTE(review): the original usage text, usage(), init() and the first
# lines of run_one() were lost from the copy of this file that was
# reviewed.  What follows down to run_one()'s loop body is a
# reconstruction based on @Option_spec and the POD below -- compare it
# against the upstream source if that is available.

my $Usage = <<EOF;
usage: ${Me} [switch]... [file]...
switches:
    -h, --no-filename       never prefix output lines with the file name
    -H, --with-filename     always prefix output lines with the file name
        --help              show this message and die
        --irs s             use s as the input record separator
                            (also --input-record-separator)
    -p, --percent n         pick n percent of the input lines
    -n, --pick n            pick n lines rather than 1
        --seed n            seed the random number generator with n
        --version           show the version ($Version) and exit
Use `perldoc ${Me}' to see the full documentation.
EOF

# Print an optional complaint followed by the usage message, then die.
sub usage {
    print STDERR "$Me: ", @_ if @_;
    die $Usage;
}

# Parse the command line into the option globals and seed the random
# number generator when --seed was given.
#
# NOTE(review): implemented with core Getopt::Long and srand because the
# signatures of RS::Handy's getopt and xsrand are not visible here; the
# original presumably called those (they are still imported above).
sub init {
    # Bundling lets -h and -H be distinct single-letter switches.
    Getopt::Long::Configure(qw(bundling no_ignore_case));
    Getopt::Long::GetOptions(@Option_spec) or usage();
    srand $Seed if defined $Seed;
    return;
}

# Pick one line uniformly at random from the <> input stream in a single
# pass.  Returns a reference to an array holding zero or one
# [file name, line] pairs (zero only when the input is empty).
sub run_one {
    my @hit;

    while (<>) {
        # Line number $. replaces the current choice with probability
        # 1/$., which leaves every line equally likely at end of input.
        $hit[0] = [$ARGV, $_] if rand($.) < 1;
    }
    return \@hit;
}

# Thanks to mjd@plover.com for the generalization to the $N > 1 case.
#
# Pick $N lines at random from the <> input stream in a single pass.
# Returns a reference to an array of [file name, line] pairs; it holds
# fewer than $N pairs when the input has fewer than $N lines.
sub run_multi {
    my @hit;

    # Start by taking the first $N lines unconditionally.
    while (<>) {
        push @hit, [$ARGV, $_];
        last if @hit == $N;
    }

    # be sure to notice EOF, else the following <> will read stdin
    return \@hit if @hit < $N;

    while (<>) {
        # Keep line $. with probability $N/$., evicting a random
        # current pick to make room for it.
        if (rand() < $N / $.) {
            splice @hit, rand @hit, 1;
            push @hit, [$ARGV, $_];
        }
    }
    return \@hit;
}

# Debugging aid for --test: run the picker $Test times over the same
# input files and print how often each distinct line was chosen, least
# frequent first.  Returns 0.
sub test {
    my (%hit);

    if (defined $Percent) {
        xdie "--test not implemented for --percent\n";
    }

    # <> consumes @ARGV, so every trial needs its own copy.
    my @orig_argv = @ARGV;
    for my $trial (1..$Test) {
        @ARGV = @orig_argv;
        for (@{ $N > 1 ? run_multi() : run_one() }) {
            $hit{$_->[1]}++;
        }
    }

    for (sort { $hit{$a} <=> $hit{$b} or $a cmp $b } keys %hit) {
        printf "%6d %s", $hit{$_}, $_;
    }

    return 0;
}

# Program entry point: parse options, then run percent mode, test mode
# or pick mode.  Returns the exit status (0 on success); invalid option
# values are reported through xdie.
sub main {
    init();

    my $print_filename = defined $Force_filename
        ? $Force_filename
        : @ARGV > 1;

    # This has to be rejected here: the percent branch below returns
    # before test() -- which carries the same check -- could be reached.
    if ($Test && defined $Percent) {
        xdie "--test not implemented for --percent\n";
    }

    if (defined $Percent) {
        if ($Percent < 0 || $Percent > 100) {
            xdie "invalid --percent `$Percent'\n"
        }
        return 0 if !$Percent;
        $Percent /= 100;
        while (<>) {
            if (rand() < $Percent) {
                print "$ARGV:" if $print_filename;
                print $_;
            }
        }
        return 0;
    }

    if ($N < 0) {
        xdie "invalid line count `$N'\n";
    }
    elsif ($N == 0) {
        return 0;
    }

    if ($Test) {
        return test();
    }

    for my $hit (@{ $N == 1 ? run_one() : run_multi() }) {
        my ($file, $line) = @$hit;
        print "$file:" if $print_filename;
        print $line;
    }

    return 0;
}

$Exit = main() || $Exit;
# A non-zero status that is a multiple of 256 is turned into 1 so that
# it cannot be mistaken for success.
$Exit = 1 if $Exit && !($Exit % 256);
exit $Exit;

__END__

=head1 NAME

randline - print lines selected at random

=head1 SYNOPSIS

B<randline>
[B<-h> | B<--no-filename>]
[B<-H> | B<--with-filename>]
[B<--help>]
[B<--irs> | B<--input-record-separator> I<separator>]
[B<-p> | B<--percent> I<percent>]
[B<-n> | B<--pick> I<count>]
[B<--seed> I<seed>]
[B<--version>]
[I<file>]...

=head1 DESCRIPTION

B<randline> chooses lines at random from its input and outputs them.  By
default it picks one line from all the input files (or stdin if no files
are specified).  In all cases it does this using a single pass over the
input, without saving the whole thing in memory.

=head1 OPTIONS

=over 4

=item B<-h>, B<--no-filename>

Never prepend the input file name to lines output.  Normally it's output
when there's more than one input file.

=item B<-H>, B<--with-filename>

Always output the input file name.

=item B<--help>

Show the usage message and die.

=item B<--irs>, B<--input-record-separator> I<separator>

Specify a different input record separator (really, terminator)
(default newline).

=item B<-p>, B<--percent> I<percent>

Choose I<percent> percent of the input lines, rather than a fixed number.

=item B<-n>, B<--pick> I<count>

Choose I<count> lines from the input rather than 1.

=item B<--seed> I<seed>

Use I<seed> as the seed for the random number generator.
Normally you wouldn't want to specify this, instead you'd let the program
pick the seed.  The B<--seed> switch is useful to achieve repeatability
for debugging.  If you use the same seed and other inputs B<randline>
will pick the same lines to output.

=item B<--version>

Show the version number and exit.

=back

=head1 AUTHOR

Roderick Schertler

=cut