eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell

=head1 NAME

lwp-rget - Retrieve web documents recursively

=head1 SYNOPSIS

 lwp-rget [--verbose] [--auth=USER:PASS] [--depth=N] [--hier] [--iis]
          [--keepext=mime/type[,mime/type]] [--limit=N] [--nospace]
          [--prefix=URL] [--referer=URL] [--sleep=N] [--tolower] <URL>

=head1 DESCRIPTION

This program will retrieve a document and store it in a local file.  It
will follow any links found in the document and store these documents
as well, patching links so that they refer to these local copies.
This process continues until there are no more unvisited links or the
process is stopped by one or more of the limits that can be
controlled by the command line arguments.

This program is useful if you want to make a local copy of a
collection of documents or want to do web reading off-line.

All documents are stored as plain files in the current directory.  The
file names chosen are derived from the last component of URL paths.

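For example, the following invocation (with a hypothetical URL) mirrors
a manual two levels deep into the current directory:

    lwp-rget --verbose --depth=2 http://www.example.com/manual/index.html
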
=head1 OPTIONS

=over 3

=item --auth=I<USER:PASS>

Set the authentication credentials to user "USER" and password "PASS" if
any restricted parts of the web site are hit.  If there are restricted
parts of the web site and authentication credentials are not available,
those pages will not be downloaded.

=item --depth=I<n>

Limit the recursive level.  Embedded images are always loaded, even if
they fall outside the I<--depth>.  This means that one can use
I<--depth=0> in order to fetch a single document together with all
inline graphics.

The default depth is 5.

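For instance, to fetch a single page together with its inline images
(hypothetical URL):

    lwp-rget --depth=0 http://www.example.com/report.html
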
=item --hier

Download files into a hierarchy that mimics the web site structure.
The default is to put all files in the current directory.

=item --referer=I<URI>

Set the value of the Referer header for the initial request.  The
special value C<"NONE"> can be used to suppress the Referer header in
this and all subsequent requests.  The Referer header will always be
suppressed in all normal C<http> requests if the referring page was
transmitted over C<https>, as recommended in RFC 2616.

=item --iis

Sends an "Accept: */*" on all URL requests as a workaround for a bug in
IIS 2.0.  If no Accept MIME header is present, IIS 2.0 returns with a
"406 No acceptable objects were found" error.  Also converts any
backslashes (\) in URLs to forward slashes (/).

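For instance, a link written as C<http://www.example.com/dir\page.html>
(hypothetical URL) is requested as C<http://www.example.com/dir/page.html>.
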
=item --keepext=I<mime/type[,mime/type]>

Keeps the current extension for the listed MIME types.  Useful when
downloading text/plain documents that shouldn't all be translated to
.txt files.

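For example, to keep the original names of plain-text documents instead
of having a ".txt" suffix appended (hypothetical URL):

    lwp-rget --keepext=text/plain http://www.example.com/scripts/README
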
=item --limit=I<n>

Limit the number of documents to get.  The default limit is 50.

=item --nospace

Changes spaces in all URLs to underscore characters (_).  Useful when
downloading files from sites serving URLs with spaces in them.  Does not
remove spaces from fragments, e.g., "file.html#somewhere in here".

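For instance, a document served as C<http://www.example.com/my docs/file
one.html> (hypothetical URL) is stored locally as C<file_one.html>.
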
=item --prefix=I<url_prefix>

Limit the links to follow.  Only URLs that start with the prefix string
are followed.

The default prefix is set as the "directory" of the initial URL to
follow.  For instance if we start lwp-rget with the URL
C<http://www.sn.no/foo/bar.html>, then the prefix will be set to
C<http://www.sn.no/foo/>.

Use C<--prefix=''> if you don't want the fetching to be limited by any
prefix.

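For instance, reusing the example above, the following allows the whole
site to be fetched instead of just documents below C<http://www.sn.no/foo/>:

    lwp-rget --prefix=http://www.sn.no/ http://www.sn.no/foo/bar.html
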
=item --sleep=I<n>

Sleep I<n> seconds before retrieving each document.  This option lets
you go slowly, so as not to put too much load on the server you are
visiting.

=item --tolower

Translates all links to lowercase.  Useful when downloading files from
IIS, since it does not serve files in a case-sensitive manner.

=item --verbose

Make more noise while running.

=item --quiet

Don't make any noise.

=item --version

Print program version number and quit.

=item --help

Print the usage message and quit.

=back

Before the program exits, the name of the file where the initial URL
is stored is printed on stdout.  All used filenames are also printed
on stderr as they are loaded.  This printing can be suppressed with
the I<--quiet> option.

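This makes it easy to pick up the entry point of a mirrored tree from a
shell script, e.g. (hypothetical URL):

    START=`lwp-rget --quiet http://www.example.com/index.html`
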
=head1 SEE ALSO

L<lwp-request>, L<LWP>

=head1 AUTHOR

Gisle Aas <aas@sn.no>

=cut

use strict;

use Getopt::Long    qw(GetOptions);
use URI::URL        qw(url);
use LWP::MediaTypes qw(media_suffix);
use HTML::Entities  ();

use vars qw($VERSION);
use vars qw($MAX_DEPTH $MAX_DOCS $PREFIX $REFERER $VERBOSE $QUIET $SLEEP
            $HIER $AUTH $IIS $TOLOWER $NOSPACE %KEEPEXT);

my $progname = $0;
$progname =~ s|.*/||;           # only basename left
$progname =~ s/\.\w*$//;        # strip extension, if any

#$Getopt::Long::debug = 1;
#$Getopt::Long::ignorecase = 0;

# Defaults documented in the POD above
$MAX_DEPTH = 5;         # default for --depth
$MAX_DOCS  = 50;        # default for --limit

GetOptions('version'   => \&print_version,
           'help'      => \&usage,
           'depth=i'   => \$MAX_DEPTH,
           'limit=i'   => \$MAX_DOCS,
           'verbose!'  => \$VERBOSE,
           'quiet!'    => \$QUIET,
           'sleep=i'   => \$SLEEP,
           'prefix:s'  => \$PREFIX,
           'referer:s' => \$REFERER,
           'hier'      => \$HIER,
           'auth=s'    => \$AUTH,
           'iis'       => \$IIS,
           'tolower'   => \$TOLOWER,
           'nospace'   => \$NOSPACE,
           'keepext=s' => \$KEEPEXT{'OPT'},
          ) || usage();

sub print_version {
    require LWP;
    my $DISTNAME = 'libwww-perl-' . LWP::Version();
    print <<"EOT";
This is lwp-rget version $VERSION ($DISTNAME)

Copyright 1996-1998, Gisle Aas.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
EOT
    exit 0;
}

my $start_url = shift || usage();

require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->agent("$progname/$VERSION " . $ua->agent);

unless (defined $PREFIX) {
    $PREFIX = url($start_url);   # limit to URLs below this one
    eval {
        $PREFIX->eparams(undef);
        $PREFIX->equery(undef);
    };

    # trim the last path segment so the prefix becomes the "directory"
    # of the start URL
    $_ = $PREFIX->epath;
    s|[^/]+$||;
    $PREFIX->epath($_);

    $PREFIX = $PREFIX->as_string;
}

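# Turn the --keepext option value into a lookup table; e.g.
# --keepext=text/plain,image/gif yields
# %KEEPEXT = ('text/plain' => 1, 'image/gif' => 1).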
%KEEPEXT = map { lc($_) => 1 } split(/\s*,\s*/, ($KEEPEXT{'OPT'} || ""));

my $SUPPRESS_REFERER;
$SUPPRESS_REFERER++ if ($REFERER || "") eq "NONE";

print <<"EOT" if $VERBOSE;
START     = $start_url
MAX_DEPTH = $MAX_DEPTH
MAX_DOCS  = $MAX_DOCS
EOT

my %seen = ();     # mapping from URL => local_file
my $no_docs = 0;   # number of documents fetched so far

my $filename = fetch($start_url, undef, $REFERER);
print "$filename\n" unless $QUIET;

sub fetch
{
    my($url, $type, $referer, $depth) = @_;

    # Fix http://sitename.com/../blah/blah.html to
    #     http://sitename.com/blah/blah.html
    $url = $url->as_string if (ref($url));
    while ($url =~ s#(https?://[^/]+/)\.\./#$1#) {}

    # Fix backslashes (\) in URL if $IIS defined
    $url = fix_backslashes($url) if (defined $IIS);

    $url = url($url) unless ref($url);
    $type ||= 'a';
    # Might be the background attribute
    $type = 'img' if ($type eq 'body' || $type eq 'td');
    $depth ||= 0;

    # Print the URL before we start checking...
    my $out = (" " x $depth) . $url . " ";
    $out .= "." x (60 - length($out));
    print STDERR $out . " " if $VERBOSE;

    # Can't get mailto things
    if ($url->scheme eq 'mailto') {
        print STDERR "*skipping mailto*\n" if $VERBOSE;
        return $url->as_string;
    }

    # The $plain_url is a URL without the fragment part
    my $plain_url = $url->clone;
    $plain_url->frag(undef);

    # Check PREFIX, but not for <IMG ...> links
    if ($type ne 'img' and $url->as_string !~ /^\Q$PREFIX/o) {
        print STDERR "*outsider*\n" if $VERBOSE;
        return $url->as_string;
    }

    # Translate URL to lowercase if $TOLOWER defined
    $plain_url = to_lower($plain_url) if (defined $TOLOWER);

    # If we already have it, then there is nothing to be done
    my $seen = $seen{$plain_url->as_string};
    if ($seen) {
        my $frag = $url->frag;
        $seen .= "#$frag" if defined($frag);
        $seen = protect_frag_spaces($seen);
        print STDERR "$seen (again)\n" if $VERBOSE;
        return $seen;
    }

    # Too much or too deep
    if ($depth > $MAX_DEPTH and $type ne 'img') {
        print STDERR "*too deep*\n" if $VERBOSE;
        return $url;
    }
    if ($no_docs > $MAX_DOCS) {
        print STDERR "*too many*\n" if $VERBOSE;
        return $url;
    }

    # Fetch the document
    $no_docs++;
    sleep($SLEEP) if $SLEEP;
    my $req = HTTP::Request->new(GET => $url);
    # See: http://ftp.sunet.se/pub/NT/mirror-microsoft/kb/Q163/7/74.TXT
    $req->header('Accept', '*/*') if (defined $IIS);  # GIF/JPG from IIS 2.0
    $req->authorization_basic(split(/:/, $AUTH)) if (defined $AUTH);
    if ($referer && !$SUPPRESS_REFERER) {
        if ($req->url->scheme eq 'http') {
            # RFC 2616, section 15.1.3
            $referer = url($referer) unless ref($referer);
            undef $referer if ($referer->scheme || '') eq 'https';
        }
        $req->referer($referer) if $referer;
    }
    my $res = $ua->request($req);

    if ($res->is_success) {
        my $doc = $res->content;
        my $ct = $res->content_type;
        my $name = find_name($res->request->url, $ct);
        print STDERR "$name\n" unless $QUIET;
        $seen{$plain_url->as_string} = $name;

        # If the file is HTML, then we look for internal links
        if ($ct eq "text/html") {
            # Save an unprocessed version of the HTML document.  This
            # both reserves the name used, and it also ensures that we
            # don't lose everything if this program is killed before
            # we finish.
            save($name, $doc);
            my $base = $res->base;

            # Follow and substitute links...
            $doc =~
s/
  (
    <(img|a|body|area|frame|td)\b   # some interesting tag
    [^>]+                           # still inside tag (not strictly correct)
    \b(?:src|href|background)       # some link attribute
    \s*=\s*                         # =
  )
    (?:                             # scope of OR-ing
        (")([^"]*)"    |            # value in double quotes  OR
        (')([^']*)'    |            # value in single quotes  OR
           ([^\s>]+)                # quoteless value
    )
/
  new_link($1, lc($2), $3 || $5, HTML::Entities::decode($4 || $6 || $7),
           $base, $name, "$url", $depth + 1)
/giex;

            # The regular expression above is not strictly correct.
            # It is not really possible to parse HTML with a single
            # regular expression, but it is faster.  Tags that might
            # confuse us include:
            #    <a alt="href" href=link.html>
            #    <a alt=">" href="link.html">
        }
        save($name, $doc);
        return $name;
    }
    else {
        print STDERR $res->code . " " . $res->message . "\n" if $VERBOSE;
        $seen{$plain_url->as_string} = $url->as_string;
        return $url->as_string;
    }
}

sub new_link
{
    my($pre, $type, $quote, $url, $base, $localbase, $referer, $depth) = @_;

    $url = protect_frag_spaces($url);

    $url = fetch(url($url, $base)->abs, $type, $referer, $depth);
    $url = url("file:$url", "file:$localbase")->rel
        unless $url =~ /^[.+\-\w]+:/;

    $url = unprotect_frag_spaces($url);

    return $pre . $quote . $url . $quote;
}

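# Example for new_link() (hypothetical names): while saving a page as
# "bar.html" that links to "http://www.sn.no/foo/baz.html", the linked
# document is fetched via fetch() and the href is rewritten to the
# relative local name "baz.html".
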
sub protect_frag_spaces
{
    my($url) = @_;

    $url = $url->as_string if (ref($url));

    if ($url =~ m/^([^#]*#)(.+)$/)
    {
        my ($base, $frag) = ($1, $2);
        $frag =~ s/ /%20/g;     # encode spaces in the fragment
        $url = $base . $frag;
    }

    return $url;
}

sub unprotect_frag_spaces
{
    my($url) = @_;

    $url = $url->as_string if (ref($url));

    if ($url =~ m/^([^#]*#)(.+)$/)
    {
        my ($base, $frag) = ($1, $2);
        $frag =~ s/%20/ /g;     # decode spaces in the fragment
        $url = $base . $frag;
    }

    return $url;
}

sub fix_backslashes
{
    my($url) = @_;
    my($base, $frag);

    $url = $url->as_string if (ref($url));

    if ($url =~ m/([^#]+)(#.*)/)
    {
        ($base, $frag) = ($1, $2);
    }
    else
    {
        $base = $url;
        $frag = "";
    }

    $base =~ tr|\\|/|;          # translate literal backslashes
    $base =~ s/%5[cC]/\//g;     # URL-encoded backslash is %5C

    return $base . $frag;
}

sub to_lower
{
    my($url) = @_;
    my $was_object = ref($url) ? 1 : 0;

    $url = $url->as_string if $was_object;

    if ($url =~ m/([^#]+)(#.*)/)
    {
        $url = lc($1) . $2;     # leave the fragment alone
    }
    else
    {
        $url = lc($url);
    }

    # Hand back the same kind of thing we were given
    return $was_object ? url($url) : $url;
}

sub translate_spaces
{
    my($url) = @_;
    my($base, $frag);

    $url = $url->as_string if (ref($url));

    if ($url =~ m/([^#]+)(#.*)/)
    {
        ($base, $frag) = ($1, $2);
    }
    else
    {
        $base = $url;
        $frag = "";
    }

    $base =~ s/^ *//;           # Remove initial spaces from base
    $base =~ s/ *$//;           # Remove trailing spaces from base

    $base =~ tr/ /_/;           # translate remaining spaces to underscores
    $base =~ s/%20/_/g;         # URL-encoded space is %20

    return $base . $frag;
}

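# Example for translate_spaces(): "my file.html#a b" becomes
# "my_file.html#a b"; only the part before the fragment is translated.
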
sub mkdirp
{
    my($directory, $mode) = @_;
    my @dirs = split(/\//, $directory);
    my $path = shift(@dirs);    # build it as we go
    my $result = 1;             # assume it will work

    unless (-d $path) {
        $result &&= mkdir($path, $mode);
    }

    foreach (@dirs) {
        $path .= "/$_";
        if (! -d $path) {
            $result &&= mkdir($path, $mode);
        }
    }

    return $result;
}

sub find_name
{
    my($url, $type) = @_;
    #print "find_name($url, $type)\n";

    # Translate spaces in URL to underscores (_) if $NOSPACE defined
    $url = translate_spaces($url) if (defined $NOSPACE);

    # Translate URL to lowercase if $TOLOWER defined
    $url = to_lower($url) if (defined $TOLOWER);

    $url = url($url) unless ref($url);

    my $path = $url->path;

    # trim path until only the basename is left
    $path =~ s|(.*/)||;
    my $dirname = ".$1";
    if (!$HIER) {
        $dirname = "";
    }
    elsif (! -d $dirname) {
        mkdirp($dirname, 0775);
    }

    my $extra = "";  # something to make the name unique
    my $suffix;

    if ($KEEPEXT{lc($type)}) {
        $suffix = ($path =~ m/\.(.*)/) ? $1 : "";
    }
    else {
        $suffix = media_suffix($type);
    }

    $path =~ s|\..*||;          # trim suffix
    $path = "index" unless length $path;

    while (1) {
        # Construct a new file name
        my $file = $dirname . $path . $extra;
        $file .= ".$suffix" if $suffix;
        # Check if it is unique
        return $file unless -f $file;

        # Try something extra
        unless ($extra) {
            $extra = "001";
            next;
        }
        $extra++;
    }
}


sub save
{
    my $name = shift;
    #print "save($name,...)\n";
    open(FILE, ">$name") || die "Can't save $name: $!";
    binmode FILE;
    print FILE $_[0];
    close(FILE);
}

sub usage
{
    print <<"EOT"; exit 1;
Usage: $progname [options] <URL>
Allowed options are:
  --auth=USER:PASS  Set authentication credentials for web site
  --depth=N         Maximum depth to traverse (default is $MAX_DEPTH)
  --hier            Download into hierarchy (not all files into cwd)
  --referer=URI     Set initial referer header (or "NONE")
  --iis             Workaround IIS 2.0 bug by sending "Accept: */*" MIME
                    header; translates backslashes (\\) to forward slashes (/)
  --keepext=type    Keep file extension for MIME types (comma-separated list)
  --limit=N         A limit on the number of documents to get (default is $MAX_DOCS)
  --nospace         Translate spaces in URLs (not #fragments) to underscores (_)
  --version         Print version number and quit
  --verbose         More output
  --quiet           No output
  --sleep=SECS      Sleep between gets, i.e. go slowly
  --prefix=PREFIX   Limit URLs to follow to those which begin with PREFIX
  --tolower         Translate all URLs to lowercase (useful with IIS servers)
EOT
}