1 package WWW::RobotRules;
4 sub Version { $VERSION; }
14 # This ugly hack is needed to ensure backwards compatibility.
15 # The "WWW::RobotRules" class is now really abstract.
16 $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";
18 my $self = bless { }, $class;
25 my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
26 $robot_txt_uri = URI->new("$robot_txt_uri");
27 my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;
29 $self->clear_rules($netloc);
30 $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));
33 my $is_me = 0; # 1 iff this record is for me
34 my $is_anon = 0; # 1 iff this record is for *
35 my $seen_disallow = 0; # watch for missing record separators
36 my @me_disallowed = (); # rules disallowed for me
37 my @anon_disallowed = (); # rules disallowed for *
39 # blank lines are significant, so turn CRLF into LF to avoid generating false ones
41 $txt =~ s/\015\012/\012/g;
43 # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
44 for(split(/[\012\015]/, $txt)) {
46 # Lines containing only a comment are discarded completely, and
47 # therefore do not indicate a record boundary.
50 s/\s*\#.*//; # remove comments at end-of-line
52 if (/^\s*$/) { # blank line
53 last if $is_me; # That was our record. No need to read the rest.
57 elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
62 # treat as start of a new record
64 last if $is_me; # That was our record. No need to read the rest.
69 # This record already had a User-agent that
70 # we matched, so just continue.
75 elsif($self->is_me($ua)) {
79 elsif (/^\s*Disallow\s*:\s*(.*)/i) {
80 unless (defined $ua) {
81 warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
82 $is_anon = 1; # assume that User-agent: * was intended
85 $disallow =~ s/\s+$//;
87 if (length $disallow) {
90 my $u = URI->new_abs($disallow, $robot_txt_uri);
91 $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
92 $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
93 $ignore++ if $u->port ne $robot_txt_uri->port;
94 $disallow = $u->path_query;
95 $disallow = "/" unless length $disallow;
102 push(@me_disallowed, $disallow);
105 push(@anon_disallowed, $disallow);
109 warn "RobotRules <$robot_txt_uri>: Unexpected line: $_\n" if $^W;
114 $self->push_rules($netloc, @me_disallowed);
117 $self->push_rules($netloc, @anon_disallowed);
123 # Returns TRUE if the given name matches the name of this robot.
127 my($self, $ua_line) = @_;
128 my $me = $self->agent;
130 # See whether the name given in the "User-Agent: ..." line that we
131 # were passed is a substring of this robot's short-name:
133 if(index(lc($me), lc($ua_line)) >= 0) {
134 LWP::Debug::debug("\"$ua_line\" applies to \"$me\"")
135 if defined &LWP::Debug::debug;
139 LWP::Debug::debug("\"$ua_line\" does not apply to \"$me\"")
140 if defined &LWP::Debug::debug;
147 my($self, $uri) = @_;
148 $uri = URI->new("$uri");
150 return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
151 # Robots.txt applies to only those schemes.
153 my $netloc = $uri->host . ":" . $uri->port;
155 my $fresh_until = $self->fresh_until($netloc);
156 return -1 if !defined($fresh_until) || $fresh_until < time;
158 my $str = $uri->path_query;
160 for my $rule ($self->rules($netloc)) {
161 return 1 unless length $rule;
162 return 0 if index($str, $rule) == 0;
168 # The following methods must be provided by the subclass.
181 package WWW::RobotRules::InCore;
184 @ISA = qw(WWW::RobotRules);
189 my ($self, $name) = @_;
190 my $old = $self->{'ua'};
192 # Strip it so that it's just the short name.
193 # I.e., "FooBot" => "FooBot"
194 # "FooBot/1.2" => "FooBot"
195 # "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"
197 $name = $1 if $name =~ m/(\S+)/; # get first word
198 $name =~ s!/.*!!; # get rid of version
199 unless ($old && $old eq $name) {
200 delete $self->{'loc'}; # all old info is now stale
201 $self->{'ua'} = $name;
209 my($self, $netloc, $time) = @_;
210 return unless $netloc;
212 $self->{'loc'}{$netloc}{'last'} = $time;
213 my $count = \$self->{'loc'}{$netloc}{'count'};
214 if (!defined $$count) {
224 my ($self, $netloc) = @_;
225 $self->{'loc'}{$netloc}{'count'};
230 my ($self, $netloc) = @_;
231 $self->{'loc'}{$netloc}{'last'};
236 my ($self, $netloc, $fresh_until) = @_;
237 my $old = $self->{'loc'}{$netloc}{'fresh'};
238 if (defined $fresh_until) {
239 $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
246 my($self, $netloc, @rules) = @_;
247 push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
252 my($self, $netloc) = @_;
253 delete $self->{'loc'}{$netloc}{'rules'};
258 my($self, $netloc) = @_;
259 if (defined $self->{'loc'}{$netloc}{'rules'}) {
260 return @{$self->{'loc'}{$netloc}{'rules'}};
273 print "$_ = $self->{$_}\n";
275 for (keys %{$self->{'loc'}}) {
276 my @rules = $self->rules($_);
277 print "$_: ", join("; ", @rules), "\n";
287 # Bender: "Well, I don't have anything else
288 # planned for today. Let's get drunk!"
292 WWW::RobotRules - database of robots.txt-derived permissions
297 my $rules = WWW::RobotRules->new('MOMspider/1.0');
299 use LWP::Simple qw(get);
302 my $url = "http://some.place/robots.txt";
303 my $robots_txt = get $url;
304 $rules->parse($url, $robots_txt) if defined $robots_txt;
308 my $url = "http://some.other.place/robots.txt";
309 my $robots_txt = get $url;
310 $rules->parse($url, $robots_txt) if defined $robots_txt;
313 # Now we can check if a URL is valid for those servers
314 # whose "robots.txt" files we've gotten and parsed:
315 if($rules->allowed($url)) {
322 This module parses F</robots.txt> files as specified in
323 "A Standard for Robot Exclusion", at
324 <http://www.robotstxt.org/wc/norobots.html>.
325 Webmasters can use the F</robots.txt> file to forbid conforming
326 robots from accessing parts of their web site.
328 The parsed files are kept in a WWW::RobotRules object, and this object
329 provides methods to check if access to a given URL is prohibited. The
330 same WWW::RobotRules object can be used for one or more parsed
331 F</robots.txt> files on any number of hosts.
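For instance, a single object can collect the rules from several hosts before any page fetching starts. This is a sketch only; the host names and robot name are placeholders, and LWP::Simple is assumed to be available, as in the SYNOPSIS:

  use LWP::Simple qw(get);

  my $rules = WWW::RobotRules->new('MyBot/1.0');
  for my $host ('www.example.com', 'mirror.example.org') {
      my $url = "http://$host/robots.txt";
      my $txt = get($url);
      $rules->parse($url, $txt) if defined $txt;
  }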
333 The following methods are provided:
337 =item $rules = WWW::RobotRules->new($robot_name)
339 This is the constructor for WWW::RobotRules objects. The first
340 argument given to new() is the name of the robot.
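For example (the name is a placeholder; any "Name/version" string will do, since only the short name before the "/" is kept):

  my $rules = WWW::RobotRules->new('MOMspider/1.0');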
342 =item $rules->parse($robot_txt_url, $content, $fresh_until)
344 The parse() method takes as arguments the URL that was used to retrieve the F</robots.txt> file, and the contents of the file.
345 An optional third argument gives the time (in seconds since the epoch) at which the parsed rules should expire; if it is omitted, the rules are kept fresh for one year.
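A sketch, continuing with the $rules object created above (the URL is a placeholder, and the one-week expiry is an arbitrary value shown only to illustrate the optional third argument):

  use LWP::Simple qw(get);

  my $url = "http://www.example.com/robots.txt";
  my $txt = get($url);
  # keep the parsed rules for one week instead of the default year
  $rules->parse($url, $txt, time + 7*24*3600) if defined $txt;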
347 =item $rules->allowed($uri)
349 Returns TRUE if this robot is allowed to retrieve this URL.
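For example (a sketch; the URL is a placeholder and $rules is assumed to already hold parsed rules for that host):

  my $doc = "http://www.example.com/private/report.html";
  if ($rules->allowed($doc)) {
      # fetch the document
  }
  else {
      # skip it; the parsed robots.txt forbids this path for our robot
  }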
351 =item $rules->agent([$name])
353 Get/set the agent name. NOTE: Changing the agent name (more precisely, its short form, the part before any "/version" suffix) will clear the
354 robots.txt rules and expire times out of the cache.
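For example (a sketch; the robot names are placeholders):

  my $name = $rules->agent;       # get the current short name, e.g. "MOMspider"
  $rules->agent('OtherBot/1.0');  # switching to a different short name
                                  # discards the previously parsed rules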
360 The format and semantics of the "/robots.txt" file are as follows
361 (this is an edited abstract of
362 <http://www.robotstxt.org/wc/norobots.html> ):
364 The file consists of one or more records separated by one or more
365 blank lines. Each record contains lines of the form
367 <field-name>: <value>
369 The field name is case insensitive. Text after the '#' character on a
370 line is ignored during parsing. This is used for comments. The
371 following <field-names> can be used:
377 The value of this field is the name of the robot the record is
378 describing access policy for. If more than one I<User-Agent> field is
379 present the record describes an identical access policy for more than
380 one robot. At least one field needs to be present per record. If the
381 value is '*', the record describes the default access policy for any
382 robot that has not matched any of the other records.
384 The I<User-Agent> fields must occur before the I<Disallow> fields. If a
385 record contains a I<User-Agent> field after a I<Disallow> field, that
386 constitutes a malformed record. This parser will assume that a blank
387 line should have been placed before that I<User-Agent> field, and will
388 break the record into two. All the fields before the I<User-Agent> field
389 will constitute a record, and the I<User-Agent> field will be the first
390 field in a new record (a sketch illustrating this follows the list of fields below).
394 The value of this field specifies a partial URL that is not to be
395 visited. This can be a full path, or a partial path; any URL that
396 starts with this value will not be retrieved.
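Two of the behaviours described above, shown as a short sketch (the robot name, host and paths are made up, and the robots.txt text is passed inline rather than fetched):

  use WWW::RobotRules;

  my $r = WWW::RobotRules->new('FooBot/1.0');

  # A User-Agent field that follows a Disallow field starts a new record,
  # exactly as if a blank line had been placed before it; Disallow values
  # are matched as plain string prefixes of the URL's path.
  $r->parse("http://www.example.com/robots.txt",
            "User-agent: *\nDisallow: /private/\n" .
            "User-agent: FooBot\nDisallow: /tmp/\n");   # note: no blank line

  print $r->allowed("http://www.example.com/private/x.html");  # 1; /private/ belongs to the "*" record
  print $r->allowed("http://www.example.com/tmp/x.html");      # 0; the path starts with "/tmp/"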
400 =head1 ROBOTS.TXT EXAMPLES
402 The following example "/robots.txt" file specifies that no robots
403 should visit any URL starting with "/cyberworld/map/" or "/tmp/":
405 User-agent: *
406 Disallow: /cyberworld/map/ # This is an infinite virtual URL space
407 Disallow: /tmp/ # these will soon disappear
409 This example "/robots.txt" file specifies that no robots should visit
410 any URL starting with "/cyberworld/map/", except the robot called "cybermapper":
413 User-agent: *
414 Disallow: /cyberworld/map/ # This is an infinite virtual URL space
416 # Cybermapper knows where to go.
417 User-agent: cybermapper
418 Disallow:
420 This example indicates that no robots should visit this site further:
422 User-agent: *
423 Disallow: /
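Fed to this module, such a file makes allowed() return false for every URL on that host (a sketch; the host and robot names are placeholders):

  use WWW::RobotRules;

  my $rules = WWW::RobotRules->new('AnyBot/1.0');
  $rules->parse("http://closed.example.com/robots.txt",
                "User-agent: *\nDisallow: /\n");
  print $rules->allowed("http://closed.example.com/any/page.html")
      ? "allowed" : "blocked";    # prints "blocked"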
426 This is an example of a malformed robots.txt file.
428 # robots.txt for ancientcastle.example.com
429 # I've locked myself away.
430 User-agent: *
431 Disallow: /
432 # The castle is your home now, so you can go anywhere you like.
433 User-agent: Belle
434 Disallow: /west-wing/ # except the west wing!
435 # It's good to be the Prince...
436 User-agent: GastonRobot
437 Disallow: /
439 This file is missing the required blank lines between records.
440 However, the intention is clear.
444 L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>