1 package WWW::RobotRules;
4 sub Version { $VERSION; }
14 # This ugly hack is needed to ensure backwards compatibility.
15 # The "WWW::RobotRules" class is now really abstract.
16 $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";
18 my $self = bless { }, $class;
25 my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
26 $robot_txt_uri = URI->new("$robot_txt_uri");
27 my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;
29 $self->clear_rules($netloc);
30 $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));
33 my $is_me = 0; # 1 iff this record is for me
34 my $is_anon = 0; # 1 iff this record is for *
35 my $seen_disallow = 0; # watch for missing record separators
36 my @me_disallowed = (); # rules disallowed for me
37 my @anon_disallowed = (); # rules disallowed for *
39 # blank lines are significant, so turn CRLF into LF to avoid generating false ones
41 $txt =~ s/\015\012/\012/g;
43 # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
44 for(split(/[\012\015]/, $txt)) {
46 # Lines containing only a comment are discarded completely, and
47 # therefore do not indicate a record boundary.
50 s/\s*\#.*//; # remove comments at end-of-line
52 if (/^\s*$/) { # blank line
53 last if $is_me; # That was our record. No need to read the rest.
57 elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
62 # treat as start of a new record
64 last if $is_me; # That was our record. No need to read the rest.
69 # This record already had a User-agent that
70 # we matched, so just continue.
75 elsif($self->is_me($ua)) {
79 elsif (/^\s*Disallow\s*:\s*(.*)/i) {
80 unless (defined $ua) {
81 warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
82 $is_anon = 1; # assume that User-agent: * was intended
85 $disallow =~ s/\s+$//;
87 if (length $disallow) {
90 my $u = URI->new_abs($disallow, $robot_txt_uri);
91 $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
92 $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
93 $ignore++ if $u->port ne $robot_txt_uri->port;
94 $disallow = $u->path_query;
95 $disallow = "/" unless length $disallow;
102 push(@me_disallowed, $disallow);
105 push(@anon_disallowed, $disallow);
109 warn "RobotRules <$robot_txt_uri>: Unexpected line: $_\n" if $^W;
114 $self->push_rules($netloc, @me_disallowed);
117 $self->push_rules($netloc, @anon_disallowed);
123 # Returns TRUE if the given name matches the name of this robot.
127 my($self, $ua_line) = @_;
128 my $me = $self->agent;
130 # See whether the name given in the "User-Agent: ..." line that we
131 # were passed is a substring of this robot's short-name:
133 if(index(lc($me), lc($ua_line)) >= 0) {
134 LWP::Debug::debug("\"$ua_line\" applies to \"$me\"")
135 if defined &LWP::Debug::debug;
139 LWP::Debug::debug("\"$ua_line\" does not apply to \"$me\"")
140 if defined &LWP::Debug::debug;
147 my($self, $uri) = @_;
148 $uri = URI->new("$uri");
150 return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
151 # Robots.txt applies to only those schemes.
153 my $netloc = $uri->host . ":" . $uri->port;
155 my $fresh_until = $self->fresh_until($netloc);
156 return -1 if !defined($fresh_until) || $fresh_until < time;
158 my $str = $uri->path_query;
160 for my $rule ($self->rules($netloc)) {
161 return 1 unless length $rule;
162 return 0 if index($str, $rule) == 0;
168 # The following methods must be provided by the subclass.
181 package WWW::RobotRules::InCore;
184 @ISA = qw(WWW::RobotRules);
189 my ($self, $name) = @_;
190 my $old = $self->{'ua'};
192 # Strip it so that it's just the short name.
193 # I.e., "FooBot" => "FooBot"
194 # "FooBot/1.2" => "FooBot"
195 # "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"
197 $name = $1 if $name =~ m/(\S+)/; # get first word
198 $name =~ s!/.*!!; # get rid of version
199 unless ($old && $old eq $name) {
200 delete $self->{'loc'}; # all old info is now stale
201 $self->{'ua'} = $name;
209 my($self, $netloc, $time) = @_;
210 return unless $netloc;
212 $self->{'loc'}{$netloc}{'last'} = $time;
213 my $count = \$self->{'loc'}{$netloc}{'count'};
214 if (!defined $$count) {
224 my ($self, $netloc) = @_;
225 $self->{'loc'}{$netloc}{'count'};
230 my ($self, $netloc) = @_;
231 $self->{'loc'}{$netloc}{'last'};
236 my ($self, $netloc, $fresh_until) = @_;
237 my $old = $self->{'loc'}{$netloc}{'fresh'};
238 if (defined $fresh_until) {
239 $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
246 my($self, $netloc, @rules) = @_;
247 push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
252 my($self, $netloc) = @_;
253 delete $self->{'loc'}{$netloc}{'rules'};
258 my($self, $netloc) = @_;
259 if (defined $self->{'loc'}{$netloc}{'rules'}) {
260 return @{$self->{'loc'}{$netloc}{'rules'}};
273 print "$_ = $self->{$_}\n";
275 for (keys %{$self->{'loc'}}) {
276 my @rules = $self->rules($_);
277 print "$_: ", join("; ", @rules), "\n";
287 # Bender: "Well, I don't have anything else
288 # planned for today. Let's get drunk!"
292 WWW::RobotRules - database of robots.txt-derived permissions
297 my $rules = WWW::RobotRules->new('MOMspider/1.0');
299 use LWP::Simple qw(get);
302 my $url = "http://some.place/robots.txt";
303 my $robots_txt = get $url;
304 $rules->parse($url, $robots_txt) if defined $robots_txt;
308 my $url = "http://some.other.place/robots.txt";
309 my $robots_txt = get $url;
310 $rules->parse($url, $robots_txt) if defined $robots_txt;
313 # Now we can check if a URL is valid for those servers
314 # whose "robots.txt" files we've gotten and parsed:
315 if($rules->allowed($url)) {
322 This module parses F</robots.txt> files as specified in
323 "A Standard for Robot Exclusion", at
324 <http://www.robotstxt.org/wc/norobots.html>.
325 Webmasters can use the F</robots.txt> file to forbid conforming
326 robots from accessing parts of their web site.
328 The parsed files are kept in a WWW::RobotRules object, and this object
329 provides methods to check if access to a given URL is prohibited. The
330 same WWW::RobotRules object can be used for one or more parsed
331 F</robots.txt> files on any number of hosts.
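For instance, a single object can collect the rules from several hosts before any page fetching starts. This is a sketch only; the host names and robot name are placeholders, and LWP::Simple is assumed to be available, as in the SYNOPSIS:

  use LWP::Simple qw(get);

  my $rules = WWW::RobotRules->new('MyBot/1.0');
  for my $host ('www.example.com', 'mirror.example.org') {
      my $url = "http://$host/robots.txt";
      my $txt = get($url);
      $rules->parse($url, $txt) if defined $txt;
  }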
333 The following methods are provided:
337 =item $rules = WWW::RobotRules->new($robot_name)
339 This is the constructor for WWW::RobotRules objects. The first
340 argument given to new() is the name of the robot.
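For example (the name is a placeholder; any "Name/version" string will do, since only the short name before the "/" is kept):

  my $rules = WWW::RobotRules->new('MOMspider/1.0');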
342 =item $rules->parse($robot_txt_url, $content, $fresh_until)
344 The parse() method takes as arguments the URL that was used to retrieve the F</robots.txt> file, and the contents of the file.
345 An optional third argument gives the time (in seconds since the epoch) at which the parsed rules should expire; if it is omitted, the rules are kept fresh for one year.
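A sketch, continuing with the $rules object created above (the URL is a placeholder, and the one-week expiry is an arbitrary value shown only to illustrate the optional third argument):

  use LWP::Simple qw(get);

  my $url = "http://www.example.com/robots.txt";
  my $txt = get($url);
  # keep the parsed rules for one week instead of the default year
  $rules->parse($url, $txt, time + 7*24*3600) if defined $txt;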
347 =item $rules->allowed($uri)
349 Returns TRUE if this robot is allowed to retrieve this URL.
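For example (a sketch; the URL is a placeholder and $rules is assumed to already hold parsed rules for that host):

  my $doc = "http://www.example.com/private/report.html";
  if ($rules->allowed($doc)) {
      # fetch the document
  }
  else {
      # skip it; the parsed robots.txt forbids this path for our robot
  }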
351 =item $rules->agent([$name])
353 Get/set the agent name. NOTE: Changing the agent name (more precisely, its short form, the part before any "/version" suffix) will clear the
354 robots.txt rules and expire times out of the cache.
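For example (a sketch; the robot names are placeholders):

  my $name = $rules->agent;       # get the current short name, e.g. "MOMspider"
  $rules->agent('OtherBot/1.0');  # switching to a different short name
                                  # discards the previously parsed rules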
360 The format and semantics of the "/robots.txt" file are as follows
361 (this is an edited abstract of
362 <http://www.robotstxt.org/wc/norobots.html> ):
364 The file consists of one or more records separated by one or more
365 blank lines. Each record contains lines of the form
367 <field-name>: <value>
369 The field name is case insensitive. Text after the '#' character on a
370 line is ignored during parsing. This is used for comments. The
371 following <field-names> can be used:
377 The value of this field is the name of the robot the record is
378 describing access policy for. If more than one I<User-Agent> field is
379 present the record describes an identical access policy for more than
380 one robot. At least one field needs to be present per record. If the
381 value is '*', the record describes the default access policy for any
382 robot that has not matched any of the other records.
384 The I<User-Agent> fields must occur before the I<Disallow> fields. If a
385 record contains a I<User-Agent> field after a I<Disallow> field, that
386 constitutes a malformed record. This parser will assume that a blank
387 line should have been placed before that I<User-Agent> field, and will
388 break the record into two. All the fields before the I<User-Agent> field
389 will constitute a record, and the I<User-Agent> field will be the first
390 field in a new record (a sketch illustrating this follows the list of fields below).
394 The value of this field specifies a partial URL that is not to be
395 visited. This can be a full path, or a partial path; any URL that
396 starts with this value will not be retrieved.
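Two of the behaviours described above, shown as a short sketch (the robot name, host and paths are made up, and the robots.txt text is passed inline rather than fetched):

  use WWW::RobotRules;

  my $r = WWW::RobotRules->new('FooBot/1.0');

  # A User-Agent field that follows a Disallow field starts a new record,
  # exactly as if a blank line had been placed before it; Disallow values
  # are matched as plain string prefixes of the URL's path.
  $r->parse("http://www.example.com/robots.txt",
            "User-agent: *\nDisallow: /private/\n" .
            "User-agent: FooBot\nDisallow: /tmp/\n");   # note: no blank line

  print $r->allowed("http://www.example.com/private/x.html");  # 1; /private/ belongs to the "*" record
  print $r->allowed("http://www.example.com/tmp/x.html");      # 0; the path starts with "/tmp/"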
400 =head1 ROBOTS.TXT EXAMPLES
402 The following example "/robots.txt" file specifies that no robots
403 should visit any URL starting with "/cyberworld/map/" or "/tmp/":
405 User-agent: *
406 Disallow: /cyberworld/map/ # This is an infinite virtual URL space
407 Disallow: /tmp/ # these will soon disappear
409 This example "/robots.txt" file specifies that no robots should visit
410 any URL starting with "/cyberworld/map/", except the robot called "cybermapper":
413 User-agent: *
414 Disallow: /cyberworld/map/ # This is an infinite virtual URL space
416 # Cybermapper knows where to go.
417 User-agent: cybermapper
418 Disallow:
420 This example indicates that no robots should visit this site further:
422 User-agent: *
423 Disallow: /
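Fed to this module, such a file makes allowed() return false for every URL on that host (a sketch; the host and robot names are placeholders):

  use WWW::RobotRules;

  my $rules = WWW::RobotRules->new('AnyBot/1.0');
  $rules->parse("http://closed.example.com/robots.txt",
                "User-agent: *\nDisallow: /\n");
  print $rules->allowed("http://closed.example.com/any/page.html")
      ? "allowed" : "blocked";    # prints "blocked"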
426 This is an example of a malformed robots.txt file.
428 # robots.txt for ancientcastle.example.com
429 # I've locked myself away.
430 User-agent: *
431 Disallow: /
432 # The castle is your home now, so you can go anywhere you like.
433 User-agent: Belle
434 Disallow: /west-wing/ # except the west wing!
435 # It's good to be the Prince...
436 User-agent: GastonRobot
437 Disallow: /
439 This file is missing the required blank lines between records.
440 However, the intention is clear.
444 L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>