1 package HTML::HeadParser;
5 HTML::HeadParser - Parse <HEAD> section of a HTML document
9 require HTML::HeadParser;
10 $p = HTML::HeadParser->new;
11 $p->parse($text) and print "not finished";
13 $p->header('Title') # to access <title>....</title>
14 $p->header('Content-Base') # to access <base href="http://...">
15 $p->header('Foo') # to access <meta http-equiv="Foo" content="...">
19 The C<HTML::HeadParser> is a specialized (and lightweight)
20 C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
21 section of an HTML document. The parse() method
22 will return a FALSE value as soon as some E<lt>BODY> element or body
23 text are found, and should not be called again after this.
25 Note that the C<HTML::HeadParser> might get confused if raw undecoded
26 UTF-8 is passed to the parse() method. Make sure the strings are
27 properly decoded before passing them on.
29 The C<HTML::HeadParser> keeps a reference to a header object, and the
30 parser will update this header object as the various elements of the
31 E<lt>HEAD> section of the HTML document are recognized. The following
32 header fields are affected:
38 The I<Content-Base> header is initialized from the E<lt>base
43 The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
48 The I<Isindex> header will be added if there is a E<lt>isindex>
49 element in the E<lt>head>. The header value is initialized from the
50 I<prompt> attribute if it is present. If no I<prompt> attribute is
51 given it will have '?' as the value.
55 All E<lt>meta> elements will initialize headers with the prefix
56 "C<X-Meta->" on the name. If the E<lt>meta> element contains a
57 C<http-equiv> attribute, then it will be honored as the header name.
63 The following methods (in addition to those provided by the
64 superclass) are available:
72 @ISA = qw(HTML::Parser);
74 use HTML::Entities ();
77 use vars qw($VERSION $DEBUG);
79 $VERSION = sprintf("%d.%02d", q$Revision: 2.22 $ =~ /(\d+)\.(\d+)/);
81 =item $hp = HTML::HeadParser->new
83 =item $hp = HTML::HeadParser->new( $header )
85 The object constructor. The optional $header argument should be a
86 reference to an object that implement the header() and push_header()
87 methods as defined by the C<HTTP::Headers> class. Normally it will be
88 of some class that isa or delegates to the C<HTTP::Headers> class.
90 If no $header is given C<HTML::HeadParser> will create an
91 C<HTTP::Header> object by itself (initially empty).
97 my($class, $header) = @_;
99 require HTTP::Headers;
100 $header = HTTP::Headers->new;
103 my $self = $class->SUPER::new(api_version => 2,
104 ignore_elements => [qw(script style)],
106 $self->{'header'} = $header;
107 $self->{'tag'} = ''; # name of active element that takes textual content
108 $self->{'text'} = ''; # the accumulated text associated with the element
114 Returns a reference to the header object.
116 =item $hp->header( $key )
118 Returns a header value. It is just a shorter way to write
119 C<$hp-E<gt>header-E<gt>header($key)>.
126 return $self->{'header'} unless @_;
127 $self->{'header'}->header(@_);
130 sub as_string # legacy
133 $self->{'header'}->as_string;
136 sub flush_text # internal
139 my $tag = $self->{'tag'};
140 my $text = $self->{'text'};
144 print "FLUSH $tag => '$text'\n" if $DEBUG;
145 if ($tag eq 'title') {
146 HTML::Entities::decode($text);
147 $self->{'header'}->push_header(Title => $text);
149 $self->{'tag'} = $self->{'text'} = '';
152 # This is an quote from the HTML3.2 DTD which shows which elements
153 # that might be present in a <HEAD>...</HEAD>. Also note that the
154 # <HEAD> tags themselves might be missing:
156 # <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
157 # SCRIPT* & META* & LINK*">
159 # <!ELEMENT HEAD O O (%head.content)>
164 my($self, $tag, $attr) = @_; # $attr is reference to a HASH
165 print "START[$tag]\n" if $DEBUG;
166 $self->flush_text if $self->{'tag'};
167 if ($tag eq 'meta') {
168 my $key = $attr->{'http-equiv'};
169 if (!defined($key) || !length($key)) {
170 return unless $attr->{'name'};
171 $key = "X-Meta-\u$attr->{'name'}";
173 $self->{'header'}->push_header($key => $attr->{content});
174 } elsif ($tag eq 'base') {
175 return unless exists $attr->{href};
176 $self->{'header'}->push_header('Content-Base' => $attr->{href});
177 } elsif ($tag eq 'isindex') {
178 # This is a non-standard header. Perhaps we should just ignore
180 $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
181 } elsif ($tag =~ /^(?:title|script|style)$/) {
182 # Just remember tag. Initialize header when we see the end tag.
183 $self->{'tag'} = $tag;
184 } elsif ($tag eq 'link') {
185 return unless exists $attr->{href};
186 # <link href="http:..." rel="xxx" rev="xxx" title="xxx">
187 my $h_val = "<" . delete($attr->{href}) . ">";
188 for (sort keys %{$attr}) {
189 $h_val .= qq(; $_="$attr->{$_}");
191 $self->{'header'}->push_header(Link => $h_val);
192 } elsif ($tag eq 'head' || $tag eq 'html') {
202 my($self, $tag) = @_;
203 print "END[$tag]\n" if $DEBUG;
204 $self->flush_text if $self->{'tag'};
205 $self->eof if $tag eq 'head';
210 my($self, $text) = @_;
211 $text =~ s/\x{FEFF}//; # drop Unicode BOM if found
212 print "TEXT[$text]\n" if $DEBUG;
213 my $tag = $self->{tag};
214 if (!$tag && $text =~ /\S/) {
215 # Normal text means start of body
219 return if $tag ne 'title';
220 $self->{'text'} .= $text;
231 $h = HTTP::Headers->new;
232 $p = HTML::HeadParser->new($h);
234 <title>Stupid example</title>
235 <base href="http://www.linpro.no/lwp/">
236 Normal text starts here.
239 print $h->title; # should print "Stupid example"
243 L<HTML::Parser>, L<HTTP::Headers>
245 The C<HTTP::Headers> class is distributed as part of the
246 I<libwww-perl> package. If you don't have that distribution installed
247 you need to provide the $header argument to the C<HTML::HeadParser>
248 constructor with your own object that implements the documented
253 Copyright 1996-2001 Gisle Aas. All rights reserved.
255 This library is free software; you can redistribute it and/or
256 modify it under the same terms as Perl itself.