X-Git-Url: http://vcs.maemo.org/git/?a=blobdiff_plain;f=dev%2Farm%2Flibhtml-parser-perl%2Flibhtml-parser-perl-3.56%2Fdebian%2Flibhtml-parser-perl%2Fusr%2Flib%2Fperl5%2FHTML%2FTokeParser.pm;fp=dev%2Farm%2Flibhtml-parser-perl%2Flibhtml-parser-perl-3.56%2Fdebian%2Flibhtml-parser-perl%2Fusr%2Flib%2Fperl5%2FHTML%2FTokeParser.pm;h=a1b8837cb4d38064e83fe8c8a3fee9c9c054ecf8;hb=f477fa73365d491991707e7ed9217b48d6994551;hp=0000000000000000000000000000000000000000;hpb=da95c414033799c3a62606f299c3c00b5c77ca11;p=dh-make-perl diff --git a/dev/arm/libhtml-parser-perl/libhtml-parser-perl-3.56/debian/libhtml-parser-perl/usr/lib/perl5/HTML/TokeParser.pm b/dev/arm/libhtml-parser-perl/libhtml-parser-perl-3.56/debian/libhtml-parser-perl/usr/lib/perl5/HTML/TokeParser.pm new file mode 100644 index 0000000..a1b8837 --- /dev/null +++ b/dev/arm/libhtml-parser-perl/libhtml-parser-perl-3.56/debian/libhtml-parser-perl/usr/lib/perl5/HTML/TokeParser.pm @@ -0,0 +1,371 @@ +package HTML::TokeParser; + +# $Id: TokeParser.pm,v 2.37 2006/04/26 08:00:28 gisle Exp $ + +require HTML::PullParser; +@ISA=qw(HTML::PullParser); +$VERSION = sprintf("%d.%02d", q$Revision: 2.37 $ =~ /(\d+)\.(\d+)/); + +use strict; +use Carp (); +use HTML::Entities qw(decode_entities); +use HTML::Tagset (); + +my %ARGS = +( + start => "'S',tagname,attr,attrseq,text", + end => "'E',tagname,text", + text => "'T',text,is_cdata", + process => "'PI',token0,text", + comment => "'C',text", + declaration => "'D',text", + + # options that default on + unbroken_text => 1, +); + + +sub new +{ + my $class = shift; + my %cnf; + if (@_ == 1) { + my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; + %cnf = ($type => $_[0]); + } + else { + %cnf = @_; + } + + my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; + + my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; + + $self->{textify} = $textify; + $self; +} + + +sub get_tag +{ + my $self = shift; + my $token; + while (1) { + $token = $self->get_token || return undef; + my $type = shift @$token; + next unless $type eq "S" || $type eq "E"; + substr($token->[0], 0, 0) = "/" if $type eq "E"; + return $token unless @_; + for (@_) { + return $token if $token->[0] eq $_; + } + } +} + + +sub _textify { + my($self, $token) = @_; + my $tag = $token->[1]; + return undef unless exists $self->{textify}{$tag}; + + my $alt = $self->{textify}{$tag}; + my $text; + if (ref($alt)) { + $text = &$alt(@$token); + } else { + $text = $token->[2]{$alt || "alt"}; + $text = "[\U$tag]" unless defined $text; + } + return $text; +} + + +sub get_text +{ + my $self = shift; + my @text; + while (my $token = $self->get_token) { + my $type = $token->[0]; + if ($type eq "T") { + my $text = $token->[1]; + decode_entities($text) unless $token->[2]; + push(@text, $text); + } elsif ($type =~ /^[SE]$/) { + my $tag = $token->[1]; + if ($type eq "S") { + if (defined(my $text = _textify($self, $token))) { + push(@text, $text); + next; + } + } else { + $tag = "/$tag"; + } + if (!@_ || grep $_ eq $tag, @_) { + $self->unget_token($token); + last; + } + push(@text, " ") + if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]}; + } + } + join("", @text); +} + + +sub get_trimmed_text +{ + my $self = shift; + my $text = $self->get_text(@_); + $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; + $text; +} + +sub get_phrase { + my $self = shift; + my @text; + while (my $token = $self->get_token) { + my $type = $token->[0]; + if ($type eq "T") { + my $text = $token->[1]; + decode_entities($text) unless $token->[2]; + push(@text, $text); + } elsif ($type =~ /^[SE]$/) { + my $tag = $token->[1]; + if ($type eq "S") { + if (defined(my $text = _textify($self, $token))) { + push(@text, $text); + next; + } + } + if (!$HTML::Tagset::isPhraseMarkup{$tag}) { + $self->unget_token($token); + last; + } + push(@text, " ") if $tag eq "br"; + } + } + my $text = join("", @text); + $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; + $text; +} + +1; + + +__END__ + +=head1 NAME + +HTML::TokeParser - Alternative HTML::Parser interface + +=head1 SYNOPSIS + + require HTML::TokeParser; + $p = HTML::TokeParser->new("index.html") || + die "Can't open: $!"; + $p->empty_element_tags(1); # configure its behaviour + + while (my $token = $p->get_token) { + #... + } + +=head1 DESCRIPTION + +The C is an alternative interface to the +C class. It is an C subclass with a +predeclared set of token types. If you wish the tokens to be reported +differently you probably want to use the C directly. + +The following methods are available: + +=over 4 + +=item $p = HTML::TokeParser->new( $filename, %opt ); + +=item $p = HTML::TokeParser->new( $filehandle, %opt ); + +=item $p = HTML::TokeParser->new( \$document, %opt ); + +The object constructor argument is either a file name, a file handle +object, or the complete document to be parsed. Extra options can be +provided as key/value pairs and are processed as documented by the base +classes. + +If the argument is a plain scalar, then it is taken as the name of a +file to be opened and parsed. If the file can't be opened for +reading, then the constructor will return C and $! will tell +you why it failed. + +If the argument is a reference to a plain scalar, then this scalar is +taken to be the literal document to parse. The value of this +scalar should not be changed before all tokens have been extracted. + +Otherwise the argument is taken to be some object that the +C can read() from when it needs more data. Typically +it will be a filehandle of some kind. The stream will be read() until +EOF, but not closed. + +A newly constructed C differ from its base classes +by having the C attribute enabled by default. See +L for a description of this and other attributes that +influence how the document is parsed. It is often a good idea to enable +C behaviour. + +Note that the parsing result will likely not be valid if raw undecoded +UTF-8 is used as a source. When parsing UTF-8 encoded files turn +on UTF-8 decoding: + + open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!"; + my $p = HTML::TokeParser->new( $fh ); + # ... + +If a $filename is passed to the constructor the file will be opened in +raw mode and the parsing result will only be valid if its content is +Latin-1 or pure ASCII. + +If parsing from an UTF-8 encoded string buffer decode it first: + + utf8::decode($document); + my $p = HTML::TokeParser->new( \$document ); + # ... + +=item $p->get_token + +This method will return the next I found in the HTML document, +or C at the end of the document. The token is returned as an +array reference. The first element of the array will be a string +denoting the type of this token: "S" for start tag, "E" for end tag, +"T" for text, "C" for comment, "D" for declaration, and "PI" for +process instructions. The rest of the token array depend on the type +like this: + + ["S", $tag, $attr, $attrseq, $text] + ["E", $tag, $text] + ["T", $text, $is_data] + ["C", $text] + ["D", $text] + ["PI", $token0, $text] + +where $attr is a hash reference, $attrseq is an array reference and +the rest are plain scalars. The L explains the +details. + +=item $p->unget_token( @tokens ) + +If you find you have read too many tokens you can push them back, +so that they are returned the next time $p->get_token is called. + +=item $p->get_tag + +=item $p->get_tag( @tags ) + +This method returns the next start or end tag (skipping any other +tokens), or C if there are no more tags in the document. If +one or more arguments are given, then we skip tokens until one of the +specified tag types is found. For example: + + $p->get_tag("font", "/font"); + +will find the next start or end tag for a font-element. + +The tag information is returned as an array reference in the same form +as for $p->get_token above, but the type code (first element) is +missing. A start tag will be returned like this: + + [$tag, $attr, $attrseq, $text] + +The tagname of end tags are prefixed with "/", i.e. end tag is +returned like this: + + ["/$tag", $text] + +=item $p->get_text + +=item $p->get_text( @endtags ) + +This method returns all text found at the current position. It will +return a zero length string if the next token is not text. Any +entities will be converted to their corresponding character. + +If one or more arguments are given, then we return all text occurring +before the first of the specified tags found. For example: + + $p->get_text("p", "br"); + +will return the text up to either a paragraph of linebreak element. + +The text might span tags that should be I. This is +controlled by the $p->{textify} attribute, which is a hash that +defines how certain tags can be treated as text. If the name of a +start tag matches a key in this hash then this tag is converted to +text. The hash value is used to specify which tag attribute to obtain +the text from. If this tag attribute is missing, then the upper case +name of the tag enclosed in brackets is returned, e.g. "[IMG]". The +hash value can also be a subroutine reference. In this case the +routine is called with the start tag token content as its argument and +the return value is treated as the text. + +The default $p->{textify} value is: + + {img => "alt", applet => "alt"} + +This means that and tags are treated as text, and that +the text to substitute can be found in the ALT attribute. + +=item $p->get_trimmed_text + +=item $p->get_trimmed_text( @endtags ) + +Same as $p->get_text above, but will collapse any sequences of white +space to a single space character. Leading and trailing white space is +removed. + +=item $p->get_phrase + +This will return all text found at the current position ignoring any +phrasal-level tags. Text is extracted until the first non +phrasal-level tag. Textification of tags is the same as for +get_text(). This method will collapse white space in the same way as +get_trimmed_text() does. + +The definition of phrasal-level tags is obtained from the +HTML::Tagset module. + +=back + +=head1 EXAMPLES + +This example extracts all links from a document. It will print one +line for each link, containing the URL and the textual description +between the ... tags: + + use HTML::TokeParser; + $p = HTML::TokeParser->new(shift||"index.html"); + + while (my $token = $p->get_tag("a")) { + my $url = $token->[1]{href} || "-"; + my $text = $p->get_trimmed_text("/a"); + print "$url\t$text\n"; + } + +This example extract the from the document: + + use HTML::TokeParser; + $p = HTML::TokeParser->new(shift||"index.html"); + if ($p->get_tag("title")) { + my $title = $p->get_trimmed_text; + print "Title: $title\n"; + } + +=head1 SEE ALSO + +L<HTML::PullParser>, L<HTML::Parser> + +=head1 COPYRIGHT + +Copyright 1998-2005 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut