vcs.maemo.org Git - pkg-perl/blob - deb-src/libhtml-parser-perl/libhtml-parser-perl-3.56/t/parser.t

   1 use Test::More tests => 7;
   2
   3 $HTML = <<'HTML';
   4
   5 <!DOCTYPE HTML>
   6
   7 <body>
   8
   9 Various entities.  The parser must never break them in the middle:
  10
  11 &#x2F
  12 &#x2F;
  13 &#200
  14 &#3030;
  15 &#XFFFF;
  16 &aring-&Aring;
  17
  18 <ul>
  19 <li><a href="foo 'bar' baz>" id=33>This is a link</a>
  20 <li><a href='foo "bar" baz> &aring' id=34>This is another one</a>
  21 </ul>
  22
  23 <p><div align="center"><img src="http://www.perl.com/perl.gif"
  24 alt="camel"></div>
  25
  26 <!-- this is
  27 a comment --> and this is not.
  28
  29 <!-- this is the kind of >comment< -- --> that Netscape hates -->
  30
  31 < this > was not a tag. <this is/not either>
  32
  33 </body>
  34
  35 HTML
  36
  37 #-------------------------------------------------------------------
  38
  39 {
  40     package P;
  41     require HTML::Parser;
  42     @ISA=qw(HTML::Parser);
  43     $OUT='';
  44     $COUNT=0;
  45
  46     sub new
  47     {
  48         my $class = shift;
  49         my $self = $class->SUPER::new;
  50         $OUT = '';
  51         die "Can only have one" if $COUNT++;
  52         $self;
  53     }
  54
  55     sub DESTROY
  56     {
  57         my $self = shift;
  58         eval { $self->SUPER::DESTROY; };
  59         $COUNT--;
  60     }
  61
  62     sub declaration
  63     {
  64         my($self, $decl) = @_;
  65         $OUT .= "[[$decl]]|";
  66     }
  67
  68     sub start
  69     {
  70         my($self, $tag, $attr) = @_;
  71         $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr);
  72         $attr = "/$attr" if length $attr;
  73         $OUT .= "<<$tag$attr>>|";
  74     }
  75
  76     sub end
  77     {
  78         my($self, $tag) = @_;
  79         $OUT .= ">>$tag<<|";
  80     }
  81
  82     sub comment
  83     {
  84         my($self, $comment) = @_;
  85         $OUT .= "##$comment##|";
  86     }
  87
  88     sub text
  89     {
  90         my($self, $text) = @_;
  91         #$text =~ s/\n/\\n/g;
  92         #$text =~ s/\t/\\t/g;
  93         #$text =~ s/ /·/g;
  94         $OUT .= "$text|";
  95     }
  96
  97     sub result
  98     {
  99         $OUT;
 100     }
 101 }
 102
 103 for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") {
 104 #for $chunksize (1) {
 105     if ($chunksize =~ /^file/) {
 106         #print "Parsing from $chunksize";
 107     } else {
 108         #print "Parsing using $chunksize byte chunks";
 109     }
 110     my $p = P->new;
 111
 112     if ($chunksize =~ /^file/) {
 113         # First we must create the file
 114         my $tmpfile = "tmp-$$.html";
 115         my $file = $tmpfile;
 116         die "$file already exists" if -e $file;
 117         open(FILE, ">$file") or die "Can't create $file: $!";
 118         binmode FILE;
 119         print FILE $HTML;
 120         close(FILE);
 121
 122         if ($chunksize eq "filehandle") {
 123             require FileHandle;
 124             my $fh = FileHandle->new($file) || die "Can't open $file: $!";
 125             $file = $fh;
 126         }
 127
 128         # then we can parse it.
 129         $p->parse_file($file);
 130         close $file if $chunksize eq "filehandle";
 131         unlink($tmpfile) || warn "Can't unlink $tmpfile: $!";
 132     } else {
 133         my $copy = $HTML;
 134         while (length $copy) {
 135             my $chunk = substr($copy, 0, $chunksize);
 136             substr($copy, 0, $chunksize) = '';
 137             $p->parse($chunk);
 138         }
 139         $p->eof;
 140     }
 141
 142     my $res = $p->result;
 143     my $bad;
 144
 145     # Then we start looking for things that should not happen
 146     if ($res =~ /\s\|\s/) {
 147         diag "broken space";
 148         $bad++;
 149     }
 150     for (
 151          # Make sure entities are not broken
 152          '&#x2F', '&#x2F;', '&#200', '&#3030;', '&#XFFFF;', '&aring', '&Aring',
 153
 154          # Some elements that should be produced
 155          "|[[DOCTYPE HTML]]|",
 156          "|## this is\na comment ##|",
 157          "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|",
 158          '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>',
 159          "|>>ul<<|", "|>>body<<|\n\n|",
 160         )
 161    {
 162         if (index($res, $_) < 0) {
 163             diag "Can't find '$_' in parsed document";
 164             $bad++;
 165         }
 166     }
 167
 168     diag $res if $bad || $ENV{PRINT_RESULTS};
 169
 170     # And we check that we get the same result all the time
 171     $res =~ s/\|//g;  # remove all break marks
 172     if ($last_res && $res ne $last_res) {
 173         diag "The result is not the same as last time";
 174         $bad++;
 175     }
 176     $last_res = $res;
 177
 178     unless ($res =~ /Various entities/) {
 179         diag "Some text must be missing";
 180         $bad++;
 181     }
 182
 183     ok(!$bad);
 184 }