vcs.maemo.org Git - pkg-perl/blob - deb-src/libhtml-parser-perl/libhtml-parser-perl-3.56/t/unicode.t

   1 #!perl -w
   2
   3 use strict;
   4 use HTML::Parser;
   5 use Test::More tests => 103;
   6
   7 SKIP: {
   8 skip "This perl does not support Unicode", 103 if $] < 5.008;
   9
  10 my @warn;
  11 $SIG{__WARN__} = sub {
  12     push(@warn, $_[0]);
  13 };
  14
  15 my @parsed;
  16 my $p = HTML::Parser->new(
  17   api_version => 3,
  18   default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'],
  19 );
  20
  21 my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile &#x263a</h1>\x{0420}";
  22 is(length($doc), 46);
  23
  24 $p->parse($doc)->eof;
  25
  26 #use Data::Dump; Data::Dump::dump(@parsed);
  27
  28 is(@parsed, 9);
  29 is($parsed[0][0], "start_document");
  30
  31 is($parsed[1][0], "start");
  32 is($parsed[1][1], "<title>");
  33 SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") };
  34 is($parsed[1][3], 0);
  35 is($parsed[1][4], 7);
  36
  37 is($parsed[2][0], "text");
  38 is(ord($parsed[2][1]), 0x263A);
  39 is($parsed[2][2], chr(0x263A));
  40 is($parsed[2][3], 7);
  41 is($parsed[2][4], 1);
  42 is($parsed[2][5], 8);
  43 is($parsed[2][6], 7);
  44
  45 is($parsed[3][0], "end");
  46 is($parsed[3][1], "</title>");
  47 is($parsed[3][3], 8);
  48 is($parsed[3][6], 8);
  49
  50 is($parsed[4][0], "start");
  51 is($parsed[4][1], "<h1 id=\x{2600} f>");
  52 is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
  53 is($parsed[4][8]{id}, "\x{2600}");
  54
  55 is($parsed[5][0], "text");
  56 is($parsed[5][1], "Smile &#x263a");
  57 is($parsed[5][2], "Smile \x{263A}");
  58
  59 is($parsed[7][0], "text");
  60 is($parsed[7][1], "\x{0420}");
  61 is($parsed[7][2], "\x{0420}");
  62
  63 is($parsed[8][0], "end_document");
  64 is($parsed[8][3], length($doc));
  65 is($parsed[8][5], length($doc));
  66 is($parsed[8][6], length($doc));
  67 is(@warn, 0);
  68
  69 # Try to parse it as an UTF8 encoded string
  70 utf8::encode($doc);
  71 is(length($doc), 51);
  72
  73 @parsed = ();
  74 $p->parse($doc)->eof;
  75
  76 #use Data::Dump; Data::Dump::dump(@parsed);
  77
  78 is(@parsed, 9);
  79 is($parsed[0][0], "start_document");
  80
  81 is($parsed[1][0], "start");
  82 is($parsed[1][1], "<title>");
  83 SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") };
  84 is($parsed[1][3], 0);
  85 is($parsed[1][4], 7);
  86
  87 is($parsed[2][0], "text");
  88 is(ord($parsed[2][1]), 226);
  89 is($parsed[2][1], "\xE2\x98\xBA");
  90 is($parsed[2][2], "\xE2\x98\xBA");
  91 is($parsed[2][3], 7);
  92 is($parsed[2][4], 3);
  93 is($parsed[2][5], 10);
  94 is($parsed[2][6], 7);
  95
  96 is($parsed[3][0], "end");
  97 is($parsed[3][1], "</title>");
  98 is($parsed[3][3], 10);
  99 is($parsed[3][6], 10);
 100
 101 is($parsed[4][0], "start");
 102 is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>");
 103 is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
 104 is($parsed[4][8]{id}, "\xE2\x98\x80");
 105
 106 is($parsed[5][0], "text");
 107 is($parsed[5][1], "Smile &#x263a");
 108 is($parsed[5][2], "Smile \x{263A}");
 109
 110 is($parsed[8][0], "end_document");
 111 is($parsed[8][3], length($doc));
 112 is($parsed[8][5], length($doc));
 113 is($parsed[8][6], length($doc));
 114
 115 is(@warn, 1);
 116 like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
 117
 118 my $file = "test-$$.html";
 119 open(my $fh, ">:utf8", $file) || die;
 120 print $fh <<EOT;
 121 \x{FEFF}
 122 <title>\x{263A} Love! </title>
 123 <h1 id=&hearts;\x{2665}>&hearts; Love \x{2665}<h1>
 124 EOT
 125 close($fh) || die;
 126
 127 @warn = ();
 128 @parsed = ();
 129 $p->parse_file($file);
 130 is(@parsed, "11");
 131 is($parsed[6][0], "start");
 132 is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
 133 is($parsed[7][0], "text");
 134 is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
 135 is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5");  # expected garbage
 136 is($parsed[10][3], -s $file);
 137 is(@warn, 1);
 138 like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
 139
 140 @warn = ();
 141 @parsed = ();
 142 open($fh, "<:raw:utf8", $file) || die;
 143 $p->parse_file($fh);
 144 is(@parsed, "11");
 145 is($parsed[6][0], "start");
 146 is($parsed[6][8]{id}, "\x{2665}\x{2665}");
 147 is($parsed[7][0], "text");
 148 is($parsed[7][1], "&hearts; Love \x{2665}");
 149 is($parsed[7][2], "\x{2665} Love \x{2665}");
 150 is($parsed[10][3], (-s $file) - 2 * 4);
 151 is(@warn, 0);
 152
 153 @warn = ();
 154 @parsed = ();
 155 open($fh, "<:raw", $file) || die;
 156 $p->utf8_mode(1);
 157 $p->parse_file($fh);
 158 is(@parsed, "11");
 159 is($parsed[6][0], "start");
 160 is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
 161 is($parsed[7][0], "text");
 162 is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
 163 is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
 164 is($parsed[10][3], -s $file);
 165 is(@warn, 0);
 166
 167 unlink($file);
 168
 169 @parsed = ();
 170 $p->parse(q(<a href="a=1&lang=2&times=3">foo</a>))->eof;
 171 is(@parsed, "5");
 172 is($parsed[1][0], "start");
 173 is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
 174
 175 ok(!HTML::Entities::_probably_utf8_chunk(""));
 176 ok(!HTML::Entities::_probably_utf8_chunk("f"));
 177 ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"));
 178 ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"));
 179 ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"));
 180 ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"));
 181 ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"));
 182 ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"));
 183 }