# Fixed plan: the script declares exactly 103 assertions.
5 use Test::More tests => 103;
# On perls older than 5.8 every one of the 103 tests is skipped.
8 skip "This perl does not support Unicode", 103 if $] < 5.008;
# Custom warning hook (its body is not visible in this excerpt); later
# assertions inspect $warn[0].
11 $SIG{__WARN__} = sub {
# One parser instance is reused by the assertions that follow.
16 my $p = HTML::Parser->new(
# Every event is recorded into @parsed.  The argspec order fixes the
# indices used by the assertions:
#   [0] event  [1] text  [2] dtext  [3] offset  [4] length
#   [5] offset_end  [6] column  [7] tokenpos  [8] attr
18 default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'],
# Test document: literal wide characters plus one numeric character
# reference (&#x263a).  The reference must stay an *entity* in the source so
# that the raw 'text' field and the decoded 'dtext' field can be told apart.
21 my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile &#x263a</h1>\x{0420}";
26 #use Data::Dump; Data::Dump::dump(@parsed);
# --- Pass 1: the document is parsed as a Perl Unicode (character) string ---
29 is($parsed[0][0], "start_document");
31 is($parsed[1][0], "start");
32 is($parsed[1][1], "<title>");
# The tag text is expected to carry the UTF-8 flag; skipped when
# utf8::is_utf8 is unavailable.
33 SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") };
# Title text is the single character U+263A, both raw and decoded.
37 is($parsed[2][0], "text");
38 is(ord($parsed[2][1]), 0x263A);
39 is($parsed[2][2], chr(0x263A));
45 is($parsed[3][0], "end");
46 is($parsed[3][1], "</title>");
50 is($parsed[4][0], "start");
51 is($parsed[4][1], "<h1 id=\x{2600} f>");
# tokenpos is a flat list of position/length pairs; here the attribute value
# is one character long (…|7|1|…) and the bare attribute "f" starts at 9.
52 is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
53 is($parsed[4][8]{id}, "\x{2600}");
# Raw text keeps the character reference verbatim; dtext holds the decoded
# character.
55 is($parsed[5][0], "text");
56 is($parsed[5][1], "Smile &#x263a");
57 is($parsed[5][2], "Smile \x{263A}");
# Trailing text after </h1> is the single character U+0420.
59 is($parsed[7][0], "text");
60 is($parsed[7][1], "\x{0420}");
61 is($parsed[7][2], "\x{0420}");
# At end of document, offset, offset_end and column all equal length($doc).
63 is($parsed[8][0], "end_document");
64 is($parsed[8][3], length($doc));
65 is($parsed[8][5], length($doc));
66 is($parsed[8][6], length($doc));
# --- Pass 2: same document, parsed as UTF-8 encoded bytes ---
# NOTE(review): the re-encoding of $doc and the second parse call are in
# lines not shown in this excerpt.
69 # Try to parse it as a UTF-8 encoded string
76 #use Data::Dump; Data::Dump::dump(@parsed);
79 is($parsed[0][0], "start_document");
81 is($parsed[1][0], "start");
82 is($parsed[1][1], "<title>");
# This time the tag text must NOT carry the UTF-8 flag.
83 SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") };
# The title text is now the three bytes E2 98 BA (first byte 0xE2 == 226).
87 is($parsed[2][0], "text");
88 is(ord($parsed[2][1]), 226);
89 is($parsed[2][1], "\xE2\x98\xBA");
90 is($parsed[2][2], "\xE2\x98\xBA");
# offset_end counts bytes: 7 for "<title>" plus 3 for the encoded character.
93 is($parsed[2][5], 10);
96 is($parsed[3][0], "end");
97 is($parsed[3][1], "</title>");
98 is($parsed[3][3], 10);
99 is($parsed[3][6], 10);
101 is($parsed[4][0], "start");
102 is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>");
# The attribute value is now three bytes long, so "f" moves from 9 to 11.
103 is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
104 is($parsed[4][8]{id}, "\xE2\x98\x80");
# The raw text is unchanged (the reference is plain ASCII), and dtext still
# decodes it to U+263A.
106 is($parsed[5][0], "text");
107 is($parsed[5][1], "Smile &#x263a");
108 is($parsed[5][2], "Smile \x{263A}");
110 is($parsed[8][0], "end_document");
111 is($parsed[8][3], length($doc));
112 is($parsed[8][5], length($doc));
113 is($parsed[8][6], length($doc));
# The first captured warning must be the undecoded-UTF-8 diagnostic.
116 like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
# Write a fixture file through a :utf8 layer, then parse it three ways.
# The fixture mixes the entity reference &hearts; with the literal character
# U+2665 so that entity decoding and byte/character handling can be told
# apart.  The two markup lines below are file content (not code), so they
# carry no comments.
118 my $file = "test-$$.html";
119 open(my $fh, ">:utf8", $file) || die "Can't open $file for writing: $!";
122 <title>\x{263A} Love! </title>
123 <h1 id=&hearts;\x{2665}>&hearts; Love \x{2665}<h1>
# --- Case 1: parse_file() on the file name ---
129 $p->parse_file($file);
131 is($parsed[6][0], "start");
# The entity decodes to the character U+2665 while the literal heart stays as
# raw UTF-8 bytes, so the decoded values are a mix of the two ("garbage").
132 is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
133 is($parsed[7][0], "text");
134 is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
135 is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage
# The final offset equals the file size in bytes.
136 is($parsed[10][3], -s $file);
# The undecoded-UTF-8 diagnostic must be the first captured warning.
138 like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
# --- Case 2: handle opened with a :utf8 layer; the parser sees characters ---
142 open($fh, "<:raw:utf8", $file) || die "Can't open $file for reading: $!";
145 is($parsed[6][0], "start");
# Both the entity and the literal now come out as the character U+2665.
146 is($parsed[6][8]{id}, "\x{2665}\x{2665}");
147 is($parsed[7][0], "text");
148 is($parsed[7][1], "&hearts; Love \x{2665}");
149 is($parsed[7][2], "\x{2665} Love \x{2665}");
# The offset is smaller than the byte size by 2 * 4.
# NOTE(review): presumably four 3-byte characters each count as one once
# decoded - confirm against the full fixture, which is only partly visible.
150 is($parsed[10][3], (-s $file) - 2 * 4);
# --- Case 3: raw byte handle ---
# NOTE(review): the parser is presumably reconfigured in a line not shown
# here, since the entity is now expected to decode to UTF-8 bytes - confirm.
155 open($fh, "<:raw", $file) || die "Can't open $file for reading: $!";
159 is($parsed[6][0], "start");
160 is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
161 is($parsed[7][0], "text");
162 is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
163 is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
# Byte offsets again: the final offset equals the file size.
164 is($parsed[10][3], -s $file);
# A query string inside an attribute value: "&times" has to appear as an
# entity reference in the input.  The href assertion expects it decoded to
# U+00D7 (\xd7) while "&lang" is left untouched.
170 $p->parse(q(<a href="a=1&lang=2&times=3">foo</a>))->eof;
172 is($parsed[1][0], "start");
173 is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
# Checks on HTML::Entities::_probably_utf8_chunk().  Each assertion carries a
# description so a failure can be identified in the TAP output.
175 ok(!HTML::Entities::_probably_utf8_chunk(""), "empty chunk is not flagged");
176 ok(!HTML::Entities::_probably_utf8_chunk("f"), "plain ASCII is not flagged");
177 ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"), "complete 3-byte sequence is flagged");
178 ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"), "complete sequence followed by ASCII is flagged");
179 ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"), "complete sequence plus trailing lead byte is flagged");
180 ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"), "complete sequence plus trailing 2-byte partial is flagged");
181 ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"), "lone lead byte is not flagged");
182 ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"), "lone truncated sequence is not flagged");