1 package HTML::Entities;
3 # $Id: Entities.pm,v 1.35 2006/03/22 09:15:23 gisle Exp $
7 HTML::Entities - Encode or decode strings with HTML entities
13 $a = "Våre norske tegn bør æres";
15 encode_entities($a, "\200-\377");
19 $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
20 print encode_entities($input), "\n"
24 vis-&agrave;-vis Beyonc&eacute;&#39;s na&iuml;ve
25 papier-m&acirc;ch&eacute; r&eacute;sum&eacute;
29 This module deals with encoding and decoding of strings with HTML
30 character entities. The module provides the following functions:
34 =item decode_entities( $string, ... )
36 This routine replaces HTML entities found in the $string with the
37 corresponding Unicode character. Under perl 5.6 and earlier only
38 characters in the Latin-1 range are replaced. Unrecognized
39 entities are left alone.
41 If multiple strings are provided as argument they are each decoded
42 separately and the same number of strings are returned.
44 If called in void context the arguments are decoded in-place.
46 This routine is exported by default.
48 =item _decode_entities( $string, \%entity2char )
50 =item _decode_entities( $string, \%entity2char, $expand_prefix )
52 This will in-place replace HTML entities in $string. The %entity2char
53 hash must be provided. Named entities not found in the %entity2char
54 hash are left alone. Numeric entities are expanded unless their value
57 The keys in %entity2char are the entity names to be expanded and their
58 values are what they should expand into. The values do not have to be
59 single character strings. If a key has ";" as suffix,
60 then occurrences in $string are only expanded if properly terminated
61 with ";". Entities without ";" will be expanded regardless of how
62 they are terminated for compatibility with how common browsers treat
63 entities in the Latin-1 range.
65 If $expand_prefix is TRUE then entities without trailing ";" in
66 %entity2char will even be expanded as a prefix of a longer
67 unrecognized name. The longest matching name in %entity2char will be
68 used. This is mainly present for compatibility with an MSIE
71 $string = "foo&nbspbar";
72 _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
73 print $string; # will print "foo bar" (with a non-breaking space, "\xA0")
75 This routine is exported by default.
77 =item encode_entities( $string )
79 =item encode_entities( $string, $unsafe_chars )
81 This routine replaces unsafe characters in $string with their entity
82 representation. A second argument can be given to specify which
83 characters to consider unsafe (i.e., which to escape). The default set
84 of characters to encode consists of control chars, high-bit chars, and the
85 C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >>
86 characters. But this, for example, would encode I<just> the
87 C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters:
89 $encoded = encode_entities($input, '<>&"');
91 This routine is exported by default.
93 =item encode_entities_numeric( $string )
95 =item encode_entities_numeric( $string, $unsafe_chars )
97 This routine works just like encode_entities, except that the replacement
98 entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
99 example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
100 C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
102 This routine is I<not> exported by default. But you can always
103 export it with C<use HTML::Entities qw(encode_entities_numeric);>
104 or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
108 All these routines modify the string passed as the first argument, if
109 called in a void context. In scalar and list contexts, the encoded or
110 decoded string is returned (without changing the input string).
112 If you prefer not to import these routines into your namespace, you can
115 use HTML::Entities ();
116 $decoded = HTML::Entities::decode($a);
117 $encoded = HTML::Entities::encode($a);
118 $encoded = HTML::Entities::encode_numeric($a);
120 The module can also export the %char2entity and the %entity2char
121 hashes, which contain the mapping from all characters to the
122 corresponding entities (and vice versa, respectively).
126 Copyright 1995-2006 Gisle Aas. All rights reserved.
128 This library is free software; you can redistribute it and/or
129 modify it under the same terms as Perl itself.
134 use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # package globals, declared with the pre-"our" idiom
135 use vars qw(%entity2char %char2entity); # lookup tables: entity name => char, and char => "&entity;"
141 @EXPORT = qw(encode_entities decode_entities _decode_entities); # names exported by default
142 @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); # names exported only on request
144 $VERSION = sprintf("%d.%02d", q$Revision: 1.35 $ =~ /(\d+)\.(\d+)/); # "major.minor" taken from the RCS Revision keyword
145 sub Version { $VERSION; } # returns the module version string
147 require HTML::Parser; # for fast XS implemented decode_entities
151 # Some normal chars that have special meaning in SGML context
152 amp => '&', # ampersand
153 'gt' => '>', # greater than
154 'lt' => '<', # less than
155 quot => '"', # double quote
156 apos => "'", # single quote
158 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
159 AElig => chr(198), # capital AE diphthong (ligature)
160 Aacute => chr(193), # capital A, acute accent
161 Acirc => chr(194), # capital A, circumflex accent
162 Agrave => chr(192), # capital A, grave accent
163 Aring => chr(197), # capital A, ring
164 Atilde => chr(195), # capital A, tilde
165 Auml => chr(196), # capital A, dieresis or umlaut mark
166 Ccedil => chr(199), # capital C, cedilla
167 ETH => chr(208), # capital Eth, Icelandic
168 Eacute => chr(201), # capital E, acute accent
169 Ecirc => chr(202), # capital E, circumflex accent
170 Egrave => chr(200), # capital E, grave accent
171 Euml => chr(203), # capital E, dieresis or umlaut mark
172 Iacute => chr(205), # capital I, acute accent
173 Icirc => chr(206), # capital I, circumflex accent
174 Igrave => chr(204), # capital I, grave accent
175 Iuml => chr(207), # capital I, dieresis or umlaut mark
176 Ntilde => chr(209), # capital N, tilde
177 Oacute => chr(211), # capital O, acute accent
178 Ocirc => chr(212), # capital O, circumflex accent
179 Ograve => chr(210), # capital O, grave accent
180 Oslash => chr(216), # capital O, slash
181 Otilde => chr(213), # capital O, tilde
182 Ouml => chr(214), # capital O, dieresis or umlaut mark
183 THORN => chr(222), # capital THORN, Icelandic
184 Uacute => chr(218), # capital U, acute accent
185 Ucirc => chr(219), # capital U, circumflex accent
186 Ugrave => chr(217), # capital U, grave accent
187 Uuml => chr(220), # capital U, dieresis or umlaut mark
188 Yacute => chr(221), # capital Y, acute accent
189 aacute => chr(225), # small a, acute accent
190 acirc => chr(226), # small a, circumflex accent
191 aelig => chr(230), # small ae diphthong (ligature)
192 agrave => chr(224), # small a, grave accent
193 aring => chr(229), # small a, ring
194 atilde => chr(227), # small a, tilde
195 auml => chr(228), # small a, dieresis or umlaut mark
196 ccedil => chr(231), # small c, cedilla
197 eacute => chr(233), # small e, acute accent
198 ecirc => chr(234), # small e, circumflex accent
199 egrave => chr(232), # small e, grave accent
200 eth => chr(240), # small eth, Icelandic
201 euml => chr(235), # small e, dieresis or umlaut mark
202 iacute => chr(237), # small i, acute accent
203 icirc => chr(238), # small i, circumflex accent
204 igrave => chr(236), # small i, grave accent
205 iuml => chr(239), # small i, dieresis or umlaut mark
206 ntilde => chr(241), # small n, tilde
207 oacute => chr(243), # small o, acute accent
208 ocirc => chr(244), # small o, circumflex accent
209 ograve => chr(242), # small o, grave accent
210 oslash => chr(248), # small o, slash
211 otilde => chr(245), # small o, tilde
212 ouml => chr(246), # small o, dieresis or umlaut mark
213 szlig => chr(223), # small sharp s, German (sz ligature)
214 thorn => chr(254), # small thorn, Icelandic
215 uacute => chr(250), # small u, acute accent
216 ucirc => chr(251), # small u, circumflex accent
217 ugrave => chr(249), # small u, grave accent
218 uuml => chr(252), # small u, dieresis or umlaut mark
219 yacute => chr(253), # small y, acute accent
220 yuml => chr(255), # small y, dieresis or umlaut mark
222 # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
223 copy => chr(169), # copyright sign
224 reg => chr(174), # registered sign
225 nbsp => chr(160), # non breaking space
227 # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
238 'not' => chr(172), # not is a keyword in perl
257 'times' => chr(215), # times is a keyword in perl
261 'OElig;' => chr(338),
262 'oelig;' => chr(339),
263 'Scaron;' => chr(352),
264 'scaron;' => chr(353),
268 'tilde;' => chr(732),
269 'Alpha;' => chr(913),
271 'Gamma;' => chr(915),
272 'Delta;' => chr(916),
273 'Epsilon;' => chr(917),
276 'Theta;' => chr(920),
278 'Kappa;' => chr(922),
279 'Lambda;' => chr(923),
283 'Omicron;' => chr(927),
286 'Sigma;' => chr(931),
288 'Upsilon;' => chr(933),
292 'Omega;' => chr(937),
293 'alpha;' => chr(945),
295 'gamma;' => chr(947),
296 'delta;' => chr(948),
297 'epsilon;' => chr(949),
300 'theta;' => chr(952),
302 'kappa;' => chr(954),
303 'lambda;' => chr(955),
307 'omicron;' => chr(959),
310 'sigmaf;' => chr(962),
311 'sigma;' => chr(963),
313 'upsilon;' => chr(965),
317 'omega;' => chr(969),
318 'thetasym;' => chr(977),
319 'upsih;' => chr(978),
321 'ensp;' => chr(8194),
322 'emsp;' => chr(8195),
323 'thinsp;' => chr(8201),
324 'zwnj;' => chr(8204),
328 'ndash;' => chr(8211),
329 'mdash;' => chr(8212),
330 'lsquo;' => chr(8216),
331 'rsquo;' => chr(8217),
332 'sbquo;' => chr(8218),
333 'ldquo;' => chr(8220),
334 'rdquo;' => chr(8221),
335 'bdquo;' => chr(8222),
336 'dagger;' => chr(8224),
337 'Dagger;' => chr(8225),
338 'bull;' => chr(8226),
339 'hellip;' => chr(8230),
340 'permil;' => chr(8240),
341 'prime;' => chr(8242),
342 'Prime;' => chr(8243),
343 'lsaquo;' => chr(8249),
344 'rsaquo;' => chr(8250),
345 'oline;' => chr(8254),
346 'frasl;' => chr(8260),
347 'euro;' => chr(8364),
348 'image;' => chr(8465),
349 'weierp;' => chr(8472),
350 'real;' => chr(8476),
351 'trade;' => chr(8482),
352 'alefsym;' => chr(8501),
353 'larr;' => chr(8592),
354 'uarr;' => chr(8593),
355 'rarr;' => chr(8594),
356 'darr;' => chr(8595),
357 'harr;' => chr(8596),
358 'crarr;' => chr(8629),
359 'lArr;' => chr(8656),
360 'uArr;' => chr(8657),
361 'rArr;' => chr(8658),
362 'dArr;' => chr(8659),
363 'hArr;' => chr(8660),
364 'forall;' => chr(8704),
365 'part;' => chr(8706),
366 'exist;' => chr(8707),
367 'empty;' => chr(8709),
368 'nabla;' => chr(8711),
369 'isin;' => chr(8712),
370 'notin;' => chr(8713),
372 'prod;' => chr(8719),
374 'minus;' => chr(8722),
375 'lowast;' => chr(8727),
376 'radic;' => chr(8730),
377 'prop;' => chr(8733),
378 'infin;' => chr(8734),
385 'there4;' => chr(8756),
387 'cong;' => chr(8773),
388 'asymp;' => chr(8776),
390 'equiv;' => chr(8801),
395 'nsub;' => chr(8836),
396 'sube;' => chr(8838),
397 'supe;' => chr(8839),
398 'oplus;' => chr(8853),
399 'otimes;' => chr(8855),
400 'perp;' => chr(8869),
401 'sdot;' => chr(8901),
402 'lceil;' => chr(8968),
403 'rceil;' => chr(8969),
404 'lfloor;' => chr(8970),
405 'rfloor;' => chr(8971),
406 'lang;' => chr(9001),
407 'rang;' => chr(9002),
409 'spades;' => chr(9824),
410 'clubs;' => chr(9827),
411 'hearts;' => chr(9829),
412 'diams;' => chr(9830),
417 # Make the opposite mapping
418 while (my($entity, $char) = each(%entity2char)) {
420 $char2entity{$char} = "&$entity;";
422 delete $char2entity{"'"}; # only one-way decoding
424 # Fill in missing entities
426 next if exists $char2entity{chr($_)};
427 $char2entity{chr($_)} = "&#$_;";
430 my %subst; # compiled encoding regexps
432 sub decode_entities_old
435 if (defined wantarray) {
436 $array = [@_]; # copy
438 $array = \@_; # modify in-place
442 s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
443 s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
444 s/(&(\w+);?)/$entity2char{$2} || $1/eg;
446 wantarray ? @$array : $array->[0];
452 if (defined wantarray) {
456 $ref = \$_[0]; # modify in-place
458 if (defined $_[1] and length $_[1]) {
459 unless (exists $subst{$_[1]}) {
460 # Because we can't compile regex we fake it with a cached sub
461 my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
462 $subst{$_[1]} = eval $code;
463 die( $@ . " while trying to turn range: \"$_[1]\"\n "
464 . "into code: $code\n "
467 &{$subst{$_[1]}}($$ref);
469 # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
470 $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
475 sub encode_entities_numeric {
477 return &encode_entities; # a goto &encode_entities wouldn't work
482 sprintf "&#x%X;", ord($_[0]);
486 *encode = \&encode_entities; # short alias, callable as HTML::Entities::encode
487 *encode_numeric = \&encode_entities_numeric; # short alias, callable as HTML::Entities::encode_numeric
488 *encode_numerically = \&encode_entities_numeric; # second name for the same numeric encoder
489 *decode = \&decode_entities; # short alias, callable as HTML::Entities::decode