vcs.maemo.org Git - dh-make-perl/blob - dev/arm/libhtml-parser-perl/libhtml-parser-perl-3.56/debian/libhtml-parser-perl/usr/lib/perl5/HTML/Entities.pm

   1 package HTML::Entities;
   2
   3 # $Id: Entities.pm,v 1.35 2006/03/22 09:15:23 gisle Exp $
   4
   5 =head1 NAME
   6
   7 HTML::Entities - Encode or decode strings with HTML entities
   8
   9 =head1 SYNOPSIS
  10
  11  use HTML::Entities;
  12
  13  $a = "V&aring;re norske tegn b&oslash;r &#230res";
  14  decode_entities($a);
  15  encode_entities($a, "\200-\377");
  16
  17 For example, this:
  18
  19  $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
  20  print encode_entities($input), "\n";
  21
  22 Prints this out:
  23
  24  vis-&agrave;-vis Beyonc&eacute;'s na&iuml;ve
  25  papier-m&acirc;ch&eacute; r&eacute;sum&eacute;
  26
  27 =head1 DESCRIPTION
  28
  29 This module deals with encoding and decoding of strings with HTML
  30 character entities.  The module provides the following functions:
  31
  32 =over 4
  33
  34 =item decode_entities( $string, ... )
  35
  36 This routine replaces HTML entities found in the $string with the
  37 corresponding Unicode character.  Under perl 5.6 and earlier only
  38 characters in the Latin-1 range are replaced. Unrecognized
  39 entities are left alone.
  40
  41 If multiple strings are provided as argument they are each decoded
  42 separately and the same number of strings are returned.
  43
  44 If called in void context the arguments are decoded in-place.
  45
  46 This routine is exported by default.
  47
  48 =item _decode_entities( $string, \%entity2char )
  49
  50 =item _decode_entities( $string, \%entity2char, $expand_prefix )
  51
  52 This will in-place replace HTML entities in $string.  The %entity2char
  53 hash must be provided.  Named entities not found in the %entity2char
  54 hash are left alone.  Numeric entities are expanded unless their value
  55 overflows.
  56
  57 The keys in %entity2char are the entity names to be expanded and their
  58 values are what they should expand into.  The values do not have to be
  59 single character strings.  If a key has ";" as suffix,
  60 then occurrences in $string are only expanded if properly terminated
  61 with ";".  Entities without ";" will be expanded regardless of how
  62 they are terminated for compatibility with how common browsers treat
  63 entities in the Latin-1 range.
  64
  65 If $expand_prefix is TRUE then entities without trailing ";" in
  66 %entity2char will even be expanded as a prefix of a longer
  67 unrecognized name.  The longest matching name in %entity2char will be
  68 used. This is mainly present for compatibility with an MSIE
  69 misfeature.
  70
  71    $string = "foo&nbspbar";
  72    _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
  73    print $string;  # will print "foo bar"
  74
  75 This routine is exported by default.
  76
  77 =item encode_entities( $string )
  78
  79 =item encode_entities( $string, $unsafe_chars )
  80
  81 This routine replaces unsafe characters in $string with their entity
  82 representation. A second argument can be given to specify which
  83 characters to consider unsafe (i.e., which to escape). The default set
  84 of characters to encode are control chars, high-bit chars, and the
  85 C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >>
  86 characters.  But this, for example, would encode I<just> the
  87 C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters:
  88
  89   $encoded = encode_entities($input, '<>&"');
  90
  91 This routine is exported by default.
  92
  93 =item encode_entities_numeric( $string )
  94
  95 =item encode_entities_numeric( $string, $unsafe_chars )
  96
  97 This routine works just like encode_entities, except that the replacement
  98 entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>.  For
  99 example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
 100 C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
 101
 102 This routine is I<not> exported by default.  But you can always
 103 export it with C<use HTML::Entities qw(encode_entities_numeric);>
 104 or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
 105
 106 =back
 107
 108 All these routines modify the string passed as the first argument, if
 109 called in a void context.  In scalar and list contexts, the encoded or
 110 decoded string is returned (without changing the input string).
 111
 112 If you prefer not to import these routines into your namespace, you can
 113 call them as:
 114
 115   use HTML::Entities ();
 116   $decoded = HTML::Entities::decode($a);
 117   $encoded = HTML::Entities::encode($a);
 118   $encoded = HTML::Entities::encode_numeric($a);
 119
 120 The module can also export the %char2entity and the %entity2char
 121 hashes, which contain the mapping from all characters to the
 122 corresponding entities (and vice versa, respectively).
 123
 124 =head1 COPYRIGHT
 125
 126 Copyright 1995-2006 Gisle Aas. All rights reserved.
 127
 128 This library is free software; you can redistribute it and/or
 129 modify it under the same terms as Perl itself.
 130
 131 =cut
 132
 133 use strict;
 134 use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
 135 use vars qw(%entity2char %char2entity);
 136
 137 require 5.004;
 138 require Exporter;
 # Inherit import() from Exporter.
 @ISA = ('Exporter');

 # Symbols handed to every caller of a plain "use HTML::Entities;".
 @EXPORT = (
     'encode_entities',
     'decode_entities',
     '_decode_entities',
 );

 # Symbols that are only exported when asked for by name.
 @EXPORT_OK = (
     '%entity2char',
     '%char2entity',
     'encode_entities_numeric',
 );

 # Build "MAJOR.MINOR" (minor part zero-padded to two digits) out of the
 # two numbers found in the RCS revision keyword.
 {
     my ($major, $minor) = q$Revision: 1.35 $ =~ /(\d+)\.(\d+)/;
     $VERSION = sprintf("%d.%02d", $major, $minor);
 }

 # Return the module's version string.
 sub Version {
     return $VERSION;
 }
 146
 147 require HTML::Parser;  # for fast XS implemented decode_entities
 148
 149
 # Entity name => replacement character.
 #
 # Per the POD for _decode_entities() above: a key written with a trailing
 # ";" is only expanded when the entity is terminated by ";" in the input,
 # while a key without it is expanded however the entity is terminated.
 # Every key below 256 is stored without the ";", every key above with it.
 %entity2char = (

     # Ordinary characters that carry special meaning in SGML markup.
     'amp'  => '&',    # ampersand
     'gt'   => '>',    # greater than
     'lt'   => '<',    # less than
     'quot' => '"',    # double quote
     'apos' => "'",    # single quote

     # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
     'AElig'  => chr(198),    # capital AE diphthong (ligature)
     'Aacute' => chr(193),    # capital A, acute accent
     'Acirc'  => chr(194),    # capital A, circumflex accent
     'Agrave' => chr(192),    # capital A, grave accent
     'Aring'  => chr(197),    # capital A, ring
     'Atilde' => chr(195),    # capital A, tilde
     'Auml'   => chr(196),    # capital A, dieresis or umlaut mark
     'Ccedil' => chr(199),    # capital C, cedilla
     'ETH'    => chr(208),    # capital Eth, Icelandic
     'Eacute' => chr(201),    # capital E, acute accent
     'Ecirc'  => chr(202),    # capital E, circumflex accent
     'Egrave' => chr(200),    # capital E, grave accent
     'Euml'   => chr(203),    # capital E, dieresis or umlaut mark
     'Iacute' => chr(205),    # capital I, acute accent
     'Icirc'  => chr(206),    # capital I, circumflex accent
     'Igrave' => chr(204),    # capital I, grave accent
     'Iuml'   => chr(207),    # capital I, dieresis or umlaut mark
     'Ntilde' => chr(209),    # capital N, tilde
     'Oacute' => chr(211),    # capital O, acute accent
     'Ocirc'  => chr(212),    # capital O, circumflex accent
     'Ograve' => chr(210),    # capital O, grave accent
     'Oslash' => chr(216),    # capital O, slash
     'Otilde' => chr(213),    # capital O, tilde
     'Ouml'   => chr(214),    # capital O, dieresis or umlaut mark
     'THORN'  => chr(222),    # capital THORN, Icelandic
     'Uacute' => chr(218),    # capital U, acute accent
     'Ucirc'  => chr(219),    # capital U, circumflex accent
     'Ugrave' => chr(217),    # capital U, grave accent
     'Uuml'   => chr(220),    # capital U, dieresis or umlaut mark
     'Yacute' => chr(221),    # capital Y, acute accent
     'aacute' => chr(225),    # small a, acute accent
     'acirc'  => chr(226),    # small a, circumflex accent
     'aelig'  => chr(230),    # small ae diphthong (ligature)
     'agrave' => chr(224),    # small a, grave accent
     'aring'  => chr(229),    # small a, ring
     'atilde' => chr(227),    # small a, tilde
     'auml'   => chr(228),    # small a, dieresis or umlaut mark
     'ccedil' => chr(231),    # small c, cedilla
     'eacute' => chr(233),    # small e, acute accent
     'ecirc'  => chr(234),    # small e, circumflex accent
     'egrave' => chr(232),    # small e, grave accent
     'eth'    => chr(240),    # small eth, Icelandic
     'euml'   => chr(235),    # small e, dieresis or umlaut mark
     'iacute' => chr(237),    # small i, acute accent
     'icirc'  => chr(238),    # small i, circumflex accent
     'igrave' => chr(236),    # small i, grave accent
     'iuml'   => chr(239),    # small i, dieresis or umlaut mark
     'ntilde' => chr(241),    # small n, tilde
     'oacute' => chr(243),    # small o, acute accent
     'ocirc'  => chr(244),    # small o, circumflex accent
     'ograve' => chr(242),    # small o, grave accent
     'oslash' => chr(248),    # small o, slash
     'otilde' => chr(245),    # small o, tilde
     'ouml'   => chr(246),    # small o, dieresis or umlaut mark
     'szlig'  => chr(223),    # small sharp s, German (sz ligature)
     'thorn'  => chr(254),    # small thorn, Icelandic
     'uacute' => chr(250),    # small u, acute accent
     'ucirc'  => chr(251),    # small u, circumflex accent
     'ugrave' => chr(249),    # small u, grave accent
     'uuml'   => chr(252),    # small u, dieresis or umlaut mark
     'yacute' => chr(253),    # small y, acute accent
     'yuml'   => chr(255),    # small y, dieresis or umlaut mark

     # Extra Latin 1 characters listed in the HTML3.2 draft (21-May-96).
     'copy' => chr(169),      # copyright sign
     'reg'  => chr(174),      # registered sign
     'nbsp' => chr(160),      # non breaking space

     # Additional ISO-8859/1 entities listed in rfc1866 (section 14).
     'iexcl'  => chr(161),
     'cent'   => chr(162),
     'pound'  => chr(163),
     'curren' => chr(164),
     'yen'    => chr(165),
     'brvbar' => chr(166),
     'sect'   => chr(167),
     'uml'    => chr(168),
     'ordf'   => chr(170),
     'laquo'  => chr(171),
     'not'    => chr(172),
     'shy'    => chr(173),
     'macr'   => chr(175),
     'deg'    => chr(176),
     'plusmn' => chr(177),
     'sup1'   => chr(185),
     'sup2'   => chr(178),
     'sup3'   => chr(179),
     'acute'  => chr(180),
     'micro'  => chr(181),
     'para'   => chr(182),
     'middot' => chr(183),
     'cedil'  => chr(184),
     'ordm'   => chr(186),
     'raquo'  => chr(187),
     'frac14' => chr(188),
     'frac12' => chr(189),
     'frac34' => chr(190),
     'iquest' => chr(191),
     'times'  => chr(215),
     'divide' => chr(247),

     # Entities for code points above 255.  They are only added when the
     # running perl is newer than 5.007; on older perls this part of the
     # list is empty.
     (
         $] > 5.007
         ? (
             # code points 338 .. 732
             'OElig;'  => chr(338),
             'oelig;'  => chr(339),
             'Scaron;' => chr(352),
             'scaron;' => chr(353),
             'Yuml;'   => chr(376),
             'fnof;'   => chr(402),
             'circ;'   => chr(710),
             'tilde;'  => chr(732),

             # code points 913 .. 937
             'Alpha;'   => chr(913),
             'Beta;'    => chr(914),
             'Gamma;'   => chr(915),
             'Delta;'   => chr(916),
             'Epsilon;' => chr(917),
             'Zeta;'    => chr(918),
             'Eta;'     => chr(919),
             'Theta;'   => chr(920),
             'Iota;'    => chr(921),
             'Kappa;'   => chr(922),
             'Lambda;'  => chr(923),
             'Mu;'      => chr(924),
             'Nu;'      => chr(925),
             'Xi;'      => chr(926),
             'Omicron;' => chr(927),
             'Pi;'      => chr(928),
             'Rho;'     => chr(929),
             'Sigma;'   => chr(931),
             'Tau;'     => chr(932),
             'Upsilon;' => chr(933),
             'Phi;'     => chr(934),
             'Chi;'     => chr(935),
             'Psi;'     => chr(936),
             'Omega;'   => chr(937),

             # code points 945 .. 982
             'alpha;'    => chr(945),
             'beta;'     => chr(946),
             'gamma;'    => chr(947),
             'delta;'    => chr(948),
             'epsilon;'  => chr(949),
             'zeta;'     => chr(950),
             'eta;'      => chr(951),
             'theta;'    => chr(952),
             'iota;'     => chr(953),
             'kappa;'    => chr(954),
             'lambda;'   => chr(955),
             'mu;'       => chr(956),
             'nu;'       => chr(957),
             'xi;'       => chr(958),
             'omicron;'  => chr(959),
             'pi;'       => chr(960),
             'rho;'      => chr(961),
             'sigmaf;'   => chr(962),
             'sigma;'    => chr(963),
             'tau;'      => chr(964),
             'upsilon;'  => chr(965),
             'phi;'      => chr(966),
             'chi;'      => chr(967),
             'psi;'      => chr(968),
             'omega;'    => chr(969),
             'thetasym;' => chr(977),
             'upsih;'    => chr(978),
             'piv;'      => chr(982),

             # code points 8194 .. 8364
             'ensp;'   => chr(8194),
             'emsp;'   => chr(8195),
             'thinsp;' => chr(8201),
             'zwnj;'   => chr(8204),
             'zwj;'    => chr(8205),
             'lrm;'    => chr(8206),
             'rlm;'    => chr(8207),
             'ndash;'  => chr(8211),
             'mdash;'  => chr(8212),
             'lsquo;'  => chr(8216),
             'rsquo;'  => chr(8217),
             'sbquo;'  => chr(8218),
             'ldquo;'  => chr(8220),
             'rdquo;'  => chr(8221),
             'bdquo;'  => chr(8222),
             'dagger;' => chr(8224),
             'Dagger;' => chr(8225),
             'bull;'   => chr(8226),
             'hellip;' => chr(8230),
             'permil;' => chr(8240),
             'prime;'  => chr(8242),
             'Prime;'  => chr(8243),
             'lsaquo;' => chr(8249),
             'rsaquo;' => chr(8250),
             'oline;'  => chr(8254),
             'frasl;'  => chr(8260),
             'euro;'   => chr(8364),

             # code points 8465 .. 8660
             'image;'   => chr(8465),
             'weierp;'  => chr(8472),
             'real;'    => chr(8476),
             'trade;'   => chr(8482),
             'alefsym;' => chr(8501),
             'larr;'    => chr(8592),
             'uarr;'    => chr(8593),
             'rarr;'    => chr(8594),
             'darr;'    => chr(8595),
             'harr;'    => chr(8596),
             'crarr;'   => chr(8629),
             'lArr;'    => chr(8656),
             'uArr;'    => chr(8657),
             'rArr;'    => chr(8658),
             'dArr;'    => chr(8659),
             'hArr;'    => chr(8660),

             # code points 8704 .. 8901
             'forall;' => chr(8704),
             'part;'   => chr(8706),
             'exist;'  => chr(8707),
             'empty;'  => chr(8709),
             'nabla;'  => chr(8711),
             'isin;'   => chr(8712),
             'notin;'  => chr(8713),
             'ni;'     => chr(8715),
             'prod;'   => chr(8719),
             'sum;'    => chr(8721),
             'minus;'  => chr(8722),
             'lowast;' => chr(8727),
             'radic;'  => chr(8730),
             'prop;'   => chr(8733),
             'infin;'  => chr(8734),
             'ang;'    => chr(8736),
             'and;'    => chr(8743),
             'or;'     => chr(8744),
             'cap;'    => chr(8745),
             'cup;'    => chr(8746),
             'int;'    => chr(8747),
             'there4;' => chr(8756),
             'sim;'    => chr(8764),
             'cong;'   => chr(8773),
             'asymp;'  => chr(8776),
             'ne;'     => chr(8800),
             'equiv;'  => chr(8801),
             'le;'     => chr(8804),
             'ge;'     => chr(8805),
             'sub;'    => chr(8834),
             'sup;'    => chr(8835),
             'nsub;'   => chr(8836),
             'sube;'   => chr(8838),
             'supe;'   => chr(8839),
             'oplus;'  => chr(8853),
             'otimes;' => chr(8855),
             'perp;'   => chr(8869),
             'sdot;'   => chr(8901),

             # code points 8968 .. 9830
             'lceil;'  => chr(8968),
             'rceil;'  => chr(8969),
             'lfloor;' => chr(8970),
             'rfloor;' => chr(8971),
             'lang;'   => chr(9001),
             'rang;'   => chr(9002),
             'loz;'    => chr(9674),
             'spades;' => chr(9824),
             'clubs;'  => chr(9827),
             'hearts;' => chr(9829),
             'diams;'  => chr(9830),
           )
         : ()
     ),
 );
 415
 416
 # Derive the reverse table (character => "&name;") from %entity2char.
 # A key may end in ";"; that is removed first so the generated entity
 # text always carries exactly one terminator.
 foreach my $name (keys %entity2char) {
     my $glyph = $entity2char{$name};
     (my $bare = $name) =~ s/;\z//;
     $char2entity{$glyph} = '&' . $bare . ';';
 }

 # The single quote is decoded from its entity but never encoded to it.
 delete $char2entity{"'"};

 # Every character 0..255 that has no named entity at this point gets a
 # decimal numeric reference instead.
 foreach my $code (0 .. 255) {
     my $glyph = chr $code;
     $char2entity{$glyph} = "&#$code;" unless exists $char2entity{$glyph};
 }

 # Cache used by encode_entities(): one compiled encoder per distinct
 # unsafe-character specification.
 my %subst;
 431
 # Pure-Perl entity decoder.
 #
 # Arguments: one or more strings.
 # Returns:   the decoded strings in list context, the first decoded
 #            string in scalar context.  In void context nothing useful is
 #            returned and the caller's own variables are decoded in place.
 #
 # Only references that resolve below code point 256 are expanded:
 #   &#NNN   decimal reference
 #   &#xHH   hexadecimal reference ("x" or "X")
 #   &name   looked up in %entity2char under the name without ";"
 # A trailing ";" is consumed when present but is not required.  Anything
 # else (unknown names, values of 256 and above) is left exactly as found.
 #
 # All three forms are recognised in ONE left-to-right pass.  Running a
 # separate substitution per form re-scans text that an earlier pass has
 # just produced, so "&#38;lt;" (the literal text "&lt;") would be decoded
 # a second time and come out as "<".
 sub decode_entities_old
 {
     # Copy the arguments when a result is wanted; otherwise work through
     # the @_ aliases so the caller's variables are updated.
     my $strings = defined(wantarray) ? [@_] : \@_;

     for (@$strings) {
         s{(&(?:\#(\d+)|\#[xX]([0-9a-fA-F]+)|(\w+));?)}{
             my ($reference, $decimal, $hex, $name) = ($1, $2, $3, $4);
             my $char;
             if (defined $name) {
                 # exists() rather than truthiness, so a mapping to "0"
                 # or "" would still be honoured.
                 $char = $entity2char{$name} if exists $entity2char{$name};
             }
             else {
                 my $code = defined $decimal ? $decimal : hex($hex);
                 $char = chr($code) if $code < 256;
             }
             defined $char ? $char : $reference;
         }eg;
     }
     wantarray ? @$strings : $strings->[0];
 }
 448
 # Replace unsafe characters in a string by HTML entities.
 #
 #   $_[0]  the string to encode
 #   $_[1]  optional set of unsafe characters, written as the inside of a
 #          regular expression character class (ranges, escapes such as
 #          \n or \200, and a leading "^" all work; a literal "]" has to
 #          be escaped as "\]").  When it is missing or empty, everything
 #          is encoded except \n, \r, \t, space and the printable ASCII
 #          characters other than <, &, >, ' and ".
 #
 # Returns the encoded string.  In void context the caller's variable is
 # encoded in place; otherwise a copy is encoded and the argument is left
 # untouched.
 #
 # Each unsafe character becomes its %char2entity entry, or the result of
 # num_entity() when it has none.
 #
 # Dies when the unsafe set cannot be compiled into a character class.
 sub encode_entities
 {
     my $ref;
     if (defined wantarray) {
         my $x = $_[0];
         $ref = \$x;     # copy
     } else {
         $ref = \$_[0];  # modify in-place
     }
     if (defined $_[1] and length $_[1]) {
         unless (exists $subst{$_[1]}) {
             # The set comes from the caller, so it is compiled as regex
             # DATA with qr// and never pasted into Perl source for a
             # string eval.  That way a "/", "$" or "@" in the set is just
             # a character, and nothing in the set can run as code.
             my $chars = $_[1];

             # An odd number of trailing backslashes would escape the "]"
             # that closes the class; complete the pair so the set ends
             # in a literal backslash instead.
             $chars .= "\\" if $chars =~ /(\\+)\z/ and length($1) % 2;

             my $re = eval { qr/([${chars}])/ };
             die( $@ . " while trying to turn range: \"$_[1]\"\n "
               . "into regex: ([${chars}])\n "
             ) if $@;

             # Cached per specification so the class is compiled once.
             $subst{$_[1]} = sub {
                 $_[0] =~ s/$re/$char2entity{$1} || num_entity($1)/ge;
             };
         }
         &{$subst{$_[1]}}($$ref);
     } else {
         # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
         $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
     }
     $$ref;
 }
 474
 # Same contract as encode_entities(), but with %char2entity emptied for
 # the duration of the call, so no named-entity lookup can succeed and
 # encode_entities() falls back to num_entity() for every character it
 # replaces.
 sub encode_entities_numeric {
     local %char2entity = ();

     # A regular call on purpose: "goto &encode_entities" would leave this
     # sub first and thereby undo the local() above.  The elements of @_
     # are passed on as aliases and the caller's context is propagated, so
     # in-place encoding in void context keeps working.
     return encode_entities(@_);
 }
 479
 480
 # Build the hexadecimal numeric character reference for the first
 # character of the argument, e.g. "\xF4" gives "&#xF4;".  Hex digits are
 # upper case.
 sub num_entity {
     my $code_point = ord $_[0];
     return sprintf('&#x%X;', $code_point);
 }
 484
 # Short names for callers that load the module without importing anything
 # and invoke the routines by their package-qualified names.  Each alias is
 # installed as a second name for the very same subroutine.
 {
     my %target_of = (
         encode             => 'encode_entities',
         encode_numeric     => 'encode_entities_numeric',
         encode_numerically => 'encode_entities_numeric',
         decode             => 'decode_entities',
     );

     no strict 'refs';    # globs and subs are addressed by name below
     for my $alias (keys %target_of) {
         my $target = $target_of{$alias};
         *{ __PACKAGE__ . "::$alias" } = \&{ __PACKAGE__ . "::$target" };
     }
 }

 1;