3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2009, Leonard Richardson
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79 from __future__ import generators
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
83 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
84 __license__ = "New-style BSD"
86 from sgmllib import SGMLParser, SGMLParseError
93 from htmlentitydefs import name2codepoint
99 from sets import Set as set
#These hacks make Beautiful Soup able to parse XML with namespaces
# sgmllib's stock tagfind regex rejects ':' and '.' inside tag names,
# which namespace-qualified XML names (e.g. <ns:tag>) legally contain;
# the same goes for markupbase's declaration-name matcher.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used whenever a tag or string is rendered to bytes and the
# caller did not ask for a specific encoding.
DEFAULT_OUTPUT_ENCODING = "utf-8"
107 def _match_css_class(str):
108 """Build a RE to match the given CSS class."""
109 return re.compile(r"(^|.*\s)%s($|\s)" % str)
111 # First, the classes that represent markup elements.
113 class PageElement(object):
114 """Contains the navigational information for some part of the page
115 (either a tag or a piece of text)"""
117 def setup(self, parent=None, previous=None):
118 """Sets up the initial relations between this element and
121 self.previous = previous
123 self.previousSibling = None
124 self.nextSibling = None
125 if self.parent and self.parent.contents:
126 self.previousSibling = self.parent.contents[-1]
127 self.previousSibling.nextSibling = self
129 def replaceWith(self, replaceWith):
130 oldParent = self.parent
131 myIndex = self.parent.index(self)
132 if hasattr(replaceWith, "parent")\
133 and replaceWith.parent is self.parent:
134 # We're replacing this element with one of its siblings.
135 index = replaceWith.parent.index(replaceWith)
136 if index and index < myIndex:
137 # Furthermore, it comes before this element. That
138 # means that when we extract it, the index of this
139 # element will change.
140 myIndex = myIndex - 1
142 oldParent.insert(myIndex, replaceWith)
144 def replaceWithChildren(self):
145 myParent = self.parent
146 myIndex = self.parent.index(self)
148 reversedChildren = list(self.contents)
149 reversedChildren.reverse()
150 for child in reversedChildren:
151 myParent.insert(myIndex, child)
154 """Destructively rips this element out of the tree."""
157 del self.parent.contents[self.parent.index(self)]
161 #Find the two elements that would be next to each other if
162 #this element (and any children) hadn't been parsed. Connect
164 lastChild = self._lastRecursiveChild()
165 nextElement = lastChild.next
168 self.previous.next = nextElement
170 nextElement.previous = self.previous
172 lastChild.next = None
175 if self.previousSibling:
176 self.previousSibling.nextSibling = self.nextSibling
178 self.nextSibling.previousSibling = self.previousSibling
179 self.previousSibling = self.nextSibling = None
182 def _lastRecursiveChild(self):
183 "Finds the last element beneath this object to be parsed."
185 while hasattr(lastChild, 'contents') and lastChild.contents:
186 lastChild = lastChild.contents[-1]
189 def insert(self, position, newChild):
190 if isinstance(newChild, basestring) \
191 and not isinstance(newChild, NavigableString):
192 newChild = NavigableString(newChild)
194 position = min(position, len(self.contents))
195 if hasattr(newChild, 'parent') and newChild.parent is not None:
196 # We're 'inserting' an element that's already one
197 # of this object's children.
198 if newChild.parent is self:
199 index = self.index(newChild)
201 # Furthermore we're moving it further down the
202 # list of this object's children. That means that
203 # when we extract this element, our target index
204 # will jump down one.
205 position = position - 1
208 newChild.parent = self
211 newChild.previousSibling = None
212 newChild.previous = self
214 previousChild = self.contents[position-1]
215 newChild.previousSibling = previousChild
216 newChild.previousSibling.nextSibling = newChild
217 newChild.previous = previousChild._lastRecursiveChild()
218 if newChild.previous:
219 newChild.previous.next = newChild
221 newChildsLastElement = newChild._lastRecursiveChild()
223 if position >= len(self.contents):
224 newChild.nextSibling = None
227 parentsNextSibling = None
228 while not parentsNextSibling:
229 parentsNextSibling = parent.nextSibling
230 parent = parent.parent
231 if not parent: # This is the last element in the document.
233 if parentsNextSibling:
234 newChildsLastElement.next = parentsNextSibling
236 newChildsLastElement.next = None
238 nextChild = self.contents[position]
239 newChild.nextSibling = nextChild
240 if newChild.nextSibling:
241 newChild.nextSibling.previousSibling = newChild
242 newChildsLastElement.next = nextChild
244 if newChildsLastElement.next:
245 newChildsLastElement.next.previous = newChildsLastElement
246 self.contents.insert(position, newChild)
def append(self, tag):
    """Add the given element as the last child of this tag."""
    end = len(self.contents)
    self.insert(end, tag)
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first element matching the given criteria that
    occurs later in the document than this one."""
    searcher = self.findAllNext
    return self._findOne(searcher, name, attrs, text, **kwargs)
257 def findAllNext(self, name=None, attrs={}, text=None, limit=None,
259 """Returns all items that match the given criteria and appear
260 after this Tag in the document."""
261 return self._findAll(name, attrs, text, limit, self.nextGenerator,
264 def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
265 """Returns the closest sibling to this Tag that matches the
266 given criteria and appears after this Tag in the document."""
267 return self._findOne(self.findNextSiblings, name, attrs, text,
270 def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
272 """Returns the siblings of this Tag that match the given
273 criteria and appear after this Tag in the document."""
274 return self._findAll(name, attrs, text, limit,
275 self.nextSiblingGenerator, **kwargs)
276 fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first element matching the given criteria that
    occurs earlier in the document than this one."""
    searcher = self.findAllPrevious
    return self._findOne(searcher, name, attrs, text, **kwargs)
283 def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
285 """Returns all items that match the given criteria and appear
286 before this Tag in the document."""
287 return self._findAll(name, attrs, text, limit, self.previousGenerator,
289 fetchPrevious = findAllPrevious # Compatibility with pre-3.x
291 def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
292 """Returns the closest sibling to this Tag that matches the
293 given criteria and appears before this Tag in the document."""
294 return self._findOne(self.findPreviousSiblings, name, attrs, text,
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return those siblings of this element that precede it in the
    document and match the given criteria."""
    generator = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, generator, **kwargs)
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
305 def findParent(self, name=None, attrs={}, **kwargs):
306 """Returns the closest parent of this Tag that matches the given
308 # NOTE: We can't use _findOne because findParents takes a different
311 l = self.findParents(name, attrs, 1)
316 def findParents(self, name=None, attrs={}, limit=None, **kwargs):
317 """Returns the parents of this Tag that match the given
320 return self._findAll(name, attrs, None, limit, self.parentGenerator,
322 fetchParents = findParents # Compatibility with pre-3.x
324 #These methods do the real heavy lifting.
326 def _findOne(self, method, name, attrs, text, **kwargs):
328 l = method(name, attrs, text, 1, **kwargs)
333 def _findAll(self, name, attrs, text, limit, generator, **kwargs):
334 "Iterates over a generator looking for things that match."
336 if isinstance(name, SoupStrainer):
338 # Special case some findAll* searches
340 elif not limit and name is True and not attrs and not kwargs:
341 return [element for element in generator()
342 if isinstance(element, Tag)]
344 # findAll*('tag-name')
345 elif not limit and isinstance(name, basestring) and not attrs \
347 return [element for element in generator()
348 if isinstance(element, Tag) and element.name == name]
350 # Build a SoupStrainer
352 strainer = SoupStrainer(name, attrs, text, **kwargs)
353 results = ResultSet(strainer)
358 except StopIteration:
361 found = strainer.search(i)
363 results.append(found)
364 if limit and len(results) >= limit:
368 #These Generators can be used to navigate starting from both
369 #NavigableStrings and Tags.
370 def nextGenerator(self):
376 def nextSiblingGenerator(self):
382 def previousGenerator(self):
388 def previousSiblingGenerator(self):
391 i = i.previousSibling
394 def parentGenerator(self):
def substituteEncoding(self, text, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in *text* with
    *encoding* (defaulting to "utf-8") and return the result."""
    # Renamed the parameter from 'str' to avoid shadowing the builtin;
    # callers pass it positionally.
    encoding = encoding or "utf-8"
    return text.replace("%SOUP-ENCODING%", encoding)
405 def toEncoding(self, s, encoding=None):
406 """Encodes an object to a string in some encoding, or to Unicode.
408 if isinstance(s, unicode):
410 s = s.encode(encoding)
411 elif isinstance(s, str):
413 s = s.encode(encoding)
418 s = self.toEncoding(str(s), encoding)
423 class NavigableString(unicode, PageElement):
425 def __new__(cls, value):
426 """Create a new NavigableString.
428 When unpickling a NavigableString, this method is called with
429 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
430 passed in to the superclass's __new__ or the superclass won't know
431 how to handle non-ASCII characters.
433 if isinstance(value, unicode):
434 return unicode.__new__(cls, value)
435 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self):
    """Pickling support: supply the encoded string as the single
    argument to hand back to __new__ when unpickling."""
    encoded = NavigableString.__str__(self)
    return (encoded,)
440 def __getattr__(self, attr):
441 """text.string gives you text. This is for backwards
442 compatibility for Navigable*String, but for CData* it lets you
443 get the string without the CData wrapper."""
447 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
def __unicode__(self):
    """Decode this string's byte rendering back into Unicode using
    the default output encoding."""
    encoded = str(self)
    return encoded.decode(DEFAULT_OUTPUT_ENCODING)
452 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
454 return self.encode(encoding)
458 class CData(NavigableString):
460 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
461 return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
463 class ProcessingInstruction(NavigableString):
464 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
466 if "%SOUP-ENCODING%" in output:
467 output = self.substituteEncoding(output, encoding)
468 return "<?%s?>" % self.toEncoding(output, encoding)
class Comment(NavigableString):
    """A NavigableString that renders wrapped in HTML comment markers."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % inner
class Declaration(NavigableString):
    """A NavigableString that renders wrapped in <!...> declaration
    markers."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!%s>" % inner
478 class Tag(PageElement):
480 """Represents a found HTML tag with its attributes and contents."""
483 "Cheap function to invert a hash."
485 for k,v in h.items():
489 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
495 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
497 def _convertEntities(self, match):
498 """Used in a call to re.sub to replace HTML, XML, and numeric
499 entities with the appropriate Unicode characters. If HTML
500 entities are being converted, any unrecognized entities are
503 if self.convertHTMLEntities and x in name2codepoint:
504 return unichr(name2codepoint[x])
505 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
506 if self.convertXMLEntities:
507 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
510 elif len(x) > 0 and x[0] == '#':
511 # Handle numeric entities
512 if len(x) > 1 and x[1] == 'x':
513 return unichr(int(x[2:], 16))
515 return unichr(int(x[1:]))
517 elif self.escapeUnrecognizedEntities:
518 return u'&%s;' % x
522 def __init__(self, parser, name, attrs=None, parent=None,
526 # We don't actually store the parser object: that lets extracted
527 # chunks be garbage-collected
528 self.parserClass = parser.__class__
529 self.isSelfClosing = parser.isSelfClosingTag(name)
535 self.setup(parent, previous)
537 self.containsSubstitutions = False
538 self.convertHTMLEntities = parser.convertHTMLEntities
539 self.convertXMLEntities = parser.convertXMLEntities
540 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
542 # Convert any HTML, XML, or numeric entities in the attribute values.
543 convert = lambda(k, val): (k,
544 re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
545 self._convertEntities,
547 self.attrs = map(convert, self.attrs)
550 if (len(self.contents) == 1
551 and isinstance(self.contents[0], NavigableString)):
552 return self.contents[0]
554 def setString(self, string):
555 """Replace the contents of the tag with a string"""
559 string = property(getString, setString)
561 def getText(self, separator=u""):
562 if not len(self.contents):
564 stopNode = self._lastRecursiveChild().next
566 current = self.contents[0]
567 while current is not stopNode:
568 if isinstance(current, NavigableString):
569 strings.append(current.strip())
570 current = current.next
571 return separator.join(strings)
573 text = property(getText)
575 def get(self, key, default=None):
576 """Returns the value of the 'key' attribute for the tag, or
577 the value given for 'default' if it doesn't have that
579 return self._getAttrMap().get(key, default)
582 """Extract all children."""
583 for child in self.contents[:]:
586 def index(self, element):
587 for i, child in enumerate(self.contents):
590 raise ValueError("Tag.index: element not in tag")
def has_key(self, key):
    """True if this tag defines an attribute named *key*
    (pre-3.x style API)."""
    return key in self._getAttrMap()
def __getitem__(self, key):
    """tag[key] -> the value of the tag's 'key' attribute.
    Raises KeyError when the tag has no such attribute."""
    attrMap = self._getAttrMap()
    return attrMap[key]
601 "Iterating over a tag iterates over its contents."
602 return iter(self.contents)
605 "The length of a tag is the length of its list of contents."
606 return len(self.contents)
def __contains__(self, x):
    """Support 'x in tag': true if x is a direct child of this tag."""
    return self.contents.__contains__(x)
611 def __nonzero__(self):
612 "A tag is non-None even if it has no contents."
615 def __setitem__(self, key, value):
616 """Setting tag[key] sets the value of the 'key' attribute for the
619 self.attrMap[key] = value
621 for i in range(0, len(self.attrs)):
622 if self.attrs[i][0] == key:
623 self.attrs[i] = (key, value)
626 self.attrs.append((key, value))
627 self._getAttrMap()[key] = value
629 def __delitem__(self, key):
630 "Deleting tag[key] deletes all 'key' attributes for the tag."
631 for item in self.attrs:
633 self.attrs.remove(item)
634 #We don't break because bad HTML can define the same
635 #attribute multiple times.
637 if self.attrMap.has_key(key):
638 del self.attrMap[key]
def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    findAll() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    # apply() is deprecated (and removed in Python 3); argument
    # unpacking is the exact modern equivalent.
    return self.findAll(*args, **kwargs)
def __getattr__(self, tag):
    """Attribute-style child lookup: tag.fooTag (or tag.foo) is
    shorthand for tag.find('foo').  Dunder names are never treated
    as tag searches, so protocol lookups fail normally."""
    if len(tag) > 3 and tag.endswith('Tag'):
        # 'fooTag' form: strip the suffix and search for 'foo'.
        return self.find(tag[:-3])
    elif not tag.startswith('__'):
        return self.find(tag)
    # Parenthesized raise form works in both Python 2 and 3.
    raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
654 def __eq__(self, other):
655 """Returns true iff this tag has the same name, the same attributes,
656 and the same contents (recursively) as the given tag.
658 NOTE: right now this will return false if two tags have the
659 same attributes in a different order. Should this be fixed?"""
662 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
664 for i in range(0, len(self.contents)):
665 if self.contents[i] != other.contents[i]:
def __ne__(self, other):
    """The inverse of __eq__: true when the two tags differ in name,
    attributes, or contents."""
    equal = (self == other)
    return not equal
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this tag and its contents as an encoded string."""
    return self.__str__(encoding)
def __unicode__(self):
    """Render this tag as Unicode (passing encoding=None to __str__)."""
    return self.__str__(None)
681 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
682 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
def _sub_entity(self, x):
    """re.sub callback: replace a matched XML special character with
    its named entity reference."""
    char = x.group(0)[0]
    entity = self.XML_SPECIAL_CHARS_TO_ENTITIES[char]
    return "&" + entity + ";"
690 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
691 prettyPrint=False, indentLevel=0):
692 """Returns a string or Unicode representation of this tag and
693 its contents. To get Unicode, pass None for encoding.
695 NOTE: since Python's HTML parser consumes whitespace, this
696 method is not certain to reproduce the whitespace present in
697 the original string."""
699 encodedName = self.toEncoding(self.name, encoding)
703 for key, val in self.attrs:
705 if isinstance(val, basestring):
706 if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
707 val = self.substituteEncoding(val, encoding)
709 # The attribute value either:
711 # * Contains no embedded double quotes or single quotes.
712 # No problem: we enclose it in double quotes.
713 # * Contains embedded single quotes. No problem:
714 # double quotes work here too.
715 # * Contains embedded double quotes. No problem:
716 # we enclose it in single quotes.
717 # * Embeds both single _and_ double quotes. This
718 # can't happen naturally, but it can happen if
719 # you modify an attribute value after parsing
720 # the document. Now we have a bit of a
721 # problem. We solve it by enclosing the
722 # attribute in single quotes, and escaping any
723 # embedded single quotes to XML entities.
727 # TODO: replace with apos when
729 val = val.replace("'", "&squot;")
731 # Now we're okay w/r/t quotes. But the attribute
732 # value might also contain angle brackets, or
733 # ampersands that aren't part of entities. We need
734 # to escape those to XML entities too.
735 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
737 attrs.append(fmt % (self.toEncoding(key, encoding),
738 self.toEncoding(val, encoding)))
741 if self.isSelfClosing:
744 closeTag = '</%s>' % encodedName
746 indentTag, indentContents = 0, 0
748 indentTag = indentLevel
749 space = (' ' * (indentTag-1))
750 indentContents = indentTag + 1
751 contents = self.renderContents(encoding, prettyPrint, indentContents)
758 attributeString = ' ' + ' '.join(attrs)
761 s.append('<%s%s%s>' % (encodedName, attributeString, close))
765 if prettyPrint and contents and contents[-1] != "\n":
767 if prettyPrint and closeTag:
770 if prettyPrint and closeTag and self.nextSibling:
776 """Recursively destroys the contents of this tree."""
778 if len(self.contents) == 0:
780 current = self.contents[0]
781 while current is not None:
783 if isinstance(current, Tag):
784 del current.contents[:]
785 current.parent = None
786 current.previous = None
787 current.previousSibling = None
789 current.nextSibling = None
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Return an indented, human-readable rendering of this tag."""
    return self.__str__(encoding, prettyPrint=True)
795 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
796 prettyPrint=False, indentLevel=0):
797 """Renders the contents of this tag as a string in the given
798 encoding. If encoding is None, returns a Unicode string.."""
802 if isinstance(c, NavigableString):
803 text = c.__str__(encoding)
804 elif isinstance(c, Tag):
805 s.append(c.__str__(encoding, prettyPrint, indentLevel))
806 if text and prettyPrint:
810 s.append(" " * (indentLevel-1))
818 def find(self, name=None, attrs={}, recursive=True, text=None,
820 """Return only the first child of this Tag matching the given
823 l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
829 def findAll(self, name=None, attrs={}, recursive=True, text=None,
830 limit=None, **kwargs):
831 """Extracts a list of Tag objects that match the given
832 criteria. You can specify the name of the Tag and any
833 attributes you want the Tag to have.
835 The value of a key-value pair in the 'attrs' map can be a
836 string, a list of strings, a regular expression object, or a
837 callable that takes a string and returns whether or not the
838 string matches for some custom definition of 'matches'. The
839 same is true of the tag name."""
840 generator = self.recursiveChildGenerator
842 generator = self.childGenerator
843 return self._findAll(name, attrs, text, limit, generator, **kwargs)
844 findChildren = findAll
846 # Pre-3.x compatibility methods
def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x compatibility alias for findAll(text=...)."""
    return self.findAll(recursive=recursive, limit=limit, text=text)
def firstText(self, text=None, recursive=True):
    """Pre-3.x compatibility alias for find(text=...)."""
    return self.find(recursive=recursive, text=text)
858 def _getAttrMap(self):
859 """Initializes a map representation of this tag's attributes,
860 if not already initialized."""
861 if not getattr(self, 'attrMap'):
863 for (key, value) in self.attrs:
864 self.attrMap[key] = value
def childGenerator(self):
    """Iterate over this tag's direct (non-recursive) children."""
    # Just use the iterator from the contents
    return iter(self.contents)
872 def recursiveChildGenerator(self):
873 if not len(self.contents):
875 stopNode = self._lastRecursiveChild().next
876 current = self.contents[0]
877 while current is not stopNode:
879 current = current.next
882 # Next, a couple classes to represent queries and their results.
884 """Encapsulates a number of ways of matching a markup element (tag or
887 def __init__(self, name=None, attrs={}, text=None, **kwargs):
889 if isinstance(attrs, basestring):
890 kwargs['class'] = _match_css_class(attrs)
905 return "%s|%s" % (self.name, self.attrs)
907 def searchTag(self, markupName=None, markupAttrs={}):
910 if isinstance(markupName, Tag):
913 callFunctionWithTagData = callable(self.name) \
914 and not isinstance(markupName, Tag)
917 or callFunctionWithTagData \
918 or (markup and self._matches(markup, self.name)) \
919 or (not markup and self._matches(markupName, self.name)):
920 if callFunctionWithTagData:
921 match = self.name(markupName, markupAttrs)
925 for attr, matchAgainst in self.attrs.items():
926 if not markupAttrMap:
927 if hasattr(markupAttrs, 'get'):
928 markupAttrMap = markupAttrs
931 for k,v in markupAttrs:
933 attrValue = markupAttrMap.get(attr)
934 if not self._matches(attrValue, matchAgainst):
944 def search(self, markup):
945 #print 'looking for %s in %s' % (self, markup)
947 # If given a list of items, scan it for a text element that
949 if hasattr(markup, "__iter__") \
950 and not isinstance(markup, Tag):
951 for element in markup:
952 if isinstance(element, NavigableString) \
953 and self.search(element):
956 # If it's a Tag, make sure its name or attributes match.
957 # Don't bother with Tags if we're searching for text.
958 elif isinstance(markup, Tag):
960 found = self.searchTag(markup)
961 # If it's text, make sure the text matches.
962 elif isinstance(markup, NavigableString) or \
963 isinstance(markup, basestring):
964 if self._matches(markup, self.text):
967 raise Exception, "I don't know how to match against a %s" \
971 def _matches(self, markup, matchAgainst):
972 #print "Matching %s against %s" % (markup, matchAgainst)
974 if matchAgainst is True:
975 result = markup is not None
976 elif callable(matchAgainst):
977 result = matchAgainst(markup)
979 #Custom match methods take the tag as an argument, but all
980 #other ways of matching match the tag name as a string.
981 if isinstance(markup, Tag):
983 if markup and not isinstance(markup, basestring):
984 markup = unicode(markup)
985 #Now we know that chunk is either a string, or None.
986 if hasattr(matchAgainst, 'match'):
987 # It's a regexp object.
988 result = markup and matchAgainst.search(markup)
989 elif hasattr(matchAgainst, '__iter__'): # list-like
990 result = markup in matchAgainst
991 elif hasattr(matchAgainst, 'items'):
992 result = markup.has_key(matchAgainst)
993 elif matchAgainst and isinstance(markup, basestring):
994 if isinstance(markup, unicode):
995 matchAgainst = unicode(matchAgainst)
997 matchAgainst = str(matchAgainst)
1000 result = matchAgainst == markup
1003 class ResultSet(list):
1004 """A ResultSet is just a list that keeps track of the SoupStrainer
1006 def __init__(self, source):
1008 self.source = source
1010 # Now, some helper functions.
1012 def buildTagMap(default, *args):
1013 """Turns a list of maps, lists, or scalars into a single map.
1014 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1015 NESTING_RESET_TAGS maps out of lists and partial maps."""
1017 for portion in args:
1018 if hasattr(portion, 'items'):
1019 #It's a map. Merge it.
1020 for k,v in portion.items():
1022 elif hasattr(portion, '__iter__'): # is a list
1023 #It's a list. Map each item to the default.
1027 #It's a scalar. Map it to the default.
1028 built[portion] = default
1031 # Now, the parser classes.
1033 class BeautifulStoneSoup(Tag, SGMLParser):
1035 """This class contains the basic parser and search code. It defines
1036 a parser that knows nothing about tag behavior except for the
1039 You can't close a tag without closing all the tags it encloses.
1040 That is, "<foo><bar></foo>" actually means
1041 "<foo><bar></bar></foo>".
1043 [Another possible explanation is "<foo><bar /></foo>", but since
1044 this class defines no SELF_CLOSING_TAGS, it will never use that
1047 This class is useful for parsing XML or made-up markup languages,
1048 or when BeautifulSoup makes an assumption counter to what you were
1051 SELF_CLOSING_TAGS = {}
1053 RESET_NESTING_TAGS = {}
1055 PRESERVE_WHITESPACE_TAGS = []
1057 MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1058 lambda x: x.group(1) + ' />'),
1059 (re.compile('<!\s+([^<>]*)>'),
1060 lambda x: '<!' + x.group(1) + '>')
1063 ROOT_TAG_NAME = u'[document]'
1065 HTML_ENTITIES = "html"
1066 XML_ENTITIES = "xml"
1067 XHTML_ENTITIES = "xhtml"
1068 # TODO: This only exists for backwards-compatibility
1069 ALL_ENTITIES = XHTML_ENTITIES
1071 # Used when determining whether a text node is all whitespace and
1072 # can be replaced with a single space. A text node that contains
1073 # fancy Unicode spaces (usually non-breaking) should be left
1075 STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1077 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1078 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1079 convertEntities=None, selfClosingTags=None, isHTML=False):
1080 """The Soup object is initialized as the 'root tag', and the
1081 provided markup (which can be a string or a file-like object)
1082 is fed into the underlying parser.
1084 sgmllib will process most bad HTML, and the BeautifulSoup
1085 class has some tricks for dealing with some HTML that kills
1086 sgmllib, but Beautiful Soup can nonetheless choke or lose data
1087 if your data uses self-closing tags or declarations
1090 By default, Beautiful Soup uses regexes to sanitize input,
1091 avoiding the vast majority of these problems. If the problems
1092 don't apply to you, pass in False for markupMassage, and
1093 you'll get better performance.
1095 The default parser massage techniques fix the two most common
1096 instances of invalid HTML that choke sgmllib:
1098 <br/> (No space between name of closing tag and tag close)
1099 <! --Comment--> (Extraneous whitespace in declaration)
1101 You can pass in a custom list of (RE object, replace method)
1102 tuples to get Beautiful Soup to scrub your input the way you
1105 self.parseOnlyThese = parseOnlyThese
1106 self.fromEncoding = fromEncoding
1107 self.smartQuotesTo = smartQuotesTo
1108 self.convertEntities = convertEntities
1109 # Set the rules for how we'll deal with the entities we
1111 if self.convertEntities:
1112 # It doesn't make sense to convert encoded characters to
1113 # entities even while you're converting entities to Unicode.
1114 # Just convert it all to Unicode.
1115 self.smartQuotesTo = None
1116 if convertEntities == self.HTML_ENTITIES:
1117 self.convertXMLEntities = False
1118 self.convertHTMLEntities = True
1119 self.escapeUnrecognizedEntities = True
1120 elif convertEntities == self.XHTML_ENTITIES:
1121 self.convertXMLEntities = True
1122 self.convertHTMLEntities = True
1123 self.escapeUnrecognizedEntities = False
1124 elif convertEntities == self.XML_ENTITIES:
1125 self.convertXMLEntities = True
1126 self.convertHTMLEntities = False
1127 self.escapeUnrecognizedEntities = False
1129 self.convertXMLEntities = False
1130 self.convertHTMLEntities = False
1131 self.escapeUnrecognizedEntities = False
1133 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1134 SGMLParser.__init__(self)
1136 if hasattr(markup, 'read'): # It's a file-type object.
1137 markup = markup.read()
1138 self.markup = markup
1139 self.markupMassage = markupMassage
1141 self._feed(isHTML=isHTML)
1144 self.markup = None # The markup can now be GCed
    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser.

        SGMLParser converts numeric character references up to 255; this
        override restricts conversion to the ASCII range (see the inline
        comment below).
        """
        # NOTE(review): the parse of `name` into the integer `n` (and its
        # error handling) appears to be missing from this excerpt.
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
        return self.convert_codepoint(n)
    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert the markup to Unicode, optionally massage it with the
        regex fixups, and feed it to the underlying SGMLParser.

        :param inDocumentEncoding: an encoding found in the document itself
            (e.g. by start_meta), tried as a decode candidate.
        :param isHTML: passed through to UnicodeDammit so HTML-style
            encoding declarations are honored.
        """
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
            # NOTE(review): the UnicodeDammit lines below presumably belong
            # under an `else:` branch (only non-unicode markup needs
            # decoding); the `else:` line appears to be missing here.
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if self.markupMassage:
            # markupMassage may be a boolean (use the default list) or a
            # caller-supplied list of (regex, replacement) pairs.
            if not hasattr(self.markupMassage, "__iter__"):
                self.markupMassage = self.MARKUP_MASSAGE
            for fix, m in self.markupMassage:
                markup = fix.sub(m, markup)
            # TODO: We get rid of markupMassage so that the
            # soup object can be deepcopied later on. Some
            # Python installations can't copy regexes. If anyone
            # was relying on the existence of markupMassage, this
            # might cause problems.
            del(self.markupMassage)
        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        # NOTE(review): the loop body (presumably self.popTag()) appears
        # to be missing from this excerpt.
        while self.currentTag.name != self.ROOT_TAG_NAME:
    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        # Parser handler lookups (start_foo/end_foo/do_foo) go to
        # SGMLParser; any other non-dunder attribute is treated as tree
        # navigation and delegated to Tag.
        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        raise AttributeError
1202 def isSelfClosingTag(self, name):
1203 """Returns true iff the given string is the name of a
1204 self-closing tag according to this parser."""
1205 return self.SELF_CLOSING_TAGS.has_key(name) \
1206 or self.instanceSelfClosingTags.has_key(name)
    # NOTE(review): the `def reset(self):` and `def popTag(self):` headers
    # (and some initializations, e.g. self.tagStack) appear to have been
    # lost from this excerpt; the statements below are the visible remains
    # of those two methods.
    Tag.__init__(self, self, self.ROOT_TAG_NAME)  # the soup doubles as the root Tag
    SGMLParser.reset(self)
    self.currentData = []   # buffered text chunks, flushed by endData()
    self.currentTag = None
    self.quoteStack = []    # open QUOTE_TAGS names (literal-text sections)
    tag = self.tagStack.pop()
    #print "Pop", tag.name
    self.currentTag = self.tagStack[-1]
    return self.currentTag
    def pushTag(self, tag):
        """Make `tag` the current tag: append it to the current tag's
        children and push it onto the open-tag stack."""
        #print "Push", tag.name
        # NOTE(review): an `if self.currentTag:` guard around this append
        # appears to be missing from this excerpt.
        self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
    def endData(self, containerClass=NavigableString):
        """Flush the buffered character data into the tree as one node of
        `containerClass` (NavigableString by default; _toStringSubclass
        passes Comment, CData, etc.)."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Pure-whitespace data is collapsed, unless some open tag is in
            # PRESERVE_WHITESPACE_TAGS (e.g. <pre>).
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                # NOTE(review): the assignments that collapse currentData
                # to '\n' or ' ' appear to be missing from this excerpt.
                if '\n' in currentData:
            self.currentData = []
            # With a SoupStrainer active, non-matching top-level text is
            # dropped. NOTE(review): the `return` performing the drop, the
            # `if self.previous:` guard and `self.previous = o` appear to
            # be missing from this excerpt.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
                self.previous.next = o
            self.currentTag.contents.append(o)
    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        # The root tag is never popped.
        if name == self.ROOT_TAG_NAME:
        # NOTE(review): the `return` for the check above, a `numPops = 0`
        # initializer, and a `break` after the first match appear to be
        # missing from this excerpt.
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag
    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        # NESTABLE_TAGS maps a tag name to the names that reset its
        # nesting; a non-None entry means the tag is nestable at all.
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        # NOTE(review): the popTo/inclusive initializers, the `popTo = ...;
        # break` branch bodies, and the final `if popTo:` guard appear to
        # be missing from this excerpt.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
        self._popToTag(popTo, inclusive)
    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle an opening tag: enforce nesting rules with _smartPop,
        create a Tag node, and track QUOTE_TAGS (literal-text) sections."""
        #print "Start tag %s: %s" % (name, attrs)
        # NOTE(review): this excerpt appears to be missing the
        # `if self.quoteStack:` guard around the literal-text branch, the
        # endData()/pushTag() calls, the `self.previous = tag` link, the
        # popTag() for self-closing tags and the final `return tag`.
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)
        # With a SoupStrainer active, non-matching top-level tags are
        # skipped.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
        tag = Tag(self, name, attrs, self.currentTag, self.previous)
            self.previous.next = tag
        if selfClosing or self.isSelfClosingTag(name):
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
    def unknown_endtag(self, name):
        """Handle a closing tag: pop the stack to the matching opening tag
        and maintain the quote stack for literal-text sections."""
        #print "End tag %s" % name
        # Inside a quote section, an end tag that doesn't match the open
        # quote tag is literal text, not markup.
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
        # NOTE(review): a `return` ending the literal branch above and an
        # endData() flush before the pop appear to be missing from this
        # excerpt.
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)
1367 def handle_data(self, data):
1368 self.currentData.append(data)
    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass (e.g. Comment, CData, Declaration)."""
        # NOTE(review): an initial self.endData() call, flushing any
        # pending text before this node, appears to be missing from this
        # excerpt.
        self.handle_data(text)
        self.endData(subclass)
1377 def handle_pi(self, text):
1378 """Handle a processing instruction as a ProcessingInstruction
1379 object, possibly one with a %SOUP-ENCODING% slot into which an
1380 encoding will be plugged later."""
1381 if text[:3] == "xml":
1382 text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1383 self._toStringSubclass(text, ProcessingInstruction)
1385 def handle_comment(self, text):
1386 "Handle comments as Comment objects."
1387 self._toStringSubclass(text, Comment)
    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
            # NOTE(review): this second assignment (re-emitting the raw
            # reference) presumably belongs under an `else:` branch; the
            # `else:` line appears to be missing from this excerpt.
            data = '&#%s;' % ref
        self.handle_data(data)
    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        # NOTE(review): this excerpt appears to be missing a `data = None`
        # initializer, the try/except around the name2codepoint lookup,
        # the `if not data:` fall-through for unrecognized entities, and
        # the escaping branch controlled by escapeUnrecognizedEntities.
        if self.convertHTMLEntities:
                data = unichr(name2codepoint[ref])
        if not data and self.convertXMLEntities:
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&%s" % ref
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
        self.handle_data(data)
1440 def handle_decl(self, data):
1441 "Handle DOCTYPEs and the like as Declaration objects."
1442 self._toStringSubclass(data, Declaration)
    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        # NOTE(review): this excerpt appears to be missing a `j = None`
        # initializer, the `if k == -1:` guard, the `j = k+3` advance past
        # ']]>', the `else:`/`try:` wrapper around the superclass call,
        # and the final `return j`.
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            self._toStringSubclass(data, CData)
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
      should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
      should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
      should be transformed into:
       <table><tr>Blah</tr><tr>Blah
      but,
       <tr>Blah<table><tr>Blah
      should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Default smart-quote conversion to HTML entities (unless the
        caller chose otherwise) and flag the markup as HTML."""
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    # NOTE(review): this tuple's trailing elements and closing paren
    # appear to be missing from this excerpt.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    # NOTE(review): several entries and the closing brace of this dict
    # appear to be missing from this excerpt.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'li' : ['ul', 'ol'],

    #Tables can contain other tables, but there are restrictions.
    # NOTE(review): the 'td'/'th' entries and the closing brace of this
    # dict appear to be missing from this excerpt.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        # NOTE(review): this excerpt appears to be missing the
        # httpEquiv/contentType initializers, the assignments that capture
        # their attribute values, the `if match:`/`elif match:` branching,
        # and the `def rewrite(match):` header for the inner function.
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            if key == 'http-equiv':
            elif key == 'content':
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if (self.declaredHTMLEncoding is not None or
                self.originalEncoding == self.fromEncoding):
                # An HTML encoding was sniffed while converting
                # the document to Unicode, or an HTML encoding was
                # sniffed during a previous pass through the
                # document, or an encoding was specified
                # explicitly and it worked. Rewrite the meta tag.
                    return match.group(1) + "%SOUP-ENCODING%"
                newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                tagNeedsEncodingSubstitution = True
                # This is our first pass through the document.
                # Go through it again with the encoding information.
                newCharset = match.group(3)
                if newCharset and newCharset != self.originalEncoding:
                    self.declaredHTMLEncoding = newCharset
                    self._feed(self.declaredHTMLEncoding)
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1617 class StopParsing(Exception):
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wasn't."""

    # NOTE(review): this tuple's trailing elements and closing paren
    # appear to be missing from this excerpt. Note also that 'strong'
    # appears twice in the visible portion.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript')

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # Only <noscript> resets nesting here.
    # NOTE(review): a companion `NESTABLE_TAGS = {}` override appears to
    # be missing from this excerpt.
    RESET_NESTING_TAGS = buildTagMap('noscript')
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    # NOTE(review): the `def popTag(self):` header for the override below
    # appears to be missing from this excerpt.
    if len(self.tagStack) > 1:
        tag = self.tagStack[-1]
        parent = self.tagStack[-2]
        parent._getAttrMap()
        # A tag whose only child is a string (and whose name doesn't
        # collide with an existing parent attribute) is mirrored into the
        # parent as an attribute.
        if (isinstance(tag, Tag) and len(tag.contents) == 1 and
            isinstance(tag.contents[0], NavigableString) and
            not parent.attrMap.has_key(tag.name)):
            parent[tag.name] = tag.contents[0]
    BeautifulStoneSoup.popTag(self)
1700 #Enterprise class names! It has come to our attention that some people
1701 #think the names of the Beautiful Soup parser classes are too silly
1702 #and "unprofessional" for use in enterprise screen-scraping. We feel
1703 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1704 #All-Night Kosher Bakery recommends renaming this file to
1705 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1706 #"RobustParserBeanInterface.class") and using the following
1707 #enterprise-friendly class aliases:
1708 class RobustXMLParser(BeautifulStoneSoup):
1710 class RobustHTMLParser(BeautifulSoup):
1712 class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1714 class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1716 class SimplifyingSOAPParser(BeautifulSOAP):
1719 ######################################################
1721 # Bonus library: Unicode, Dammit
1723 # This class forces XML data into a standard format (usually to UTF-8
1724 # or Unicode). It is heavily based on code from Mark Pilgrim's
1725 # Universal Feed Parser. It does not rewrite the XML or HTML to
1726 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1727 # (XML) and BeautifulSoup.start_meta (HTML).
1729 # Autodetects character encodings.
1730 # Download from http://chardet.feedparser.org/
1733 # import chardet.constants
1734 # chardet.constants._debug = 1
1738 # cjkcodecs and iconv_codec make Python know about more character encodings.
1739 # Both are available from http://cjkpython.i18n.org/
1740 # They're built in if you use Python 2.4.
1742 import cjkcodecs.aliases
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        # NOTE(review): `overrideEncodings=[]` is a mutable default
        # argument -- confirm no caller mutates it.
        # NOTE(review): this excerpt appears to be missing the early
        # return for already-unicode markup, a `u = None` initializer,
        # the `if u: break` checks in the loops below, and the final
        # `self.unicode = u` assignment.
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
        # Try caller-supplied encodings first...
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
        # ...then encodings declared in or sniffed from the document.
        for proposedEncoding in (documentEncoding, sniffedEncoding):
            u = self._convertFrom(proposedEncoding)

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        for proposed_encoding in ("utf-8", "windows-1252"):
            u = self._convertFrom(proposed_encoding)

        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            # NOTE(review): the `else:` producing the HTML entity form and
            # the final `return sub` appear to be missing from this
            # excerpt.
            sub = '&%s;' % sub[0]

    def _convertFrom(self, proposed):
        """Try decoding self.markup from the `proposed` encoding,
        remembering encodings already tried."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),

        # NOTE(review): several lines appear to be missing from this
        # excerpt: the early `return None`, the iso-8859 alternatives in
        # the tuple above, the `try:` wrapper, `self.markup = u`, and the
        # success/failure returns. Note also the Python-2-only
        # `lambda(x):` tuple-parameter syntax above.
        # print "Trying to convert document to %s" % proposed
        u = self._toUnicode(markup, proposed)
        self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
        #print "Correct encoding: %s" % proposed

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present)
        # NOTE(review): the `data = data[n:]` slices that actually strip
        # each BOM, the utf-8 branch body, and the final `return newdata`
        # appear to be missing from this excerpt.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
        elif data[:3] == '\xef\xbb\xbf':
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
        newdata = unicode(data, encoding)

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding."""
        # Sniff the byte-order mark / first bytes to guess an encoding,
        # then look for an explicit encoding declaration in the text.
        # NOTE(review): a `try:`/`except:` wrapper around the sniffing
        # chain, the EBCDIC/`else:` branch structure, an `if isHTML:`
        # guard before setting declaredHTMLEncoding, and the end of the
        # encoding-name tuple near the bottom appear to be missing from
        # this excerpt.
        xml_encoding = sniffed_xml_encoding = None
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            xml_data = self._ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                 and (xml_data[2:4] != '\x00\x00'):
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
             (xml_data[2:4] != '\x00\x00'):
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            sniffed_xml_encoding = 'ascii'
        xml_encoding_match = None
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            self.declaredHTMLEncoding = xml_encoding
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
            xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding

    def find_codec(self, charset):
        """Resolve a charset name to a Python codec name, trying the
        CHARSET_ALIASES table and hyphen/underscore variants.
        NOTE(review): a final fallback term after the last backslash
        continuation appears to be missing from this excerpt."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \

    def _codec(self, charset):
        """Return the charset if Python has a codec for it.
        NOTE(review): the `try:` wrapper, the success assignment, the
        `pass` and the `return` appear to be missing from this excerpt."""
        if not charset: return charset
            codecs.lookup(charset)
        except (LookupError, ValueError):

    # Class-level cache for the EBCDIC-to-ASCII translation table.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate an EBCDIC byte string to ASCII.
        NOTE(review): the `c = self.__class__` binding used below appears
        to be missing from this excerpt."""
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Windows-1252 "smart punctuation" bytes mapped to
    # (HTML entity name, hex codepoint) pairs; used by _subMSChar.
    # NOTE(review): a few entries appear to be missing from this excerpt.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8E' : ('#x17D', '17D'),
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
2004 #######################################################################
2007 #By default, act as an HTML pretty-printer.
2008 if __name__ == '__main__':
2010 soup = BeautifulSoup(sys.stdin)
2011 print soup.prettify()