3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2009, Leonard Richardson
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79 from __future__ import generators
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
83 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
84 __license__ = "New-style BSD"
86 from sgmllib import SGMLParser, SGMLParseError
93 from htmlentitydefs import name2codepoint
99 from sets import Set as set
#These hacks make Beautiful Soup able to parse XML with namespaces
# sgmllib's stock tagfind regex rejects ':' and '.' inside tag names,
# which namespace-qualified XML names (e.g. <ns:tag>) legally contain;
# the same goes for markupbase's declaration-name matcher.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used whenever a tag or string is rendered to bytes and the
# caller did not ask for a specific encoding.
DEFAULT_OUTPUT_ENCODING = "utf-8"
107 def _match_css_class(str):
108 """Build a RE to match the given CSS class."""
109 return re.compile(r"(^|.*\s)%s($|\s)" % str)
111 # First, the classes that represent markup elements.
113 class PageElement(object):
114 """Contains the navigational information for some part of the page
115 (either a tag or a piece of text)"""
117 def setup(self, parent=None, previous=None):
118 """Sets up the initial relations between this element and
121 self.previous = previous
123 self.previousSibling = None
124 self.nextSibling = None
125 if self.parent and self.parent.contents:
126 self.previousSibling = self.parent.contents[-1]
127 self.previousSibling.nextSibling = self
129 def replaceWith(self, replaceWith):
130 oldParent = self.parent
131 myIndex = self.parent.index(self)
132 if hasattr(replaceWith, "parent")\
133 and replaceWith.parent is self.parent:
134 # We're replacing this element with one of its siblings.
135 index = replaceWith.parent.index(replaceWith)
136 if index and index < myIndex:
137 # Furthermore, it comes before this element. That
138 # means that when we extract it, the index of this
139 # element will change.
140 myIndex = myIndex - 1
142 oldParent.insert(myIndex, replaceWith)
144 def replaceWithChildren(self):
145 myParent = self.parent
146 myIndex = self.parent.index(self)
148 reversedChildren = list(self.contents)
149 reversedChildren.reverse()
150 for child in reversedChildren:
151 myParent.insert(myIndex, child)
154 """Destructively rips this element out of the tree."""
157 del self.parent.contents[self.parent.index(self)]
161 #Find the two elements that would be next to each other if
162 #this element (and any children) hadn't been parsed. Connect
164 lastChild = self._lastRecursiveChild()
165 nextElement = lastChild.next
168 self.previous.next = nextElement
170 nextElement.previous = self.previous
172 lastChild.next = None
175 if self.previousSibling:
176 self.previousSibling.nextSibling = self.nextSibling
178 self.nextSibling.previousSibling = self.previousSibling
179 self.previousSibling = self.nextSibling = None
182 def _lastRecursiveChild(self):
183 "Finds the last element beneath this object to be parsed."
185 while hasattr(lastChild, 'contents') and lastChild.contents:
186 lastChild = lastChild.contents[-1]
189 def insert(self, position, newChild):
190 if isinstance(newChild, basestring) \
191 and not isinstance(newChild, NavigableString):
192 newChild = NavigableString(newChild)
194 position = min(position, len(self.contents))
195 if hasattr(newChild, 'parent') and newChild.parent is not None:
196 # We're 'inserting' an element that's already one
197 # of this object's children.
198 if newChild.parent is self:
199 index = self.index(newChild)
201 # Furthermore we're moving it further down the
202 # list of this object's children. That means that
203 # when we extract this element, our target index
204 # will jump down one.
205 position = position - 1
208 newChild.parent = self
211 newChild.previousSibling = None
212 newChild.previous = self
214 previousChild = self.contents[position-1]
215 newChild.previousSibling = previousChild
216 newChild.previousSibling.nextSibling = newChild
217 newChild.previous = previousChild._lastRecursiveChild()
218 if newChild.previous:
219 newChild.previous.next = newChild
221 newChildsLastElement = newChild._lastRecursiveChild()
223 if position >= len(self.contents):
224 newChild.nextSibling = None
227 parentsNextSibling = None
228 while not parentsNextSibling:
229 parentsNextSibling = parent.nextSibling
230 parent = parent.parent
231 if not parent: # This is the last element in the document.
233 if parentsNextSibling:
234 newChildsLastElement.next = parentsNextSibling
236 newChildsLastElement.next = None
238 nextChild = self.contents[position]
239 newChild.nextSibling = nextChild
240 if newChild.nextSibling:
241 newChild.nextSibling.previousSibling = newChild
242 newChildsLastElement.next = nextChild
244 if newChildsLastElement.next:
245 newChildsLastElement.next.previous = newChildsLastElement
246 self.contents.insert(position, newChild)
def append(self, tag):
    """Add the given element as the last child of this tag."""
    end = len(self.contents)
    self.insert(end, tag)
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first element matching the given criteria that
    occurs later in the document than this one."""
    searcher = self.findAllNext
    return self._findOne(searcher, name, attrs, text, **kwargs)
257 def findAllNext(self, name=None, attrs={}, text=None, limit=None,
259 """Returns all items that match the given criteria and appear
260 after this Tag in the document."""
261 return self._findAll(name, attrs, text, limit, self.nextGenerator,
264 def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
265 """Returns the closest sibling to this Tag that matches the
266 given criteria and appears after this Tag in the document."""
267 return self._findOne(self.findNextSiblings, name, attrs, text,
270 def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
272 """Returns the siblings of this Tag that match the given
273 criteria and appear after this Tag in the document."""
274 return self._findAll(name, attrs, text, limit,
275 self.nextSiblingGenerator, **kwargs)
276 fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first element matching the given criteria that
    occurs earlier in the document than this one."""
    searcher = self.findAllPrevious
    return self._findOne(searcher, name, attrs, text, **kwargs)
283 def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
285 """Returns all items that match the given criteria and appear
286 before this Tag in the document."""
287 return self._findAll(name, attrs, text, limit, self.previousGenerator,
289 fetchPrevious = findAllPrevious # Compatibility with pre-3.x
291 def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
292 """Returns the closest sibling to this Tag that matches the
293 given criteria and appears before this Tag in the document."""
294 return self._findOne(self.findPreviousSiblings, name, attrs, text,
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return those siblings of this element that precede it in the
    document and match the given criteria."""
    generator = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, generator, **kwargs)
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
305 def findParent(self, name=None, attrs={}, **kwargs):
306 """Returns the closest parent of this Tag that matches the given
308 # NOTE: We can't use _findOne because findParents takes a different
311 l = self.findParents(name, attrs, 1)
316 def findParents(self, name=None, attrs={}, limit=None, **kwargs):
317 """Returns the parents of this Tag that match the given
320 return self._findAll(name, attrs, None, limit, self.parentGenerator,
322 fetchParents = findParents # Compatibility with pre-3.x
324 #These methods do the real heavy lifting.
326 def _findOne(self, method, name, attrs, text, **kwargs):
328 l = method(name, attrs, text, 1, **kwargs)
333 def _findAll(self, name, attrs, text, limit, generator, **kwargs):
334 "Iterates over a generator looking for things that match."
336 if isinstance(name, SoupStrainer):
338 # Special case some findAll* searches
340 elif not limit and name is True and not attrs and not kwargs:
341 return [element for element in generator()
342 if isinstance(element, Tag)]
344 # findAll*('tag-name')
345 elif not limit and isinstance(name, basestring) and not attrs \
347 return [element for element in generator()
348 if isinstance(element, Tag) and element.name == name]
350 # Build a SoupStrainer
352 strainer = SoupStrainer(name, attrs, text, **kwargs)
353 results = ResultSet(strainer)
358 except StopIteration:
361 found = strainer.search(i)
363 results.append(found)
364 if limit and len(results) >= limit:
368 #These Generators can be used to navigate starting from both
369 #NavigableStrings and Tags.
370 def nextGenerator(self):
376 def nextSiblingGenerator(self):
382 def previousGenerator(self):
388 def previousSiblingGenerator(self):
391 i = i.previousSibling
394 def parentGenerator(self):
def substituteEncoding(self, text, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in *text* with
    *encoding* (defaulting to "utf-8") and return the result."""
    # Renamed the parameter from 'str' to avoid shadowing the builtin;
    # callers pass it positionally.
    encoding = encoding or "utf-8"
    return text.replace("%SOUP-ENCODING%", encoding)
405 def toEncoding(self, s, encoding=None):
406 """Encodes an object to a string in some encoding, or to Unicode.
408 if isinstance(s, unicode):
410 s = s.encode(encoding)
411 elif isinstance(s, str):
413 s = s.encode(encoding)
418 s = self.toEncoding(str(s), encoding)
423 class NavigableString(unicode, PageElement):
425 def __new__(cls, value):
426 """Create a new NavigableString.
428 When unpickling a NavigableString, this method is called with
429 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
430 passed in to the superclass's __new__ or the superclass won't know
431 how to handle non-ASCII characters.
433 if isinstance(value, unicode):
434 return unicode.__new__(cls, value)
435 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self):
    """Pickling support: supply the encoded string as the single
    argument to hand back to __new__ when unpickling."""
    encoded = NavigableString.__str__(self)
    return (encoded,)
440 def __getattr__(self, attr):
441 """text.string gives you text. This is for backwards
442 compatibility for Navigable*String, but for CData* it lets you
443 get the string without the CData wrapper."""
447 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
def __unicode__(self):
    """Decode this string's byte rendering back into Unicode using
    the default output encoding."""
    encoded = str(self)
    return encoded.decode(DEFAULT_OUTPUT_ENCODING)
452 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
454 return self.encode(encoding)
458 class CData(NavigableString):
460 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
461 return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
463 class ProcessingInstruction(NavigableString):
464 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
466 if "%SOUP-ENCODING%" in output:
467 output = self.substituteEncoding(output, encoding)
468 return "<?%s?>" % self.toEncoding(output, encoding)
class Comment(NavigableString):
    """A NavigableString that renders wrapped in HTML comment markers."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % inner
class Declaration(NavigableString):
    """A NavigableString that renders wrapped in <!...> declaration
    markers."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!%s>" % inner
478 class Tag(PageElement):
480 """Represents a found HTML tag with its attributes and contents."""
483 "Cheap function to invert a hash."
485 for k,v in h.items():
489 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
495 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
497 def _convertEntities(self, match):
498 """Used in a call to re.sub to replace HTML, XML, and numeric
499 entities with the appropriate Unicode characters. If HTML
500 entities are being converted, any unrecognized entities are
503 if self.convertHTMLEntities and x in name2codepoint:
504 return unichr(name2codepoint[x])
505 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
506 if self.convertXMLEntities:
507 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
510 elif len(x) > 0 and x[0] == '#':
511 # Handle numeric entities
512 if len(x) > 1 and x[1] == 'x':
513 return unichr(int(x[2:], 16))
515 return unichr(int(x[1:]))
517 elif self.escapeUnrecognizedEntities:
518 return u'&%s;' % x
522 def __init__(self, parser, name, attrs=None, parent=None,
526 # We don't actually store the parser object: that lets extracted
527 # chunks be garbage-collected
528 self.parserClass = parser.__class__
529 self.isSelfClosing = parser.isSelfClosingTag(name)
535 self.setup(parent, previous)
537 self.containsSubstitutions = False
538 self.convertHTMLEntities = parser.convertHTMLEntities
539 self.convertXMLEntities = parser.convertXMLEntities
540 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
542 # Convert any HTML, XML, or numeric entities in the attribute values.
543 convert = lambda(k, val): (k,
544 re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
545 self._convertEntities,
547 self.attrs = map(convert, self.attrs)
550 if (len(self.contents) == 1
551 and isinstance(self.contents[0], NavigableString)):
552 return self.contents[0]
554 def setString(self, string):
555 """Replace the contents of the tag with a string"""
559 string = property(getString, setString)
561 def getText(self, separator=u""):
562 if not len(self.contents):
564 stopNode = self._lastRecursiveChild().next
566 current = self.contents[0]
567 while current is not stopNode:
568 if isinstance(current, NavigableString):
569 strings.append(current.strip())
570 current = current.next
571 return separator.join(strings)
573 text = property(getText)
575 def get(self, key, default=None):
576 """Returns the value of the 'key' attribute for the tag, or
577 the value given for 'default' if it doesn't have that
579 return self._getAttrMap().get(key, default)
582 """Extract all children."""
583 for child in self.contents[:]:
586 def index(self, element):
587 for i, child in enumerate(self.contents):
590 raise ValueError("Tag.index: element not in tag")
def has_key(self, key):
    """True if this tag defines an attribute named *key*
    (pre-3.x style API)."""
    return key in self._getAttrMap()
def __getitem__(self, key):
    """tag[key] -> the value of the tag's 'key' attribute.
    Raises KeyError when the tag has no such attribute."""
    attrMap = self._getAttrMap()
    return attrMap[key]
601 "Iterating over a tag iterates over its contents."
602 return iter(self.contents)
605 "The length of a tag is the length of its list of contents."
606 return len(self.contents)
def __contains__(self, x):
    """Support 'x in tag': true if x is a direct child of this tag."""
    return self.contents.__contains__(x)
611 def __nonzero__(self):
612 "A tag is non-None even if it has no contents."
615 def __setitem__(self, key, value):
616 """Setting tag[key] sets the value of the 'key' attribute for the
619 self.attrMap[key] = value
621 for i in range(0, len(self.attrs)):
622 if self.attrs[i][0] == key:
623 self.attrs[i] = (key, value)
626 self.attrs.append((key, value))
627 self._getAttrMap()[key] = value
629 def __delitem__(self, key):
630 "Deleting tag[key] deletes all 'key' attributes for the tag."
631 for item in self.attrs:
633 self.attrs.remove(item)
634 #We don't break because bad HTML can define the same
635 #attribute multiple times.
637 if self.attrMap.has_key(key):
638 del self.attrMap[key]
def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    findAll() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    # apply() is deprecated (and removed in Python 3); argument
    # unpacking is the exact modern equivalent.
    return self.findAll(*args, **kwargs)
def __getattr__(self, tag):
    """Attribute-style child lookup: tag.fooTag (or tag.foo) is
    shorthand for tag.find('foo').  Dunder names are never treated
    as tag searches, so protocol lookups fail normally."""
    if len(tag) > 3 and tag.endswith('Tag'):
        # 'fooTag' form: strip the suffix and search for 'foo'.
        return self.find(tag[:-3])
    elif not tag.startswith('__'):
        return self.find(tag)
    # Parenthesized raise form works in both Python 2 and 3.
    raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
654 def __eq__(self, other):
655 """Returns true iff this tag has the same name, the same attributes,
656 and the same contents (recursively) as the given tag.
658 NOTE: right now this will return false if two tags have the
659 same attributes in a different order. Should this be fixed?"""
662 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
664 for i in range(0, len(self.contents)):
665 if self.contents[i] != other.contents[i]:
def __ne__(self, other):
    """The inverse of __eq__: true when the two tags differ in name,
    attributes, or contents."""
    equal = (self == other)
    return not equal
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this tag and its contents as an encoded string."""
    return self.__str__(encoding)
def __unicode__(self):
    """Render this tag as Unicode (passing encoding=None to __str__)."""
    return self.__str__(None)
681 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
682 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
def _sub_entity(self, x):
    """re.sub callback: replace a matched XML special character with
    its named entity reference."""
    char = x.group(0)[0]
    entity = self.XML_SPECIAL_CHARS_TO_ENTITIES[char]
    return "&" + entity + ";"
690 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
691 prettyPrint=False, indentLevel=0):
692 """Returns a string or Unicode representation of this tag and
693 its contents. To get Unicode, pass None for encoding.
695 NOTE: since Python's HTML parser consumes whitespace, this
696 method is not certain to reproduce the whitespace present in
697 the original string."""
699 encodedName = self.toEncoding(self.name, encoding)
703 for key, val in self.attrs:
705 if isinstance(val, basestring):
706 if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
707 val = self.substituteEncoding(val, encoding)
709 # The attribute value either:
711 # * Contains no embedded double quotes or single quotes.
712 # No problem: we enclose it in double quotes.
713 # * Contains embedded single quotes. No problem:
714 # double quotes work here too.
715 # * Contains embedded double quotes. No problem:
716 # we enclose it in single quotes.
717 # * Embeds both single _and_ double quotes. This
718 # can't happen naturally, but it can happen if
719 # you modify an attribute value after parsing
720 # the document. Now we have a bit of a
721 # problem. We solve it by enclosing the
722 # attribute in single quotes, and escaping any
723 # embedded single quotes to XML entities.
727 # TODO: replace with apos when
729 val = val.replace("'", "&squot;")
731 # Now we're okay w/r/t quotes. But the attribute
732 # value might also contain angle brackets, or
733 # ampersands that aren't part of entities. We need
734 # to escape those to XML entities too.
735 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
737 attrs.append(fmt % (self.toEncoding(key, encoding),
738 self.toEncoding(val, encoding)))
741 if self.isSelfClosing:
744 closeTag = '</%s>' % encodedName
746 indentTag, indentContents = 0, 0
748 indentTag = indentLevel
749 space = (' ' * (indentTag-1))
750 indentContents = indentTag + 1
751 contents = self.renderContents(encoding, prettyPrint, indentContents)
758 attributeString = ' ' + ' '.join(attrs)
761 s.append('<%s%s%s>' % (encodedName, attributeString, close))
765 if prettyPrint and contents and contents[-1] != "\n":
767 if prettyPrint and closeTag:
770 if prettyPrint and closeTag and self.nextSibling:
776 """Recursively destroys the contents of this tree."""
778 if len(self.contents) == 0:
780 current = self.contents[0]
781 while current is not None:
783 if isinstance(current, Tag):
784 del current.contents[:]
785 current.parent = None
786 current.previous = None
787 current.previousSibling = None
789 current.nextSibling = None
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Return an indented, human-readable rendering of this tag."""
    return self.__str__(encoding, prettyPrint=True)
795 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
796 prettyPrint=False, indentLevel=0):
797 """Renders the contents of this tag as a string in the given
798 encoding. If encoding is None, returns a Unicode string.."""
802 if isinstance(c, NavigableString):
803 text = c.__str__(encoding)
804 elif isinstance(c, Tag):
805 s.append(c.__str__(encoding, prettyPrint, indentLevel))
806 if text and prettyPrint:
810 s.append(" " * (indentLevel-1))
818 def find(self, name=None, attrs={}, recursive=True, text=None,
820 """Return only the first child of this Tag matching the given
823 l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
829 def findAll(self, name=None, attrs={}, recursive=True, text=None,
830 limit=None, **kwargs):
831 """Extracts a list of Tag objects that match the given
832 criteria. You can specify the name of the Tag and any
833 attributes you want the Tag to have.
835 The value of a key-value pair in the 'attrs' map can be a
836 string, a list of strings, a regular expression object, or a
837 callable that takes a string and returns whether or not the
838 string matches for some custom definition of 'matches'. The
839 same is true of the tag name."""
840 generator = self.recursiveChildGenerator
842 generator = self.childGenerator
843 return self._findAll(name, attrs, text, limit, generator, **kwargs)
844 findChildren = findAll
846 # Pre-3.x compatibility methods
def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x compatibility alias for findAll(text=...)."""
    return self.findAll(recursive=recursive, limit=limit, text=text)
def firstText(self, text=None, recursive=True):
    """Pre-3.x compatibility alias for find(text=...)."""
    return self.find(recursive=recursive, text=text)
858 def _getAttrMap(self):
859 """Initializes a map representation of this tag's attributes,
860 if not already initialized."""
861 if not getattr(self, 'attrMap'):
863 for (key, value) in self.attrs:
864 self.attrMap[key] = value
def childGenerator(self):
    """Iterate over this tag's direct (non-recursive) children."""
    # Just use the iterator from the contents
    return iter(self.contents)
872 def recursiveChildGenerator(self):
873 if not len(self.contents):
875 stopNode = self._lastRecursiveChild().next
876 current = self.contents[0]
877 while current is not stopNode:
879 current = current.next
882 # Next, a couple classes to represent queries and their results.
884 """Encapsulates a number of ways of matching a markup element (tag or
887 def __init__(self, name=None, attrs={}, text=None, **kwargs):
889 if isinstance(attrs, basestring):
890 kwargs['class'] = _match_css_class(attrs)
905 return "%s|%s" % (self.name, self.attrs)
907 def searchTag(self, markupName=None, markupAttrs={}):
910 if isinstance(markupName, Tag):
913 callFunctionWithTagData = callable(self.name) \
914 and not isinstance(markupName, Tag)
917 or callFunctionWithTagData \
918 or (markup and self._matches(markup, self.name)) \
919 or (not markup and self._matches(markupName, self.name)):
920 if callFunctionWithTagData:
921 match = self.name(markupName, markupAttrs)
925 for attr, matchAgainst in self.attrs.items():
926 if not markupAttrMap:
927 if hasattr(markupAttrs, 'get'):
928 markupAttrMap = markupAttrs
931 for k,v in markupAttrs:
933 attrValue = markupAttrMap.get(attr)
934 if not self._matches(attrValue, matchAgainst):
944 def search(self, markup):
945 #print 'looking for %s in %s' % (self, markup)
947 # If given a list of items, scan it for a text element that
949 if hasattr(markup, "__iter__") \
950 and not isinstance(markup, Tag):
951 for element in markup:
952 if isinstance(element, NavigableString) \
953 and self.search(element):
956 # If it's a Tag, make sure its name or attributes match.
957 # Don't bother with Tags if we're searching for text.
958 elif isinstance(markup, Tag):
960 found = self.searchTag(markup)
961 # If it's text, make sure the text matches.
962 elif isinstance(markup, NavigableString) or \
963 isinstance(markup, basestring):
964 if self._matches(markup, self.text):
967 raise Exception, "I don't know how to match against a %s" \
971 def _matches(self, markup, matchAgainst):
972 #print "Matching %s against %s" % (markup, matchAgainst)
974 if matchAgainst is True:
975 result = markup is not None
976 elif callable(matchAgainst):
977 result = matchAgainst(markup)
979 #Custom match methods take the tag as an argument, but all
980 #other ways of matching match the tag name as a string.
981 if isinstance(markup, Tag):
983 if markup and not isinstance(markup, basestring):
984 markup = unicode(markup)
985 #Now we know that chunk is either a string, or None.
986 if hasattr(matchAgainst, 'match'):
987 # It's a regexp object.
988 result = markup and matchAgainst.search(markup)
989 elif hasattr(matchAgainst, '__iter__'): # list-like
990 result = markup in matchAgainst
991 elif hasattr(matchAgainst, 'items'):
992 result = markup.has_key(matchAgainst)
993 elif matchAgainst and isinstance(markup, basestring):
994 if isinstance(markup, unicode):
995 matchAgainst = unicode(matchAgainst)
997 matchAgainst = str(matchAgainst)
1000 result = matchAgainst == markup
1003 class ResultSet(list):
1004 """A ResultSet is just a list that keeps track of the SoupStrainer
1006 def __init__(self, source):
1008 self.source = source
1010 # Now, some helper functions.
1012 def buildTagMap(default, *args):
1013 """Turns a list of maps, lists, or scalars into a single map.
1014 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1015 NESTING_RESET_TAGS maps out of lists and partial maps."""
1017 for portion in args:
1018 if hasattr(portion, 'items'):
1019 #It's a map. Merge it.
1020 for k,v in portion.items():
1022 elif hasattr(portion, '__iter__'): # is a list
1023 #It's a list. Map each item to the default.
1027 #It's a scalar. Map it to the default.
1028 built[portion] = default
1031 # Now, the parser classes.
1033 class BeautifulStoneSoup(Tag, SGMLParser):
1035 """This class contains the basic parser and search code. It defines
1036 a parser that knows nothing about tag behavior except for the
1039 You can't close a tag without closing all the tags it encloses.
1040 That is, "<foo><bar></foo>" actually means
1041 "<foo><bar></bar></foo>".
1043 [Another possible explanation is "<foo><bar /></foo>", but since
1044 this class defines no SELF_CLOSING_TAGS, it will never use that
1047 This class is useful for parsing XML or made-up markup languages,
1048 or when BeautifulSoup makes an assumption counter to what you were
1051 SELF_CLOSING_TAGS = {}
1053 RESET_NESTING_TAGS = {}
1055 PRESERVE_WHITESPACE_TAGS = []
1057 MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1058 lambda x: x.group(1) + ' />'),
1059 (re.compile('<!\s+([^<>]*)>'),
1060 lambda x: '<!' + x.group(1) + '>')
1063 ROOT_TAG_NAME = u'[document]'
1065 HTML_ENTITIES = "html"
1066 XML_ENTITIES = "xml"
1067 XHTML_ENTITIES = "xhtml"
1068 # TODO: This only exists for backwards-compatibility
1069 ALL_ENTITIES = XHTML_ENTITIES
1071 # Used when determining whether a text node is all whitespace and
1072 # can be replaced with a single space. A text node that contains
1073 # fancy Unicode spaces (usually non-breaking) should be left
1075 STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1077 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1078 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1079 convertEntities=None, selfClosingTags=None, isHTML=False):
1080 """The Soup object is initialized as the 'root tag', and the
1081 provided markup (which can be a string or a file-like object)
1082 is fed into the underlying parser.
1084 sgmllib will process most bad HTML, and the BeautifulSoup
1085 class has some tricks for dealing with some HTML that kills
1086 sgmllib, but Beautiful Soup can nonetheless choke or lose data
1087 if your data uses self-closing tags or declarations
1090 By default, Beautiful Soup uses regexes to sanitize input,
1091 avoiding the vast majority of these problems. If the problems
1092 don't apply to you, pass in False for markupMassage, and
1093 you'll get better performance.
1095 The default parser massage techniques fix the two most common
1096 instances of invalid HTML that choke sgmllib:
1098 <br/> (No space between name of closing tag and tag close)
1099 <! --Comment--> (Extraneous whitespace in declaration)
1101 You can pass in a custom list of (RE object, replace method)
1102 tuples to get Beautiful Soup to scrub your input the way you
1105 self.parseOnlyThese = parseOnlyThese
1106 self.fromEncoding = fromEncoding
1107 self.smartQuotesTo = smartQuotesTo
1108 self.convertEntities = convertEntities
1109 # Set the rules for how we'll deal with the entities we
1111 if self.convertEntities:
1112 # It doesn't make sense to convert encoded characters to
1113 # entities even while you're converting entities to Unicode.
1114 # Just convert it all to Unicode.
1115 self.smartQuotesTo = None
1116 if convertEntities == self.HTML_ENTITIES:
1117 self.convertXMLEntities = False
1118 self.convertHTMLEntities = True
1119 self.escapeUnrecognizedEntities = True
1120 elif convertEntities == self.XHTML_ENTITIES:
1121 self.convertXMLEntities = True
1122 self.convertHTMLEntities = True
1123 self.escapeUnrecognizedEntities = False
1124 elif convertEntities == self.XML_ENTITIES:
1125 self.convertXMLEntities = True
1126 self.convertHTMLEntities = False
1127 self.escapeUnrecognizedEntities = False
1129 self.convertXMLEntities = False
1130 self.convertHTMLEntities = False
1131 self.escapeUnrecognizedEntities = False
1133 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1134 SGMLParser.__init__(self)
1136 if hasattr(markup, 'read'): # It's a file-type object.
1137 markup = markup.read()
1138 self.markup = markup
1139 self.markupMassage = markupMassage
1141 self._feed(isHTML=isHTML)
1144 self.markup = None # The markup can now be GCed
    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser.

        SGMLParser converts numeric character references up to 255; this
        override restricts conversion to the ASCII range (see the inline
        comment below).
        """
        # NOTE(review): the parse of `name` into the integer `n` (and its
        # error handling) appears to be missing from this excerpt.
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
        return self.convert_codepoint(n)
    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert the markup to Unicode, optionally massage it with the
        regex fixups, and feed it to the underlying SGMLParser.

        :param inDocumentEncoding: an encoding found in the document itself
            (e.g. by start_meta), tried as a decode candidate.
        :param isHTML: passed through to UnicodeDammit so HTML-style
            encoding declarations are honored.
        """
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
            # NOTE(review): the UnicodeDammit lines below presumably belong
            # under an `else:` branch (only non-unicode markup needs
            # decoding); the `else:` line appears to be missing here.
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if self.markupMassage:
            # markupMassage may be a boolean (use the default list) or a
            # caller-supplied list of (regex, replacement) pairs.
            if not hasattr(self.markupMassage, "__iter__"):
                self.markupMassage = self.MARKUP_MASSAGE
            for fix, m in self.markupMassage:
                markup = fix.sub(m, markup)
            # TODO: We get rid of markupMassage so that the
            # soup object can be deepcopied later on. Some
            # Python installations can't copy regexes. If anyone
            # was relying on the existence of markupMassage, this
            # might cause problems.
            del(self.markupMassage)
        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        # NOTE(review): the loop body (presumably self.popTag()) appears
        # to be missing from this excerpt.
        while self.currentTag.name != self.ROOT_TAG_NAME:
    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        # Parser handler lookups (start_foo/end_foo/do_foo) go to
        # SGMLParser; any other non-dunder attribute is treated as tree
        # navigation and delegated to Tag.
        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        raise AttributeError
1202 def isSelfClosingTag(self, name):
1203 """Returns true iff the given string is the name of a
1204 self-closing tag according to this parser."""
1205 return self.SELF_CLOSING_TAGS.has_key(name) \
1206 or self.instanceSelfClosingTags.has_key(name)
    # NOTE(review): the `def reset(self):` and `def popTag(self):` headers
    # (and some initializations, e.g. self.tagStack) appear to have been
    # lost from this excerpt; the statements below are the visible remains
    # of those two methods.
    Tag.__init__(self, self, self.ROOT_TAG_NAME)  # the soup doubles as the root Tag
    SGMLParser.reset(self)
    self.currentData = []   # buffered text chunks, flushed by endData()
    self.currentTag = None
    self.quoteStack = []    # open QUOTE_TAGS names (literal-text sections)
    tag = self.tagStack.pop()
    #print "Pop", tag.name
    self.currentTag = self.tagStack[-1]
    return self.currentTag
    def pushTag(self, tag):
        """Make `tag` the current tag: append it to the current tag's
        children and push it onto the open-tag stack."""
        #print "Push", tag.name
        # NOTE(review): an `if self.currentTag:` guard around this append
        # appears to be missing from this excerpt.
        self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
    def endData(self, containerClass=NavigableString):
        """Flush the buffered character data into the tree as one node of
        `containerClass` (NavigableString by default; _toStringSubclass
        passes Comment, CData, etc.)."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Pure-whitespace data is collapsed, unless some open tag is in
            # PRESERVE_WHITESPACE_TAGS (e.g. <pre>).
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                # NOTE(review): the assignments that collapse currentData
                # to '\n' or ' ' appear to be missing from this excerpt.
                if '\n' in currentData:
            self.currentData = []
            # With a SoupStrainer active, non-matching top-level text is
            # dropped. NOTE(review): the `return` performing the drop, the
            # `if self.previous:` guard and `self.previous = o` appear to
            # be missing from this excerpt.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
                self.previous.next = o
            self.currentTag.contents.append(o)
    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        # The root tag is never popped.
        if name == self.ROOT_TAG_NAME:
        # NOTE(review): the `return` for the check above, a `numPops = 0`
        # initializer, and a `break` after the first match appear to be
        # missing from this excerpt.
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag
    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        # NESTABLE_TAGS maps a tag name to the names that reset its
        # nesting; a non-None entry means the tag is nestable at all.
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        # NOTE(review): the popTo/inclusive initializers, the `popTo = ...;
        # break` branch bodies, and the final `if popTo:` guard appear to
        # be missing from this excerpt.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
        self._popToTag(popTo, inclusive)
    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle an opening tag: enforce nesting rules with _smartPop,
        create a Tag node, and track QUOTE_TAGS (literal-text) sections."""
        #print "Start tag %s: %s" % (name, attrs)
        # NOTE(review): this excerpt appears to be missing the
        # `if self.quoteStack:` guard around the literal-text branch, the
        # endData()/pushTag() calls, the `self.previous = tag` link, the
        # popTag() for self-closing tags and the final `return tag`.
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)
        # With a SoupStrainer active, non-matching top-level tags are
        # skipped.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
        tag = Tag(self, name, attrs, self.currentTag, self.previous)
            self.previous.next = tag
        if selfClosing or self.isSelfClosingTag(name):
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
    def unknown_endtag(self, name):
        """Handle a closing tag: pop the stack to the matching opening tag
        and maintain the quote stack for literal-text sections."""
        #print "End tag %s" % name
        # Inside a quote section, an end tag that doesn't match the open
        # quote tag is literal text, not markup.
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
        # NOTE(review): a `return` ending the literal branch above and an
        # endData() flush before the pop appear to be missing from this
        # excerpt.
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)
1367 def handle_data(self, data):
1368 self.currentData.append(data)
    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass (e.g. Comment, CData, Declaration)."""
        # NOTE(review): an initial self.endData() call, flushing any
        # pending text before this node, appears to be missing from this
        # excerpt.
        self.handle_data(text)
        self.endData(subclass)
1377 def handle_pi(self, text):
1378 """Handle a processing instruction as a ProcessingInstruction
1379 object, possibly one with a %SOUP-ENCODING% slot into which an
1380 encoding will be plugged later."""
1381 if text[:3] == "xml":
1382 text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1383 self._toStringSubclass(text, ProcessingInstruction)
1385 def handle_comment(self, text):
1386 "Handle comments as Comment objects."
1387 self._toStringSubclass(text, Comment)
    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
            # NOTE(review): this second assignment (re-emitting the raw
            # reference) presumably belongs under an `else:` branch; the
            # `else:` line appears to be missing from this excerpt.
            data = '&#%s;' % ref
        self.handle_data(data)
    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        # NOTE(review): this excerpt appears to be missing a `data = None`
        # initializer, the try/except around the name2codepoint lookup,
        # the `if not data:` fall-through for unrecognized entities, and
        # the escaping branch controlled by escapeUnrecognizedEntities.
        if self.convertHTMLEntities:
                data = unichr(name2codepoint[ref])
        if not data and self.convertXMLEntities:
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&%s" % ref
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
        self.handle_data(data)
1440 def handle_decl(self, data):
1441 "Handle DOCTYPEs and the like as Declaration objects."
1442 self._toStringSubclass(data, Declaration)
    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        # NOTE(review): this excerpt appears to be missing a `j = None`
        # initializer, the `if k == -1:` guard, the `j = k+3` advance past
        # ']]>', the `else:`/`try:` wrapper around the superclass call,
        # and the final `return j`.
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            self._toStringSubclass(data, CData)
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
      should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
      should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
      should be transformed into:
       <table><tr>Blah</tr><tr>Blah
      but,
       <tr>Blah<table><tr>Blah
      should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Default smart-quote conversion to HTML entities (unless the
        caller chose otherwise) and flag the markup as HTML."""
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    # NOTE(review): this tuple's trailing elements and closing paren
    # appear to be missing from this excerpt.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    # NOTE(review): several entries and the closing brace of this dict
    # appear to be missing from this excerpt.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'li' : ['ul', 'ol'],

    #Tables can contain other tables, but there are restrictions.
    # NOTE(review): the 'td'/'th' entries and the closing brace of this
    # dict appear to be missing from this excerpt.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        # NOTE(review): this excerpt appears to be missing the
        # httpEquiv/contentType initializers, the assignments that capture
        # their attribute values, the `if match:`/`elif match:` branching,
        # and the `def rewrite(match):` header for the inner function.
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            if key == 'http-equiv':
            elif key == 'content':
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if (self.declaredHTMLEncoding is not None or
                self.originalEncoding == self.fromEncoding):
                # An HTML encoding was sniffed while converting
                # the document to Unicode, or an HTML encoding was
                # sniffed during a previous pass through the
                # document, or an encoding was specified
                # explicitly and it worked. Rewrite the meta tag.
                    return match.group(1) + "%SOUP-ENCODING%"
                newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                tagNeedsEncodingSubstitution = True
                # This is our first pass through the document.
                # Go through it again with the encoding information.
                newCharset = match.group(3)
                if newCharset and newCharset != self.originalEncoding:
                    self.declaredHTMLEncoding = newCharset
                    self._feed(self.declaredHTMLEncoding)
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1617 class StopParsing(Exception):
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wasn't."""

    # NOTE(review): this tuple's trailing elements and closing paren
    # appear to be missing from this excerpt. Note also that 'strong'
    # appears twice in the visible portion.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript')

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # Only <noscript> resets nesting here.
    # NOTE(review): a companion `NESTABLE_TAGS = {}` override appears to
    # be missing from this excerpt.
    RESET_NESTING_TAGS = buildTagMap('noscript')
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    # NOTE(review): the `def popTag(self):` header for the override below
    # appears to be missing from this excerpt.
    if len(self.tagStack) > 1:
        tag = self.tagStack[-1]
        parent = self.tagStack[-2]
        parent._getAttrMap()
        # A tag whose only child is a string (and whose name doesn't
        # collide with an existing parent attribute) is mirrored into the
        # parent as an attribute.
        if (isinstance(tag, Tag) and len(tag.contents) == 1 and
            isinstance(tag.contents[0], NavigableString) and
            not parent.attrMap.has_key(tag.name)):
            parent[tag.name] = tag.contents[0]
    BeautifulStoneSoup.popTag(self)
1700 #Enterprise class names! It has come to our attention that some people
1701 #think the names of the Beautiful Soup parser classes are too silly
1702 #and "unprofessional" for use in enterprise screen-scraping. We feel
1703 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1704 #All-Night Kosher Bakery recommends renaming this file to
1705 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1706 #"RobustParserBeanInterface.class") and using the following
1707 #enterprise-friendly class aliases:
1708 class RobustXMLParser(BeautifulStoneSoup):
1710 class RobustHTMLParser(BeautifulSoup):
1712 class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1714 class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1716 class SimplifyingSOAPParser(BeautifulSOAP):
1719 ######################################################
1721 # Bonus library: Unicode, Dammit
1723 # This class forces XML data into a standard format (usually to UTF-8
1724 # or Unicode). It is heavily based on code from Mark Pilgrim's
1725 # Universal Feed Parser. It does not rewrite the XML or HTML to
1726 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1727 # (XML) and BeautifulSoup.start_meta (HTML).
1729 # Autodetects character encodings.
1730 # Download from http://chardet.feedparser.org/
1733 # import chardet.constants
1734 # chardet.constants._debug = 1
1738 # cjkcodecs and iconv_codec make Python know about more character encodings.
1739 # Both are available from http://cjkpython.i18n.org/
1740 # They're built in if you use Python 2.4.
1742 import cjkcodecs.aliases
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        # NOTE(review): `overrideEncodings=[]` is a mutable default
        # argument -- confirm no caller mutates it.
        # NOTE(review): this excerpt appears to be missing the early
        # return for already-unicode markup, a `u = None` initializer,
        # the `if u: break` checks in the loops below, and the final
        # `self.unicode = u` assignment.
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
        # Try caller-supplied encodings first...
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
        # ...then encodings declared in or sniffed from the document.
        for proposedEncoding in (documentEncoding, sniffedEncoding):
            u = self._convertFrom(proposedEncoding)

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        for proposed_encoding in ("utf-8", "windows-1252"):
            u = self._convertFrom(proposed_encoding)

        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            # NOTE(review): the `else:` producing the HTML entity form and
            # the final `return sub` appear to be missing from this
            # excerpt.
            sub = '&%s;' % sub[0]

    def _convertFrom(self, proposed):
        """Try decoding self.markup from the `proposed` encoding,
        remembering encodings already tried."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),

        # NOTE(review): several lines appear to be missing from this
        # excerpt: the early `return None`, the iso-8859 alternatives in
        # the tuple above, the `try:` wrapper, `self.markup = u`, and the
        # success/failure returns. Note also the Python-2-only
        # `lambda(x):` tuple-parameter syntax above.
        # print "Trying to convert document to %s" % proposed
        u = self._toUnicode(markup, proposed)
        self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
        #print "Correct encoding: %s" % proposed

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present)
        # NOTE(review): the `data = data[n:]` slices that actually strip
        # each BOM, the utf-8 branch body, and the final `return newdata`
        # appear to be missing from this excerpt.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
        elif data[:3] == '\xef\xbb\xbf':
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
        newdata = unicode(data, encoding)

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding."""
        # Sniff the byte-order mark / first bytes to guess an encoding,
        # then look for an explicit encoding declaration in the text.
        # NOTE(review): a `try:`/`except:` wrapper around the sniffing
        # chain, the EBCDIC/`else:` branch structure, an `if isHTML:`
        # guard before setting declaredHTMLEncoding, and the end of the
        # encoding-name tuple near the bottom appear to be missing from
        # this excerpt.
        xml_encoding = sniffed_xml_encoding = None
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            xml_data = self._ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                 and (xml_data[2:4] != '\x00\x00'):
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
             (xml_data[2:4] != '\x00\x00'):
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            sniffed_xml_encoding = 'ascii'
        xml_encoding_match = None
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            self.declaredHTMLEncoding = xml_encoding
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
            xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding

    def find_codec(self, charset):
        """Resolve a charset name to a Python codec name, trying the
        CHARSET_ALIASES table and hyphen/underscore variants.
        NOTE(review): a final fallback term after the last backslash
        continuation appears to be missing from this excerpt."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \

    def _codec(self, charset):
        """Return the charset if Python has a codec for it.
        NOTE(review): the `try:` wrapper, the success assignment, the
        `pass` and the `return` appear to be missing from this excerpt."""
        if not charset: return charset
            codecs.lookup(charset)
        except (LookupError, ValueError):

    # Class-level cache for the EBCDIC-to-ASCII translation table.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate an EBCDIC byte string to ASCII.
        NOTE(review): the `c = self.__class__` binding used below appears
        to be missing from this excerpt."""
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Windows-1252 "smart punctuation" bytes mapped to
    # (HTML entity name, hex codepoint) pairs; used by _subMSChar.
    # NOTE(review): a few entries appear to be missing from this excerpt.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8E' : ('#x17D', '17D'),
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
2004 #######################################################################
2007 #By default, act as an HTML pretty-printer.
2008 if __name__ == '__main__':
2010 soup = BeautifulSoup(sys.stdin)
2011 print soup.prettify()