2 """Universal feed parser
4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
6 Visit http://feedparser.org/ for the latest version
7 Visit http://feedparser.org/docs/ for the latest documentation
9 Required: Python 2.4 or later
10 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
14 __license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
16 Redistribution and use in source and binary forms, with or without modification,
17 are permitted provided that the following conditions are met:
19 * Redistributions of source code must retain the above copyright notice,
20 this list of conditions and the following disclaimer.
21 * Redistributions in binary form must reproduce the above copyright notice,
22 this list of conditions and the following disclaimer in the documentation
23 and/or other materials provided with the distribution.
25 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
26 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 POSSIBILITY OF SUCH DAMAGE."""
# Module attribution metadata (informational only; not consulted at runtime).
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>",
                    "Sam Ruby <http://intertwingly.net/>",
                    "Ade Oshineye <http://blog.oshineye.com/>",
                    "Martin Pool <http://sourcefrog.net/>",
                    "Kurt McKee <http://kurtmckee.org/>"]
# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
# NOTE(review): __version__ is not defined in this excerpt — presumably
# assigned near the module docstring; confirm before relying on it.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
# NOTE(review): the TIDY_MARKUP assignment itself is elided in this excerpt.

# List of Python interfaces for HTML Tidy, in order of preference. Only useful
# when the Tidy option above is enabled.
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

# If you want feedparser to automatically resolve all relative URIs, set this
# to 1 (checked later in pop() before calling _resolveRelativeURIs).
RESOLVE_RELATIVE_URIS = 1

# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
# NOTE(review): the SANITIZE_HTML assignment is elided in this excerpt.
78 # ---------- Python 3 modules (make it work if possible) ----------
82 from email import _parseaddr as rfc822
85 # Python 3.1 introduces bytes.maketrans and simultaneously
86 # deprecates string.maketrans; use bytes.maketrans if possible
87 _maketrans = bytes.maketrans
88 except (NameError, AttributeError):
90 _maketrans = string.maketrans
92 # base64 support for Atom feeds that contain embedded binary data
94 import base64, binascii
96 base64 = binascii = None
98 # Python 3.1 deprecates decodestring in favor of decodebytes
99 _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
102 # Convert a UTF-8 str to bytes if the interpreter is Python 3
104 return bytes(s, 'utf8')
105 except (NameError, TypeError):
106 # In Python 2.5 and below, bytes doesn't exist (NameError)
107 # In Python 2.6 and above, bytes and str are the same (TypeError)
111 # Convert a list of ints to bytes if the interpreter is Python 3
114 # In Python 2.6 and above, this call won't raise an exception
115 # but it will return bytes([65]) as '[65]' instead of 'A'
119 return ''.join(map(chr, l))
# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
# (The tuple was missing its closing parenthesis in this copy; restored.
#  Duplicate entries like 'mms'/'svn' are harmless in membership tests.)
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
    'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
    'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
)
#ACCEPTABLE_URI_SCHEMES = ()
137 # ---------- required modules (should come with any Python distribution) ----------
150 from htmlentitydefs import name2codepoint, codepoint2name, entitydefs
153 from io import BytesIO as _StringIO
156 from cStringIO import StringIO as _StringIO
158 from StringIO import StringIO as _StringIO
160 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
162 # gzip is included with most Python distributions, but may not be available if you compiled your own
172 # If a real XML parser is available, feedparser will attempt to use it. feedparser has
173 # been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
174 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
175 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
178 from xml.sax.saxutils import escape as _xmlescape
181 def _xmlescape(data,entities={}):
182 data = data.replace('&', '&')
183 data = data.replace('>', '>')
184 data = data.replace('<', '<')
185 for char, entity in entities:
186 data = data.replace(char, entity)
190 xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
191 except xml.sax.SAXReaderNotAvailable:
196 # sgmllib is not available by default in Python 3; if the end user doesn't have
197 # it available then we'll lose illformed XML parsing, content santizing, and
198 # microformat support (at least while feedparser depends on BeautifulSoup).
202 # This is probably Python 3, which doesn't include sgmllib anymore
205 # Mock sgmllib enough to allow subclassing later on
class sgmllib(object):
    # Minimal stand-in used when the real sgmllib module is unavailable
    # (it was removed in Python 3) so that later code can still subclass
    # sgmllib.SGMLParser; the stub methods intentionally do nothing.
    class SGMLParser(object):
        def goahead(self, i):
            pass
        def parse_starttag(self, i):
            pass
215 # sgmllib defines a number of module-level regular expressions that are
216 # insufficient for the XML parsing feedparser needs. Rather than modify
217 # the variables directly in sgmllib, they're defined here using the same
218 # names, and the compiled code objects of several sgmllib.SGMLParser
219 # methods are copied into _BaseHTMLProcessor so that they execute in
220 # feedparser's scope instead of sgmllib's scope.
# sgmllib-compatible module-level regexes, redefined here because sgmllib's
# own versions are insufficient for the XML parsing feedparser needs.
# (attrfind's closing parenthesis was missing in this copy; restored.
#  Raw-string literals avoid invalid-escape warnings for \d etc.)
charref = re.compile(r'&#(\d+|[xX][0-9a-fA-F]+);')
tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
)
# Unfortunately, these must be copied over to prevent NameError exceptions
# (per the note above, compiled sgmllib.SGMLParser method code objects are
#  re-executed in this module's scope, so sgmllib's module-level regex names
#  must also exist here).
entityref = sgmllib.entityref
incomplete = sgmllib.incomplete
interesting = sgmllib.interesting
shorttag = sgmllib.shorttag
shorttagopen = sgmllib.shorttagopen
starttagopen = sgmllib.starttagopen
class _EndBracketRegEx:
    """Drop-in replacement for sgmllib's endbracket regex object."""
    def __init__(self):
        # Overriding the built-in sgmllib.endbracket regex allows the
        # parser to find angle brackets embedded in element attributes.
        self.endbracket = re.compile(r'''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
    def search(self, target, index=0):
        # Mimic the re "search from index" interface used by sgmllib.
        match = self.endbracket.match(target, index)
        if match is not None:
            # Returning a new object in the calling thread's context
            # resolves a thread-safety.
            return EndBracketMatch(match)
        return None

class EndBracketMatch:
    # Thin wrapper exposing only the end() accessor sgmllib needs.
    def __init__(self, match):
        self.match = match
    def end(self, n):
        return self.match.end(n)

endbracket = _EndBracketRegEx()
256 # cjkcodecs and iconv_codec provide support for more character encodings.
257 # Both are available from http://cjkpython.i18n.org/
259 import cjkcodecs.aliases
267 # chardet library auto-detects character encodings
268 # Download from http://chardet.feedparser.org/
274 # BeautifulSoup parser used for parsing microformats from embedded HTML content
275 # http://www.crummy.com/software/BeautifulSoup/
276 # feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
277 # older 2.x series. If it doesn't, and you can figure out why, I'll accept a
278 # patch and modify the compatibility statement accordingly.
# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception):
    """Base class for benign conditions recorded during parsing."""

class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """The declared character encoding was overridden while parsing."""

class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """The document's character encoding could not be determined."""

class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """The feed was served with a non-XML Content-Type header."""

class UndeclaredNamespace(Exception):
    """An element used a namespace prefix that was never declared."""
# Maps internal version tokens to human-readable names. Several entries
# ('rss20', 'rss10', 'cdf') and the closing brace were elided in this copy
# and are restored here from the upstream table.
SUPPORTED_VERSIONS = {'': u'unknown',
                      'rss090': u'RSS 0.90',
                      'rss091n': u'RSS 0.91 (Netscape)',
                      'rss091u': u'RSS 0.91 (Userland)',
                      'rss092': u'RSS 0.92',
                      'rss093': u'RSS 0.93',
                      'rss094': u'RSS 0.94',
                      'rss20': u'RSS 2.0',
                      'rss10': u'RSS 1.0',
                      'rss': u'RSS (unknown version)',
                      'atom01': u'Atom 0.1',
                      'atom02': u'Atom 0.2',
                      'atom03': u'Atom 0.3',
                      'atom10': u'Atom 1.0',
                      'atom': u'Atom (unknown version)',
                      'cdf': u'CDF',
                      }
class FeedParserDict(dict):
    """Dict subclass that maps legacy feed keys to their modern names and
    allows attribute-style access (d.title == d['title']).

    Elided lines (keymap entries, try/except scaffolding) are restored from
    upstream; `raise X, msg` and has_key() are modernized to forms valid in
    both Python 2 and 3 without changing behavior.
    """
    # old key -> new key (a list means "first of these that exists")
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}

    def __getitem__(self, key):
        # Synthesized keys first, then keymap aliases, then plain lookup.
        if key == 'category':
            try:
                return dict.__getitem__(self, 'tags')[0]['term']
            except IndexError:
                raise KeyError("object doesn't have key 'category'")
        elif key == 'enclosures':
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
        elif key == 'license':
            for link in dict.__getitem__(self, 'links'):
                if link['rel']==u'license' and 'href' in link:
                    return link['href']
        elif key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in dict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if isinstance(realkey, list):
            for k in realkey:
                if dict.__contains__(self, k):
                    return dict.__getitem__(self, k)
        elif dict.__contains__(self, realkey):
            return dict.__getitem__(self, realkey)
        return dict.__getitem__(self, key)

    def __contains__(self, key):
        # Delegate to __getitem__ so aliases and synthesized keys count too.
        try:
            self.__getitem__(key)
        except KeyError:
            return False
        else:
            return True

    has_key = __contains__

    def get(self, key, default=None):
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        # Always store under the modern key name.
        key = self.keymap.get(key, key)
        if isinstance(key, list):
            key = key[0]
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value):
        if key not in self:
            self[key] = value
            return value
        return self[key]

    def __getattr__(self, key):
        # __getattribute__() is called first; this will be called
        # only if an attribute was not already found
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError("object has no attribute '%s'" % key)
# Lazily-built translation table used by _ebcdic_to_ascii().
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    """Translate EBCDIC-encoded bytes *s* to their ASCII equivalents.

    The 256-entry table is built on first use and cached in the module
    global. (The tuple delimiters around the table were elided in this
    copy and are restored here; the values themselves are unchanged.)
    """
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
        )
        _ebcdic_to_ascii_map = _maketrans(
            _l2bytes(range(256)), _l2bytes(emap))
    return s.translate(_ebcdic_to_ascii_map)
# Windows-1252 "extension" characters (0x80-0x9F) mapped to the Unicode
# code points they actually represent; used to repair feeds that claim
# iso-8859-1/utf-8 but contain cp1252 punctuation. (The dict header line
# was elided in this copy and is restored here.)
_cp1252 = {
    unichr(128): unichr(8364), # euro sign
    unichr(130): unichr(8218), # single low-9 quotation mark
    unichr(131): unichr( 402), # latin small letter f with hook
    unichr(132): unichr(8222), # double low-9 quotation mark
    unichr(133): unichr(8230), # horizontal ellipsis
    unichr(134): unichr(8224), # dagger
    unichr(135): unichr(8225), # double dagger
    unichr(136): unichr( 710), # modifier letter circumflex accent
    unichr(137): unichr(8240), # per mille sign
    unichr(138): unichr( 352), # latin capital letter s with caron
    unichr(139): unichr(8249), # single left-pointing angle quotation mark
    unichr(140): unichr( 338), # latin capital ligature oe
    unichr(142): unichr( 381), # latin capital letter z with caron
    unichr(145): unichr(8216), # left single quotation mark
    unichr(146): unichr(8217), # right single quotation mark
    unichr(147): unichr(8220), # left double quotation mark
    unichr(148): unichr(8221), # right double quotation mark
    unichr(149): unichr(8226), # bullet
    unichr(150): unichr(8211), # en dash
    unichr(151): unichr(8212), # em dash
    unichr(152): unichr( 732), # small tilde
    unichr(153): unichr(8482), # trade mark sign
    unichr(154): unichr( 353), # latin small letter s with caron
    unichr(155): unichr(8250), # single right-pointing angle quotation mark
    unichr(156): unichr( 339), # latin small ligature oe
    unichr(158): unichr( 382), # latin small letter z with caron
    unichr(159): unichr( 376)} # latin capital letter y with diaeresis
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    """Join *uri* against *base*, collapsing redundant slashes after the
    scheme first, and return the result as unicode."""
    uri = _urifixer.sub(r'\1\3', uri)
    uri = urlparse.urljoin(base, uri)
    if not isinstance(uri, unicode):
        return uri.decode('utf-8', 'ignore')
    return uri
    # Disabled quoting fallback retained from upstream for reference:
    # uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
    # return urlparse.urljoin(base, uri)
454 class _FeedParserMixin:
455 namespaces = {'': '',
456 'http://backend.userland.com/rss': '',
457 'http://blogs.law.harvard.edu/tech/rss': '',
458 'http://purl.org/rss/1.0/': '',
459 'http://my.netscape.com/rdf/simple/0.9/': '',
460 'http://example.com/newformat#': '',
461 'http://example.com/necho': '',
462 'http://purl.org/echo/': '',
463 'uri/of/echo/namespace#': '',
464 'http://purl.org/pie/': '',
465 'http://purl.org/atom/ns#': '',
466 'http://www.w3.org/2005/Atom': '',
467 'http://purl.org/rss/1.0/modules/rss091#': '',
469 'http://webns.net/mvcb/': 'admin',
470 'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
471 'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
472 'http://media.tangent.org/rss/1.0/': 'audio',
473 'http://backend.userland.com/blogChannelModule': 'blogChannel',
474 'http://web.resource.org/cc/': 'cc',
475 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
476 'http://purl.org/rss/1.0/modules/company': 'co',
477 'http://purl.org/rss/1.0/modules/content/': 'content',
478 'http://my.theinfo.org/changed/1.0/rss/': 'cp',
479 'http://purl.org/dc/elements/1.1/': 'dc',
480 'http://purl.org/dc/terms/': 'dcterms',
481 'http://purl.org/rss/1.0/modules/email/': 'email',
482 'http://purl.org/rss/1.0/modules/event/': 'ev',
483 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
484 'http://freshmeat.net/rss/fm/': 'fm',
485 'http://xmlns.com/foaf/0.1/': 'foaf',
486 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
487 'http://postneo.com/icbm/': 'icbm',
488 'http://purl.org/rss/1.0/modules/image/': 'image',
489 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
490 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
491 'http://purl.org/rss/1.0/modules/link/': 'l',
492 'http://search.yahoo.com/mrss': 'media',
493 #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
494 'http://search.yahoo.com/mrss/': 'media',
495 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
496 'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
497 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
498 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
499 'http://purl.org/rss/1.0/modules/reference/': 'ref',
500 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
501 'http://purl.org/rss/1.0/modules/search/': 'search',
502 'http://purl.org/rss/1.0/modules/slash/': 'slash',
503 'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
504 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
505 'http://hacks.benhammersley.com/rss/streaming/': 'str',
506 'http://purl.org/rss/1.0/modules/subscription/': 'sub',
507 'http://purl.org/rss/1.0/modules/syndication/': 'sy',
508 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
509 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
510 'http://purl.org/rss/1.0/modules/threading/': 'thr',
511 'http://purl.org/rss/1.0/modules/textinput/': 'ti',
512 'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
513 'http://wellformedweb.org/commentAPI/': 'wfw',
514 'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
515 'http://www.w3.org/1999/xhtml': 'xhtml',
516 'http://www.w3.org/1999/xlink': 'xlink',
517 'http://www.w3.org/XML/1998/namespace': 'xml'
519 _matchnamespaces = {}
521 can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
522 can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
523 can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
524 html_types = [u'text/html', u'application/xhtml+xml']
526 def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
527 if not self._matchnamespaces:
528 for k, v in self.namespaces.items():
529 self._matchnamespaces[k.lower()] = v
530 self.feeddata = FeedParserDict() # feed-level data
531 self.encoding = encoding # character encoding
532 self.entries = [] # list of entry-level data
533 self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
534 self.namespacesInUse = {} # dictionary of namespaces defined by the feed
536 # the following are used internally to track state;
537 # this is really out of control and should be refactored
544 self.incontributor = 0
547 self.sourcedata = FeedParserDict()
548 self.contentparams = FeedParserDict()
549 self._summaryKey = None
550 self.namespacemap = {}
551 self.elementstack = []
554 self.baseuri = baseuri or u''
555 self.lang = baselang or None
559 self.feeddata['language'] = baselang.replace('_','-')
561 def _normalize_attributes(self, kv):
563 v = k in ('rel', 'type') and kv[1].lower() or kv[1]
564 # the sgml parser doesn't handle entities in attributes, nor
565 # does it pass the attribute values through as unicode, while
566 # strict xml parsers do -- account for this difference
567 if isinstance(self, _LooseFeedParser):
568 v = v.replace('&', '&')
569 if not isinstance(v, unicode):
570 v = v.decode('utf-8')
    def unknown_starttag(self, tag, attrs):
        # Dispatch point for every opening tag reported by the SAX/SGML
        # layer: tracks xml:base / xml:lang / xmlns declarations, re-emits
        # inline xhtml content verbatim, and otherwise routes the element
        # to a _start_<name> handler if one exists.
        # NOTE(review): several lines of this method are elided in this
        # excerpt (marked [elided: ...]); visible code is kept verbatim.
        attrs = map(self._normalize_attributes, attrs)

        # track xml:base and xml:lang
        # [elided: attrsD = dict(attrs)]
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if not isinstance(baseuri, unicode):
            baseuri = baseuri.decode(self.encoding, 'ignore')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javscript:`)
        # [elided: condition selecting one of the two assignments below]
        self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        # xml:lang could be explicitly set to '', we need to capture that
        # [elided: lang=='' handling]
        # if no xml:lang is specified, use parent lang
        # [elided: lang fallback and guard before recording feed language]
        if tag in ('feed', 'rss', 'rdf:RDF'):
            self.feeddata['language'] = lang.replace('_','-')
        # [elided: self.lang = lang]
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces declared on this element
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            if tag in ['xhtml:div', 'div']:
                return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            if tag.find(':') <> -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns',namespace))
                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
                    attrs.append(('xmlns',namespace))
            # [elided: lines before re-emitting the raw tag as data]
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

        # match namespace prefixes
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        # [elided: else:]
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        # [elided: if prefix:]
        prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
        # [elided: reset of textinput-tracking state]
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
        # [elided: reset of image-tracking state]

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        # [elided: try:]
        method = getattr(self, methodname)
        return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
            # [elided: branch on whether attrsD is empty]
            # No attributes so merge it into the encosing dictionary
            return self.push(unknown_tag, 1)
            # [elided: else:]
            # Has attributes so create it in its own dictionary
            context = self._getContext()
            context[unknown_tag] = attrsD
    def unknown_endtag(self, tag):
        # Closing-tag counterpart of unknown_starttag: routes to a
        # _end_<name> handler (default: pop the element), closes inline
        # xhtml content, and restores xml:base / xml:lang scope.
        # NOTE(review): several lines are elided in this excerpt
        # (marked [elided: ...]); visible code is kept verbatim.
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        # [elided: else:]
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        # [elided: if prefix:]
        prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK:
        # [elided: decrement of the svg nesting counter]

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        # [elided: try: and svg guard around handler dispatch]
        raise AttributeError()
        method = getattr(self, methodname)
        # [elided: method()]
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ['xhtml:div', 'div']:
                return # typepad does this 10/2007
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack and self.basestack[-1]:
            self.baseuri = self.basestack[-1]
        # [elided: basestack pop and langstack maintenance]
        if self.langstack: # and (self.langstack[-1] is not None):
            self.lang = self.langstack[-1]
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # (the example in this copy had been entity-decoded; restored in
        #  the comment above)
        if not self.elementstack:
        # [elided: return]
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
        # [elided: XML-special references are re-emitted literally; other
        #  references are converted to the code point c (decimal or hex)]
        text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # (the example in this copy had been entity-decoded; restored in
        #  the comment above)
        if not self.elementstack:
        # [elided: return]
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        # [elided: XML-special entities re-emitted as '&%s;' literals]
        elif ref in self.entities.keys():
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                # DTD-defined entity expands to a character reference;
                # recurse so it is resolved the same way
                return self.handle_entityref(text)
        # [elided: else branch resolving ref via name2codepoint with a
        #  KeyError fallback that re-emits the raw reference]
        text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
734 def handle_data(self, text, escape=1):
735 # called for each block of plain text, i.e. outside of any tag and
736 # not containing any character or entity references
737 if not self.elementstack:
739 if escape and self.contentparams.get('type') == u'application/xhtml+xml':
740 text = _xmlescape(text)
741 self.elementstack[-1][2].append(text)
743 def handle_comment(self, text):
744 # called for each comment, e.g. <!-- insert message here -->
747 def handle_pi(self, text):
748 # called for each processing instruction, e.g. <?instruction>
751 def handle_decl(self, text):
    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        # NOTE(review): the control-flow lines of this method are partly
        # elided in this excerpt (marked [elided: ...]).
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            # [elided: guard for k == -1]
            # CDATA block began but didn't finish
            k = len(self.rawdata)
            # [elided: return k]
            # emit the CDATA payload, escaped, as character data
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            # [elided: return past the ']]>' terminator]
        # [elided: else branch for non-CDATA declarations]
        k = self.rawdata.find('>', i)
        # [elided: return for the found-terminator case]
        # We have an incomplete CDATA block.
        # [elided: return k]
772 def mapContentType(self, contentType):
773 contentType = contentType.lower()
774 if contentType == 'text' or contentType == 'plain':
775 contentType = u'text/plain'
776 elif contentType == 'html':
777 contentType = u'text/html'
778 elif contentType == 'xhtml':
779 contentType = u'application/xhtml+xml'
782 def trackNamespace(self, prefix, uri):
783 loweruri = uri.lower()
784 if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
785 self.version = u'rss090'
786 if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
787 self.version = u'rss10'
788 if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
789 self.version = u'atom10'
790 if loweruri.find(u'backend.userland.com/rss') <> -1:
791 # match any backend.userland.com namespace
792 uri = u'http://backend.userland.com/rss'
794 if self._matchnamespaces.has_key(loweruri):
795 self.namespacemap[prefix] = self._matchnamespaces[loweruri]
796 self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
798 self.namespacesInUse[prefix or ''] = uri
800 def resolveURI(self, uri):
801 return _urljoin(self.baseuri or u'', uri)
803 def decodeEntities(self, element, data):
806 def strattrs(self, attrs):
807 return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs])
809 def push(self, element, expectingText):
810 self.elementstack.append([element, expectingText, []])
    def pop(self, element, stripWhitespace=1):
        # Close the current element frame: assemble its text pieces, apply
        # base64 decoding / URI resolution / entity decoding / microformat
        # parsing / sanitizing as appropriate, and store the result in the
        # current entry or feed context.
        # NOTE(review): many lines of this method are elided in this
        # excerpt (marked [elided: ...]); visible code is kept verbatim.
        if not self.elementstack:
        # [elided: return]
        if self.elementstack[-1][0] != element:
        # [elided: return]

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            #    <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
            # [elided: drop trailing whitespace piece]
            while pieces and len(pieces)>1 and not pieces[0].strip():
            # [elided: drop leading whitespace piece]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                # [elided: nesting-depth bookkeeping]
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                    # [elided: depth decrement / early break]
                    elif piece.startswith('<') and not piece.endswith('/>'):
                    # [elided: depth increment]
                # [elided: guard before stripping the enclosing div]
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, unicode):
                pieces[i] = v.decode('utf-8')

        output = u''.join(pieces)
        # [elided: stripWhitespace guard]
        output = output.strip()
        if not expectingText:
        # [elided: return output]

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            # [elided: try:]
            output = _base64decode(output)
            except binascii.Error:
            # [elided: pass]
            except binascii.Incomplete:
            # [elided: pass / TypeError handler]
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # some feed formats require consumers to guess
        # whether the content is html or plain text
        if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
            if self.lookslikehtml(output):
                self.contentparams['type'] = u'text/html'

        # remove temporary cruft from contentparams
        # [elided: try:]
        del self.contentparams['mode']
        # [elided: except KeyError / try:]
        del self.contentparams['base64']
        # [elided: except KeyError:]

        is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))

        # parse microformats
        # (must do this before sanitizing because some microformats
        # rely on elements that we sanitize)
        if is_htmlish and element in ['content', 'description', 'summary']:
            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
            # [elided: guard on mfresults]
            for tag in mfresults.get('tags', []):
                self._addTag(tag['term'], tag['scheme'], tag['label'])
            for enclosure in mfresults.get('enclosures', []):
                self._start_enclosure(enclosure)
            for xfn in mfresults.get('xfn', []):
                self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
            vcard = mfresults.get('vcard')
            # [elided: guard on vcard]
            self._getContext()['vcard'] = vcard

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))

        if self.encoding and not isinstance(output, unicode):
            output = output.decode(self.encoding, 'ignore')

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
            # [elided: try:]
            output = output.encode('iso-8859-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
            # [elided: pass]

        # map win-1252 extensions to the proper code points
        if isinstance(output, unicode):
            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
        # [elided: return output]

        if element == 'title' and self.hasTitle:
        # [elided: return output]

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                # query variables in urls in link elements are improperly
                # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                # unhandled character references. fix this special case.
                # NOTE(review): this pattern looks entity-mangled in this
                # copy; upstream matches '&amp;(\w+);' — confirm before use.
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                self.entries[-1][element] = output
                # [elided: guard on output]
                self.entries[-1]['links'][-1]['href'] = output
            # [elided: else branch header]
                if element == 'description':
                # [elided: element rename to 'summary']
                self.entries[-1][element] = output
                # [elided: incontent guard]
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
            # [elided: element rename to 'subtitle']
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                # NOTE(review): same suspected entity-mangled pattern as above.
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            # [elided: incontent branch header]
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        # [elided: return output]
def pushContent(self, tag, attrsD, defaultContentType, expectingText):
    # Begin a content-bearing element: record type/language/base in
    # self.contentparams so the matching popContent() can interpret
    # the accumulated text.
    # NOTE(review): line(s) missing from this excerpt
    self.lang=self.lang.replace('_','-')
    self.contentparams = FeedParserDict({
        'type': self.mapContentType(attrsD.get('type', defaultContentType)),
        'language': self.lang,
        'base': self.baseuri})
    self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
    self.push(tag, expectingText)

def popContent(self, tag):
    # End a content-bearing element and reset contentparams.
    value = self.pop(tag)
    # NOTE(review): line(s) missing from this excerpt
    self.contentparams.clear()
    # NOTE(review): 'return value' appears to be missing from this excerpt

# a number of elements in a number of RSS variants are nominally plain
# text, but this is routinely ignored.  This is an attempt to detect
# the most common cases.  As false positives often result in silent
# data loss, this function errs on the conservative side.
def lookslikehtml(s):
    # Heuristic: does this nominally-plain-text string look like HTML?
    # must have a close tag or an entity reference to qualify
    if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
        # NOTE(review): 'return' line missing from this excerpt
    # all tags must be in a restricted subset of valid HTML tags
    if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
              re.findall(r'</?(\w+)',s)):
        # NOTE(review): 'return' line missing from this excerpt
    # all entities must have been defined as valid HTML entities
    if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
        # NOTE(review): 'return' line missing from this excerpt
def _mapToStandardPrefix(self, name):
    # Rewrite a prefixed element name so it uses this parser's canonical
    # namespace prefix (looked up in self.namespacemap).
    colonpos = name.find(':')
    # NOTE(review): guard line missing from this excerpt (rewrite presumably
    # only happens when a colon is present -- confirm against upstream)
    prefix = name[:colonpos]
    suffix = name[colonpos+1:]
    prefix = self.namespacemap.get(prefix, prefix)
    name = prefix + ':' + suffix
    # NOTE(review): 'return name' appears to be missing from this excerpt

def _getAttribute(self, attrsD, name):
    # Fetch an attribute value, normalizing the name's namespace prefix first.
    return attrsD.get(self._mapToStandardPrefix(name))

def _isBase64(self, attrsD, contentparams):
    # Decide whether element content is base64-encoded: explicit
    # mode='base64' wins; textual and XML media types are not base64.
    if attrsD.get('mode', '') == 'base64':
        # NOTE(review): return line missing from this excerpt
    if self.contentparams['type'].startswith(u'text/'):
        # NOTE(review): return line missing from this excerpt
    if self.contentparams['type'].endswith(u'+xml'):
        # NOTE(review): return line missing from this excerpt
    if self.contentparams['type'].endswith(u'/xml'):
        # NOTE(review): return line missing from this excerpt

def _itsAnHrefDamnIt(self, attrsD):
    # Normalize the url/uri/href attribute spellings onto a single 'href' key.
    href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
    # NOTE(review): line(s) missing from this excerpt
    attrsD['href'] = href
    # NOTE(review): 'return attrsD' appears to be missing from this excerpt

def _save(self, key, value, overwrite=False):
    # Store a value on the current context; with overwrite=True an existing
    # value is replaced, otherwise the first value seen wins.
    context = self._getContext()
    # NOTE(review): 'if overwrite:' guard missing from this excerpt
    context[key] = value
    # NOTE(review): 'else:' branch marker missing from this excerpt
    context.setdefault(key, value)
def _start_rss(self, attrsD):
    # Map the rss/@version attribute onto an internal version token.
    versionmap = {'0.91': u'rss091u',
                  # NOTE(review): remaining versionmap entries missing from this excerpt
    #If we're here then this is an RSS feed.
    #If we don't have a version or have a version that starts with something
    #other than RSS then there's been a mistake. Correct it.
    if not self.version or not self.version.startswith(u'rss'):
        attr_version = attrsD.get('version', '')
        version = versionmap.get(attr_version)
        # NOTE(review): 'if version:' guard missing from this excerpt
        self.version = version
        elif attr_version.startswith('2.'):
            self.version = u'rss20'
        # NOTE(review): 'else:' missing from this excerpt
        self.version = u'rss'

def _start_channel(self, attrsD):
    # NOTE(review): line(s) missing from this excerpt
    self._cdf_common(attrsD)

def _cdf_common(self, attrsD):
    # Shared handling for CDF channel/item attributes: lastmod maps to
    # the modified date, href maps to a link.
    if attrsD.has_key('lastmod'):
        self._start_modified({})
        self.elementstack[-1][-1] = attrsD['lastmod']
        self._end_modified()
    if attrsD.has_key('href'):
        self._start_link({})
        self.elementstack[-1][-1] = attrsD['href']
        # NOTE(review): line(s) missing from this excerpt (presumably the
        # matching _end_link call -- confirm against upstream)

def _start_feed(self, attrsD):
    # Map the Atom feed/@version attribute onto an internal version token.
    # NOTE(review): line(s) missing from this excerpt
    versionmap = {'0.1': u'atom01',
                  # NOTE(review): remaining versionmap entries missing from this excerpt
    if not self.version:
        attr_version = attrsD.get('version')
        version = versionmap.get(attr_version)
        # NOTE(review): 'if version:' guard missing from this excerpt
        self.version = version
        # NOTE(review): 'else:' missing from this excerpt
        self.version = u'atom'

def _end_channel(self):
    # NOTE(review): body line(s) missing from this excerpt
_end_feed = _end_channel
def _start_image(self, attrsD):
    # Open an image element; an image inside an entry does not get a
    # feed-level 'image' dict.
    context = self._getContext()
    if not self.inentry:
        context.setdefault('image', FeedParserDict())
    # NOTE(review): line(s) missing from this excerpt (state bookkeeping)
    self.push('image', 0)

def _end_image(self):
    # NOTE(review): body line(s) missing from this excerpt

def _start_textinput(self, attrsD):
    # Open a textInput element and route subsequent data into its dict.
    context = self._getContext()
    context.setdefault('textinput', FeedParserDict())
    self.intextinput = 1
    # NOTE(review): line(s) missing from this excerpt
    self.push('textinput', 0)
_start_textInput = _start_textinput

def _end_textinput(self):
    self.pop('textinput')
    self.intextinput = 0
_end_textInput = _end_textinput

def _start_author(self, attrsD):
    # NOTE(review): line(s) missing from this excerpt
    self.push('author', 1)
    # Append a new FeedParserDict when expecting an author
    context = self._getContext()
    context.setdefault('authors', [])
    context['authors'].append(FeedParserDict())
_start_managingeditor = _start_author
_start_dc_author = _start_author
_start_dc_creator = _start_author
_start_itunes_author = _start_author

def _end_author(self):
    # NOTE(review): line(s) missing from this excerpt
    self._sync_author_detail()
_end_managingeditor = _end_author
_end_dc_author = _end_author
_end_dc_creator = _end_author
_end_itunes_author = _end_author

def _start_itunes_owner(self, attrsD):
    # itunes:owner fields are routed to the 'publisher' detail.
    self.inpublisher = 1
    self.push('publisher', 0)

def _end_itunes_owner(self):
    self.pop('publisher')
    self.inpublisher = 0
    self._sync_author_detail('publisher')
def _start_contributor(self, attrsD):
    """Open a contributor element: set state and add a fresh record."""
    self.incontributor = 1
    ctx = self._getContext()
    ctx.setdefault('contributors', []).append(FeedParserDict())
    self.push('contributor', 0)

def _end_contributor(self):
    """Close the contributor element and clear the state flag."""
    self.pop('contributor')
    self.incontributor = 0
def _start_dc_contributor(self, attrsD):
    # Dublin Core contributor: open a record and expect a bare name.
    self.incontributor = 1
    context = self._getContext()
    context.setdefault('contributors', [])
    context['contributors'].append(FeedParserDict())
    self.push('name', 0)

def _end_dc_contributor(self):
    # NOTE(review): line missing from this excerpt (presumably the
    # matching _end_name call -- confirm against upstream)
    self.incontributor = 0

def _start_name(self, attrsD):
    self.push('name', 0)
_start_itunes_name = _start_name

def _end_name(self):
    # Route the accumulated name to whichever construct is open.
    value = self.pop('name')
    if self.inpublisher:
        self._save_author('name', value, 'publisher')
    # NOTE(review): an intervening elif branch is missing from this excerpt
    self._save_author('name', value)
    elif self.incontributor:
        self._save_contributor('name', value)
    elif self.intextinput:
        context = self._getContext()
        context['name'] = value
_end_itunes_name = _end_name

def _start_width(self, attrsD):
    self.push('width', 0)

def _end_width(self):
    value = self.pop('width')
    # NOTE(review): line(s) missing from this excerpt (presumably integer
    # conversion with a fallback -- confirm against upstream)
    context = self._getContext()
    context['width'] = value

def _start_height(self, attrsD):
    self.push('height', 0)

def _end_height(self):
    value = self.pop('height')
    # NOTE(review): line(s) missing from this excerpt (presumably integer
    # conversion with a fallback -- confirm against upstream)
    context = self._getContext()
    context['height'] = value
def _start_url(self, attrsD):
    self.push('href', 1)
_start_homepage = _start_url
_start_uri = _start_url

# NOTE(review): the enclosing 'def _end_url(self):' line is missing from
# this excerpt; the statements below belong to it.
value = self.pop('href')
# NOTE(review): guard line missing from this excerpt
self._save_author('href', value)
elif self.incontributor:
    self._save_contributor('href', value)
_end_homepage = _end_url
# NOTE(review): line(s) missing from this excerpt

def _start_email(self, attrsD):
    self.push('email', 0)
_start_itunes_email = _start_email

def _end_email(self):
    # Route the accumulated e-mail address to the open construct.
    value = self.pop('email')
    if self.inpublisher:
        self._save_author('email', value, 'publisher')
    # NOTE(review): an intervening elif branch is missing from this excerpt
    self._save_author('email', value)
    elif self.incontributor:
        self._save_contributor('email', value)
_end_itunes_email = _end_email

def _getContext(self):
    # Return the dict new values should be written into: the open source,
    # image or textinput sub-dict, the current entry, or the feed itself.
    # NOTE(review): 'if' guard missing from this excerpt
    context = self.sourcedata
    elif self.inimage and self.feeddata.has_key('image'):
        context = self.feeddata['image']
    elif self.intextinput:
        context = self.feeddata['textinput']
    # NOTE(review): 'elif' line missing from this excerpt
    context = self.entries[-1]
    # NOTE(review): 'else:' missing from this excerpt
    context = self.feeddata
    # NOTE(review): 'return context' appears to be missing from this excerpt

def _save_author(self, key, value, prefix='author'):
    # Store one author field both in <prefix>_detail and on the newest
    # entry of the 'authors' list, then re-derive the display string.
    context = self._getContext()
    context.setdefault(prefix + '_detail', FeedParserDict())
    context[prefix + '_detail'][key] = value
    self._sync_author_detail()
    context.setdefault('authors', [FeedParserDict()])
    context['authors'][-1][key] = value

def _save_contributor(self, key, value):
    # Store one field on the newest contributor record.
    context = self._getContext()
    context.setdefault('contributors', [FeedParserDict()])
    context['contributors'][-1][key] = value

def _sync_author_detail(self, key='author'):
    # Keep context[key] (display string) and context[key+'_detail']
    # (structured name/email) consistent, whichever was populated first.
    context = self._getContext()
    detail = context.get('%s_detail' % key)
    # NOTE(review): guard line missing from this excerpt
    name = detail.get('name')
    email = detail.get('email')
    # NOTE(review): line(s) missing from this excerpt
    context[key] = u'%s (%s)' % (name, email)
    # NOTE(review): line(s) missing from this excerpt
    context[key] = email
    # NOTE(review): 'else:' branch missing from this excerpt
    author, email = context.get(key), None
    # NOTE(review): line(s) missing from this excerpt
    emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
    # NOTE(review): line(s) missing from this excerpt
    email = emailmatch.group(0)
    # probably a better way to do the following, but it passes all the tests
    author = author.replace(email, u'')
    author = author.replace(u'()', u'')
    # NOTE(review): the two identical-looking replaces below suggest that
    # HTML entity references (e.g. '&lt;&gt;') were decoded when this
    # excerpt was extracted -- verify literals against upstream.
    author = author.replace(u'<>', u'')
    author = author.replace(u'<>', u'')
    author = author.strip()
    if author and (author[0] == u'('):
        # NOTE(review): line missing from this excerpt (strips the paren)
    if author and (author[-1] == u')'):
        author = author[:-1]
    author = author.strip()
    # NOTE(review): line(s) missing from this excerpt
    context.setdefault('%s_detail' % key, FeedParserDict())
    # NOTE(review): line(s) missing from this excerpt
    context['%s_detail' % key]['name'] = author
    # NOTE(review): line(s) missing from this excerpt
    context['%s_detail' % key]['email'] = email
def _start_subtitle(self, attrsD):
    self.pushContent('subtitle', attrsD, u'text/plain', 1)
_start_tagline = _start_subtitle
_start_itunes_subtitle = _start_subtitle

def _end_subtitle(self):
    self.popContent('subtitle')
_end_tagline = _end_subtitle
_end_itunes_subtitle = _end_subtitle

def _start_rights(self, attrsD):
    self.pushContent('rights', attrsD, u'text/plain', 1)
_start_dc_rights = _start_rights
_start_copyright = _start_rights

def _end_rights(self):
    self.popContent('rights')
_end_dc_rights = _end_rights
_end_copyright = _end_rights

def _start_item(self, attrsD):
    # Open an RSS item / Atom entry: append a fresh entry dict.
    self.entries.append(FeedParserDict())
    self.push('item', 0)
    # NOTE(review): line(s) missing from this excerpt
    id = self._getAttribute(attrsD, 'rdf:about')
    # NOTE(review): line(s) missing from this excerpt
    context = self._getContext()
    # NOTE(review): line(s) missing from this excerpt
    self._cdf_common(attrsD)
_start_entry = _start_item

def _end_item(self):
    # NOTE(review): body line(s) missing from this excerpt
_end_entry = _end_item

def _start_dc_language(self, attrsD):
    self.push('language', 1)
_start_language = _start_dc_language

def _end_dc_language(self):
    # The language element also sets the parser's current language.
    self.lang = self.pop('language')
_end_language = _end_dc_language

def _start_dc_publisher(self, attrsD):
    self.push('publisher', 1)
_start_webmaster = _start_dc_publisher

def _end_dc_publisher(self):
    self.pop('publisher')
    self._sync_author_detail('publisher')
_end_webmaster = _end_dc_publisher

def _start_published(self, attrsD):
    self.push('published', 1)
_start_dcterms_issued = _start_published
_start_issued = _start_published

def _end_published(self):
    value = self.pop('published')
    self._save('published_parsed', _parse_date(value), overwrite=True)
_end_dcterms_issued = _end_published
_end_issued = _end_published

def _start_updated(self, attrsD):
    self.push('updated', 1)
_start_modified = _start_updated
_start_dcterms_modified = _start_updated
_start_pubdate = _start_updated
_start_dc_date = _start_updated
_start_lastbuilddate = _start_updated
def _end_updated(self):
    """Close an updated/modified/date element and store its parsed form."""
    raw = self.pop('updated')
    self._save('updated_parsed', _parse_date(raw), overwrite=True)
_end_modified = _end_updated
_end_dcterms_modified = _end_updated
_end_pubdate = _end_updated
_end_dc_date = _end_updated
_end_lastbuilddate = _end_updated
def _start_created(self, attrsD):
    self.push('created', 1)
_start_dcterms_created = _start_created

def _end_created(self):
    value = self.pop('created')
    self._save('created_parsed', _parse_date(value), overwrite=True)
_end_dcterms_created = _end_created

def _start_expirationdate(self, attrsD):
    self.push('expired', 1)

def _end_expirationdate(self):
    self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)

def _start_cc_license(self, attrsD):
    # Record a Creative Commons license (rdf:resource) as a rel='license' link.
    context = self._getContext()
    value = self._getAttribute(attrsD, 'rdf:resource')
    attrsD = FeedParserDict()
    attrsD['rel'] = u'license'
    # NOTE(review): 'if value:' guard missing from this excerpt
    attrsD['href']=value
    context.setdefault('links', []).append(attrsD)

def _start_creativecommons_license(self, attrsD):
    self.push('license', 1)
_start_creativeCommons_license = _start_creativecommons_license

def _end_creativecommons_license(self):
    # Convert the accumulated license URL into a rel='license' link and
    # drop the temporary 'license' key.
    value = self.pop('license')
    context = self._getContext()
    attrsD = FeedParserDict()
    attrsD['rel'] = u'license'
    # NOTE(review): 'if value:' guard missing from this excerpt
    attrsD['href'] = value
    context.setdefault('links', []).append(attrsD)
    del context['license']
_end_creativeCommons_license = _end_creativecommons_license
def _addXFN(self, relationships, href, name):
    # Record an XFN relationship, de-duplicating identical entries.
    context = self._getContext()
    xfn = context.setdefault('xfn', [])
    value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
    if value not in xfn:
        # NOTE(review): append line missing from this excerpt

def _addTag(self, term, scheme, label):
    # Record a category/tag triple, skipping all-empty triples and duplicates.
    context = self._getContext()
    tags = context.setdefault('tags', [])
    if (not term) and (not scheme) and (not label):
        # NOTE(review): 'return' line missing from this excerpt
    value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
    if value not in tags:
        # NOTE(review): append line missing from this excerpt

def _start_category(self, attrsD):
    # A category may carry its data as attributes (Atom term/scheme/label,
    # RSS domain) and/or as element text (handled in _end_category).
    term = attrsD.get('term')
    scheme = attrsD.get('scheme', attrsD.get('domain'))
    label = attrsD.get('label')
    self._addTag(term, scheme, label)
    self.push('category', 1)
_start_dc_subject = _start_category
_start_keywords = _start_category

def _start_media_category(self, attrsD):
    # Media RSS categories default to Yahoo's category scheme.
    attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
    self._start_category(attrsD)
def _end_itunes_keywords(self):
    """Split whitespace-separated iTunes keywords into individual tags."""
    keywords = self.pop('itunes_keywords')
    for keyword in keywords.split():
        self._addTag(keyword, u'http://www.itunes.com/', None)

def _start_itunes_category(self, attrsD):
    """Record an iTunes category as a tag, then open a category element."""
    text = attrsD.get('text')
    self._addTag(text, u'http://www.itunes.com/', None)
    self.push('category', 1)
def _end_category(self):
    # Element text may name the category; backfill the last tag's term
    # if the start handler created a tag without one.
    value = self.pop('category')
    # NOTE(review): line(s) missing from this excerpt
    context = self._getContext()
    tags = context['tags']
    if value and len(tags) and not tags[-1]['term']:
        tags[-1]['term'] = value
    # NOTE(review): 'else:' missing from this excerpt
    self._addTag(value, None, None)
_end_dc_subject = _end_category
_end_keywords = _end_category
_end_itunes_category = _end_category
_end_media_category = _end_category

def _start_cloud(self, attrsD):
    self._getContext()['cloud'] = FeedParserDict(attrsD)

def _start_link(self, attrsD):
    # Normalize a link element: default rel/type, resolve the href, and
    # record it in context['links']; an alternate link with an HTML type
    # also becomes the primary context['link'].
    attrsD.setdefault('rel', u'alternate')
    if attrsD['rel'] == u'self':
        attrsD.setdefault('type', u'application/atom+xml')
    # NOTE(review): 'else:' missing from this excerpt
    attrsD.setdefault('type', u'text/html')
    context = self._getContext()
    attrsD = self._itsAnHrefDamnIt(attrsD)
    if attrsD.has_key('href'):
        attrsD['href'] = self.resolveURI(attrsD['href'])
    expectingText = self.infeed or self.inentry or self.insource
    context.setdefault('links', [])
    if not (self.inentry and self.inimage):
        context['links'].append(FeedParserDict(attrsD))
    if attrsD.has_key('href'):
        # NOTE(review): line(s) missing from this excerpt
        if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
            context['link'] = attrsD['href']
    # NOTE(review): 'else:' missing from this excerpt
    self.push('link', expectingText)
def _end_link(self):
    value = self.pop('link')
    context = self._getContext()

def _start_guid(self, attrsD):
    # isPermaLink defaults to true per RSS 2.0.
    self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
    # NOTE(review): line(s) missing from this excerpt

def _end_guid(self):
    value = self.pop('id')
    self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
    # NOTE(review): guard line missing from this excerpt
    # guid acts as link, but only if 'ispermalink' is not present or is 'true',
    # and only if the item doesn't already have a link element
    self._save('link', value)

def _start_title(self, attrsD):
    # NOTE(review): guard line missing from this excerpt; the early return
    # below is presumably conditional -- confirm against upstream
    return self.unknown_starttag('title', attrsD.items())
    self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
_start_dc_title = _start_title
_start_media_title = _start_title

def _end_title(self):
    # NOTE(review): line(s) missing from this excerpt
    value = self.popContent('title')
    # NOTE(review): line(s) missing from this excerpt
    context = self._getContext()
    # NOTE(review): line(s) missing from this excerpt
_end_dc_title = _end_title

def _end_media_title(self):
    # media:title must not clobber the hasTitle flag set by the real title.
    hasTitle = self.hasTitle
    # NOTE(review): line(s) missing from this excerpt
    self.hasTitle = hasTitle

def _start_description(self, attrsD):
    # A description that follows a summary is treated as full content.
    context = self._getContext()
    if context.has_key('summary'):
        self._summaryKey = 'content'
        self._start_content(attrsD)
    # NOTE(review): 'else:' missing from this excerpt
    self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
_start_dc_description = _start_description

def _start_abstract(self, attrsD):
    self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)

def _end_description(self):
    if self._summaryKey == 'content':
        # NOTE(review): line missing from this excerpt
    # NOTE(review): 'else:' missing from this excerpt
    value = self.popContent('description')
    self._summaryKey = None
_end_abstract = _end_description
_end_dc_description = _end_description

def _start_info(self, attrsD):
    self.pushContent('info', attrsD, u'text/plain', 1)
_start_feedburner_browserfriendly = _start_info

def _end_info(self):
    self.popContent('info')
_end_feedburner_browserfriendly = _end_info

def _start_generator(self, attrsD):
    # NOTE(review): guard line missing from this excerpt
    attrsD = self._itsAnHrefDamnIt(attrsD)
    if attrsD.has_key('href'):
        attrsD['href'] = self.resolveURI(attrsD['href'])
    self._getContext()['generator_detail'] = FeedParserDict(attrsD)
    self.push('generator', 1)

def _end_generator(self):
    value = self.pop('generator')
    context = self._getContext()
    if context.has_key('generator_detail'):
        context['generator_detail']['name'] = value

def _start_admin_generatoragent(self, attrsD):
    # The value lives in rdf:resource; synthesize a push/pop around it.
    self.push('generator', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    # NOTE(review): 'if value:' guard missing from this excerpt
    self.elementstack[-1][2].append(value)
    self.pop('generator')
    self._getContext()['generator_detail'] = FeedParserDict({'href': value})

def _start_admin_errorreportsto(self, attrsD):
    # Same rdf:resource pattern as admin:generatorAgent.
    self.push('errorreportsto', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    # NOTE(review): 'if value:' guard missing from this excerpt
    self.elementstack[-1][2].append(value)
    self.pop('errorreportsto')

def _start_summary(self, attrsD):
    # A second summary after one exists is treated as full content.
    context = self._getContext()
    if context.has_key('summary'):
        self._summaryKey = 'content'
        self._start_content(attrsD)
    # NOTE(review): 'else:' missing from this excerpt
    self._summaryKey = 'summary'
    self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
_start_itunes_summary = _start_summary

def _end_summary(self):
    if self._summaryKey == 'content':
        # NOTE(review): line missing from this excerpt
    # NOTE(review): 'else:' missing from this excerpt
    self.popContent(self._summaryKey or 'summary')
    self._summaryKey = None
_end_itunes_summary = _end_summary
def _start_enclosure(self, attrsD):
    """Record an enclosure as a rel='enclosure' link on the current context."""
    attrsD = self._itsAnHrefDamnIt(attrsD)
    attrsD['rel'] = u'enclosure'
    links = self._getContext().setdefault('links', [])
    links.append(FeedParserDict(attrsD))
def _start_source(self, attrsD):
    # NOTE(review): guard line missing from this excerpt
    # This means that we're processing a source element from an RSS 2.0 feed
    self.sourcedata['href'] = attrsD[u'url']
    self.push('source', 1)
    # NOTE(review): line(s) missing from this excerpt (state bookkeeping)

def _end_source(self):
    # NOTE(review): line(s) missing from this excerpt
    value = self.pop('source')
    # NOTE(review): guard line missing from this excerpt
    self.sourcedata['title'] = value
    self._getContext()['source'] = copy.deepcopy(self.sourcedata)
    self.sourcedata.clear()

def _start_content(self, attrsD):
    self.pushContent('content', attrsD, u'text/plain', 1)
    src = attrsD.get('src')
    # NOTE(review): 'if src:' guard missing from this excerpt
    self.contentparams['src'] = src
    self.push('content', 1)

def _start_body(self, attrsD):
    self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
_start_xhtml_body = _start_body

def _start_content_encoded(self, attrsD):
    self.pushContent('content', attrsD, u'text/html', 1)
_start_fullitem = _start_content_encoded

def _end_content(self):
    # Plain-text or HTML content doubles as the summary when none was given.
    copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
    value = self.popContent('content')
    # NOTE(review): 'if copyToSummary:' guard missing from this excerpt
    self._save('summary', value)
_end_body = _end_content
_end_xhtml_body = _end_content
_end_content_encoded = _end_content
_end_fullitem = _end_content

def _start_itunes_image(self, attrsD):
    self.push('itunes_image', 0)
    if attrsD.get('href'):
        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
_start_itunes_link = _start_itunes_image

def _end_itunes_block(self):
    # itunes:block is normalized to integer 1 ('yes') or 0 (anything else).
    value = self.pop('itunes_block', 0)
    self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
def _end_itunes_explicit(self):
    """Map itunes:explicit text onto True ('yes'), False ('clean'), else None.

    False and None both evaluate as False, so the difference can be ignored
    by applications that only need to know if the content is explicit.
    """
    value = self.pop('itunes_explicit', 0)
    if value == 'yes':
        explicit = True
    elif value == 'clean':
        explicit = False
    else:
        explicit = None
    self._getContext()['itunes_explicit'] = explicit
def _start_media_content(self, attrsD):
    """Collect a media:content attribute dict on the current context."""
    self._getContext().setdefault('media_content', []).append(attrsD)

def _start_media_thumbnail(self, attrsD):
    """Collect a media:thumbnail attribute dict; element text may carry the url."""
    ctx = self._getContext()
    ctx.setdefault('media_thumbnail', [])
    self.push('url', 1) # new
    ctx['media_thumbnail'].append(attrsD)
def _end_media_thumbnail(self):
    """Close media:thumbnail; backfill its 'url' from element text if absent.

    The start handler appended this thumbnail's attribute dict to
    context['media_thumbnail'].  If the element had no url attribute but
    carried non-blank text content, that text becomes the url.
    """
    url = self.pop('url')
    context = self._getContext()
    # 'is not None' and truthiness instead of '!= None' / len(...) != 0;
    # 'in' instead of has_key (equivalent here, and the modern idiom).
    if url is not None and url.strip():
        thumbnail = context['media_thumbnail'][-1]
        if 'url' not in thumbnail:
            thumbnail['url'] = url
def _start_media_player(self, attrsD):
    """Open media:player, storing its attributes on the current context."""
    self.push('media_player', 0)
    self._getContext()['media_player'] = FeedParserDict(attrsD)

def _end_media_player(self):
    """Close media:player; its element text becomes the 'content' field."""
    text = self.pop('media_player')
    self._getContext()['media_player']['content'] = text
def _start_newlocation(self, attrsD):
    self.push('newlocation', 1)

def _end_newlocation(self):
    # RSS feed-relocation pointer; only meaningful at the feed level.
    url = self.pop('newlocation')
    context = self._getContext()
    # don't set newlocation if the context isn't right
    if context is not self.feeddata:
        # NOTE(review): 'return' line missing from this excerpt
    context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    # SAX-based parser used when the feed is well-formed XML.

    def __init__(self, baseuri, baselang, encoding):
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        # NOTE(review): line(s) missing from this excerpt

    def startPrefixMapping(self, prefix, uri):
        # NOTE(review): line(s) missing from this excerpt
        # Jython uses '' instead of None; standardize on None
        prefix = prefix or None
        self.trackNamespace(prefix, uri)
        if prefix and uri == 'http://www.w3.org/1999/xlink':
            self.decls['xmlns:' + prefix] = uri

    def startElementNS(self, name, qname, attrs):
        # Translate a namespaced SAX start event into unknown_starttag().
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find(u'backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            namespace = u'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        # NOTE(review): 'else:' branch missing from this excerpt
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
        localname = str(localname).lower()

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD, self.decls = self.decls, {}
        if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
            attrsD['xmlns']=namespace
        if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
            attrsD['xmlns']=namespace

        # NOTE(review): guard line missing from this excerpt
        localname = prefix.lower() + ':' + localname
        elif namespace and not qname: #Expat
            for name,value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
                    # NOTE(review): line(s) missing from this excerpt

        for (namespace, attrlocalname), attrvalue in attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            # NOTE(review): guard line missing from this excerpt
            attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        # Translate a namespaced SAX end event into unknown_endtag().
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        # NOTE(review): 'else:' branch missing from this excerpt
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        # NOTE(review): guard line missing from this excerpt
        localname = prefix + ':' + localname
        elif namespace and not qname: #Expat
            for name,value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
                    # NOTE(review): line(s) missing from this excerpt
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        # NOTE(review): body line(s) missing from this excerpt

    # drv_libxml2 calls warning() in some cases
    # NOTE(review): line(s) missing from this excerpt

    def fatalError(self, exc):
        # NOTE(review): body line(s) missing from this excerpt
1833 class _BaseHTMLProcessor(sgmllib.SGMLParser):
1834 special = re.compile('''[<>'"]''')
1835 bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
1836 elements_no_end_tag = [
1837 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
1838 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
1839 'source', 'track', 'wbr'
1842 def __init__(self, encoding, _type):
1843 self.encoding = encoding
1845 sgmllib.SGMLParser.__init__(self)
1849 sgmllib.SGMLParser.reset(self)
1851 def _shorttag_replace(self, match):
1852 tag = match.group(1)
1853 if tag in self.elements_no_end_tag:
1854 return '<' + tag + ' />'
1856 return '<' + tag + '></' + tag + '>'
1858 # By declaring these methods and overriding their compiled code
1859 # with the code from sgmllib, the original code will execute in
1860 # feedparser's scope instead of sgmllib's. This means that the
1861 # `tagfind` and `charref` regular expressions will be found as
1862 # they're declared above, not as they're declared in sgmllib.
1863 def goahead(self, i):
1865 goahead.func_code = sgmllib.SGMLParser.goahead.func_code
1867 def __parse_starttag(self, i):
1869 __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
1871 def parse_starttag(self,i):
1872 j = self.__parse_starttag(i)
1873 if self._type == 'application/xhtml+xml':
1874 if j>2 and self.rawdata[j-2:j]=='/>':
1875 self.unknown_endtag(self.lasttag)
1878 def feed(self, data):
1879 data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'<!\1', data)
1880 #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
1881 data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
1882 data = data.replace(''', "'")
1883 data = data.replace('"', '"')
1888 self.encoding = self.encoding + u'_INVALID_PYTHON_3'
1890 if self.encoding and isinstance(data, unicode):
1891 data = data.encode(self.encoding)
1892 sgmllib.SGMLParser.feed(self, data)
1893 sgmllib.SGMLParser.close(self)
1895 def normalize_attrs(self, attrs):
1898 # utility method to be called by descendants
1899 attrs = dict([(k.lower(), v) for k, v in attrs]).items()
1900 attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1904 def unknown_starttag(self, tag, attrs):
1905 # called for each start tag
1906 # attrs is a list of (attr, value) tuples
1907 # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
1911 for key, value in attrs:
1912 value=value.replace('>','>').replace('<','<').replace('"','"')
1913 value = self.bare_ampersand.sub("&", value)
1914 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
1915 if not isinstance(value, unicode):
1916 value = value.decode(self.encoding, 'ignore')
1918 # Currently, in Python 3 the key is already a str, and cannot be decoded again
1919 uattrs.append((unicode(key, self.encoding), value))
1921 uattrs.append((key, value))
1922 strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
1925 strattrs = strattrs.encode(self.encoding)
1926 except (UnicodeEncodeError, LookupError):
1928 if tag in self.elements_no_end_tag:
1929 self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
1931 self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
1933 def unknown_endtag(self, tag):
1934 # called for each end tag, e.g. for </pre>, tag will be 'pre'
1935 # Reconstruct the original end tag.
1936 if tag not in self.elements_no_end_tag:
1937 self.pieces.append("</%(tag)s>" % locals())
1939 def handle_charref(self, ref):
1940 # called for each character reference, e.g. for ' ', ref will be '160'
1941 # Reconstruct the original character reference.
1942 if ref.startswith('x'):
1943 value = unichr(int(ref[1:],16))
1945 value = unichr(int(ref))
1947 if value in _cp1252.keys():
1948 self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
1950 self.pieces.append('&#%(ref)s;' % locals())
1952 def handle_entityref(self, ref):
1953 # called for each entity reference, e.g. for '©', ref will be 'copy'
1954 # Reconstruct the original entity reference.
1955 if name2codepoint.has_key(ref):
1956 self.pieces.append('&%(ref)s;' % locals())
1958 self.pieces.append('&%(ref)s' % locals())
1960 def handle_data(self, text):
1961 # called for each block of plain text, i.e. outside of any tag and
1962 # not containing any character or entity references
1963 # Store the original text verbatim.
1964 self.pieces.append(text)
1966 def handle_comment(self, text):
1967 # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1968 # Reconstruct the original comment.
1969 self.pieces.append('<!--%(text)s-->' % locals())
1971 def handle_pi(self, text):
1972 # called for each processing instruction, e.g. <?instruction>
1973 # Reconstruct original processing instruction.
1974 self.pieces.append('<?%(text)s>' % locals())
1976 def handle_decl(self, text):
1977 # called for the DOCTYPE, if present, e.g.
1978 # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1979 # "http://www.w3.org/TR/html4/loose.dtd">
1980 # Reconstruct original DOCTYPE
1981 self.pieces.append('<!%(text)s>' % locals())
1983 _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
1984 def _scan_name(self, i, declstartpos):
1985 rawdata = self.rawdata
1989 m = self._new_declname_match(rawdata, i)
1993 if (i + len(s)) == n:
1994 return None, -1 # end of buffer
1995 return name.lower(), m.end()
1997 self.handle_data(rawdata)
1998 # self.updatepos(declstartpos, i)
def convert_charref(self, name):
    # Render a character reference back into '&#NNN;' form.
    piece = '&#%s;' % name
    return piece
def convert_entityref(self, name):
    # Render an entity reference back into '&name;' form.
    piece = '&%s;' % name
    return piece
def output(self):
    '''Return processed HTML as a single string'''
    return ''.join([str(p) for p in self.pieces])
def parse_declaration(self, i):
    # Delegate to sgmllib, but tolerate malformed declarations that
    # would otherwise raise and abort parsing altogether.
    try:
        return sgmllib.SGMLParser.parse_declaration(self, i)
    except sgmllib.SGMLParseError:
        # escape the doctype declaration and continue parsing
        self.handle_data('&lt;')
        return i+1
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        self.entities = entities

    def decodeEntities(self, element, data):
        # Normalize numeric character references for the five XML special
        # characters to their named equivalents; then, for non-XML content
        # types only, resolve the named references to literal characters.
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data

    def strattrs(self, attrs):
        # Serialize (name, value) attribute pairs, escaping embedded quotes.
        return ''.join([' %s="%s"' % (n, v.replace('"', '&quot;')) for n, v in attrs])
class _MicroformatsParser:
    # Property-type constants used by getPropertyValue()
    STRING = 1
    DATE = 2
    URI = 3
    NODE = 4
    EMAIL = 5

    known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
    known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']

    def __init__(self, data, baseuri, encoding):
        self.document = BeautifulSoup.BeautifulSoup(data)
        self.baseuri = baseuri
        self.encoding = encoding
        if isinstance(data, unicode):
            data = data.encode(encoding)
        self.tags = []
        self.enclosures = []
        self.xfn = []
        self.vcard = None

    def vcardEscape(self, s):
        # escape comma, semicolon and newline per RFC 2426 section 5.8.4
        if isinstance(s, basestring):
            s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
        return s

    def vcardFold(self, s):
        # fold long lines at 75 octets per RFC 2426 section 5.8.1;
        # continuation lines are prefixed with a single space
        s = re.sub(';+$', '', s)
        sFolded = ''
        iMax = 75
        sPrefix = ''
        while len(s) > iMax:
            sFolded += sPrefix + s[:iMax] + '\n'
            s = s[iMax:]
            sPrefix = ' '
            iMax = 74
        sFolded += sPrefix + s
        return sFolded

    def normalize(self, s):
        return re.sub(r'\s+', ' ', s).strip()

    def unique(self, aList):
        # order-preserving de-duplication
        results = []
        for element in aList:
            if element not in results:
                results.append(element)
        return results

    def toISO8601(self, dt):
        return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)

    def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
        # Find the value(s) of a microformat property below elmRoot,
        # following the hCard parsing rules (class matching, abbr/title,
        # a/href, img/src-alt, implied values from text content).
        all = lambda x: 1
        sProperty = sProperty.lower()
        bFound = 0
        bNormalize = 1
        propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
        if bAllowMultiple and (iPropertyType != self.NODE):
            snapResults = []
            containers = elmRoot(['ul', 'ol'], propertyMatch)
            for container in containers:
                snapResults.extend(container('li'))
            bFound = (len(snapResults) != 0)
        if not bFound:
            snapResults = elmRoot(all, propertyMatch)
            bFound = (len(snapResults) != 0)
        if (not bFound) and (sProperty == 'value'):
            snapResults = elmRoot('pre')
            bFound = (len(snapResults) != 0)
            bNormalize = not bFound
        if not bFound:
            snapResults = [elmRoot]
            bFound = (len(snapResults) != 0)
        arFilter = []
        if sProperty == 'vcard':
            # filter out nested vcards -- they belong to another card
            snapFilter = elmRoot(all, propertyMatch)
            for node in snapFilter:
                if node.findParent(all, propertyMatch):
                    arFilter.append(node)
        arResults = []
        for node in snapResults:
            if node not in arFilter:
                arResults.append(node)
        bFound = (len(arResults) != 0)
        if not bFound:
            # nothing matched: return the type-appropriate empty value
            if bAllowMultiple:
                return []
            elif iPropertyType == self.STRING:
                return ''
            elif iPropertyType == self.DATE:
                return None
            elif iPropertyType == self.URI:
                return ''
            elif iPropertyType == self.NODE:
                return None
            else:
                return None
        arValues = []
        for elmResult in arResults:
            sValue = None
            if iPropertyType == self.NODE:
                if bAllowMultiple:
                    arValues.append(elmResult)
                    continue
                else:
                    return elmResult
            sNodeName = elmResult.name.lower()
            if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
                sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'abbr'):
                sValue = elmResult.get('title')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (iPropertyType == self.URI):
                if sNodeName == 'a':
                    sValue = elmResult.get('href')
                elif sNodeName == 'img':
                    sValue = elmResult.get('src')
                elif sNodeName == 'object':
                    sValue = elmResult.get('data')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'img'):
                sValue = elmResult.get('alt')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue:
                # fall back on the element's text content, stripped of markup
                sValue = elmResult.renderContents()
                sValue = re.sub(r'<\S[^>]*>', '', sValue)
                sValue = sValue.replace('\r\n', '\n')
                sValue = sValue.replace('\r', '\n')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue:
                continue
            if iPropertyType == self.DATE:
                sValue = _parse_date_iso8601(sValue)
            if bAllowMultiple:
                arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
            else:
                return bAutoEscape and self.vcardEscape(sValue) or sValue
        return arValues

    def findVCards(self, elmRoot, bAgentParsing=0):
        # Serialize every hCard below elmRoot as a vCard 3.0 string.
        sVCards = ''

        if not bAgentParsing:
            arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
        else:
            arCards = [elmRoot]

        for elmCard in arCards:
            arLines = []

            def processSingleString(sProperty):
                sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
                if sValue:
                    arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
                return sValue or u''

            def processSingleURI(sProperty):
                sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
                if sValue:
                    sContentType = ''
                    sEncoding = ''
                    sValueKey = ''
                    if sValue.startswith('data:'):
                        sEncoding = ';ENCODING=b'
                        sContentType = sValue.split(';')[0].split('/').pop()
                        sValue = sValue.split(',', 1).pop()
                    else:
                        elmValue = self.getPropertyValue(elmCard, sProperty)
                        if elmValue:
                            if sProperty != 'url':
                                sValueKey = ';VALUE=uri'
                            sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
                    sContentType = sContentType.upper()
                    if sContentType == 'OCTET-STREAM':
                        sContentType = ''
                    if sContentType:
                        sContentType = ';TYPE=' + sContentType.upper()
                    arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))

            def processTypeValue(sProperty, arDefaultType, arForceType=None):
                arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
                for elmResult in arResults:
                    arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
                    if arForceType:
                        arType = self.unique(arForceType + arType)
                    if not arType:
                        arType = arDefaultType
                    sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
                    if sValue:
                        arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))

            # AGENT
            # must do this before all other properties because it is destructive
            # (removes nested class="vcard" nodes so they don't interfere with
            # this vcard's other properties)
            arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
            for elmAgent in arAgent:
                if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
                    sAgentValue = self.findVCards(elmAgent, 1) + '\n'
                    sAgentValue = sAgentValue.replace('\n', '\\n')
                    sAgentValue = sAgentValue.replace(';', '\\;')
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT:' + sAgentValue))
                    # Completely remove the agent element from the parse tree
                    elmAgent.extract()
                else:
                    sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1)
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))

            # FN (full name)
            sFN = processSingleString('fn')

            # N (name)
            elmName = self.getPropertyValue(elmCard, 'n')
            if elmName:
                sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
                sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
                arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
                arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
                arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
                arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
                                              sGivenName + ';' +
                                              ','.join(arAdditionalNames) + ';' +
                                              ','.join(arHonorificPrefixes) + ';' +
                                              ','.join(arHonorificSuffixes)))
            elif sFN:
                # implied "N" optimization
                # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
                arNames = self.normalize(sFN).split()
                if len(arNames) == 2:
                    bFamilyNameFirst = (arNames[0].endswith(',') or
                                        len(arNames[1]) == 1 or
                                        ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
                    if bFamilyNameFirst:
                        arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
                    else:
                        arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))

            # SORT-STRING
            sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
            if sSortString:
                arLines.append(self.vcardFold('SORT-STRING:' + sSortString))

            # NICKNAME
            arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
            if arNickname:
                arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))

            # PHOTO
            processSingleURI('photo')

            # BDAY
            dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
            if dtBday:
                arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))

            # ADR (address)
            arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
            for elmAdr in arAdr:
                arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
                if not arType:
                    arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
                sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
                sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
                sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
                sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
                sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
                sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
                sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
                arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
                                              sPostOfficeBox + ';' +
                                              sExtendedAddress + ';' +
                                              sStreetAddress + ';' +
                                              sLocality + ';' +
                                              sRegion + ';' +
                                              sPostalCode + ';' +
                                              sCountryName))

            # LABEL
            processTypeValue('label', ['intl','postal','parcel','work'])

            # TEL (phone number)
            processTypeValue('tel', ['voice'])

            # EMAIL
            processTypeValue('email', ['internet'], ['internet'])

            # MAILER
            processSingleString('mailer')

            # TZ (timezone)
            processSingleString('tz')

            # GEO (geographical information)
            elmGeo = self.getPropertyValue(elmCard, 'geo')
            if elmGeo:
                sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
                sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
                arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))

            # TITLE
            processSingleString('title')

            # ROLE
            processSingleString('role')

            # LOGO
            processSingleURI('logo')

            # ORG (organization)
            elmOrg = self.getPropertyValue(elmCard, 'org')
            if elmOrg:
                sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
                if not sOrganizationName:
                    # implied "organization-name" optimization
                    # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
                    sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
                    if sOrganizationName:
                        arLines.append(self.vcardFold('ORG:' + sOrganizationName))
                else:
                    arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
                    arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))

            # CATEGORY
            arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
            if arCategory:
                arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))

            # NOTE
            processSingleString('note')

            # REV
            processSingleString('rev')

            # SOUND
            processSingleURI('sound')

            # UID
            processSingleString('uid')

            # URL
            processSingleURI('url')

            # CLASS
            processSingleString('class')

            # KEY
            processSingleURI('key')

            if arLines:
                arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard']
                # XXX - this is super ugly; properly fix this with issue 148
                for i, s in enumerate(arLines):
                    if not isinstance(s, unicode):
                        arLines[i] = s.decode('utf-8', 'ignore')
                sVCards += u'\n'.join(arLines) + u'\n'

        return sVCards.strip()

    def isProbablyDownloadable(self, elm):
        # Heuristic: treat a link as an enclosure when its MIME type or
        # file extension suggests binary media content.
        attrsD = elm.attrMap
        if not attrsD.has_key('href'):
            return 0
        linktype = attrsD.get('type', '').strip()
        if linktype.startswith('audio/') or \
           linktype.startswith('video/') or \
           (linktype.startswith('application/') and not linktype.endswith('xml')):
            return 1
        path = urlparse.urlparse(attrsD['href'])[2]
        if path.find('.') == -1:
            return 0
        fileext = path.split('.').pop().lower()
        return fileext in self.known_binary_extensions

    def findTags(self):
        # rel-tag: the tag term is the last non-empty path segment; the
        # scheme is everything up to (and including) that segment's parent.
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
            href = elm.get('href')
            if not href:
                continue
            urlscheme, domain, path, params, query, fragment = \
                       urlparse.urlparse(_urljoin(self.baseuri, href))
            segments = path.split('/')
            tag = segments.pop()
            if tag == '':
                tag = segments.pop()
            tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
            if not tagscheme.endswith('/'):
                tagscheme += '/'
            self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))

    def findEnclosures(self):
        # rel-enclosure, plus the isProbablyDownloadable() heuristic
        all = lambda x: 1
        enclosure_match = re.compile(r'\benclosure\b')
        for elm in self.document(all, {'href': re.compile(r'.+')}):
            if not enclosure_match.search(elm.get('rel', u'')) and not self.isProbablyDownloadable(elm):
                continue
            if elm.attrMap not in self.enclosures:
                self.enclosures.append(elm.attrMap)
                if elm.string and not elm.get('title'):
                    self.enclosures[-1]['title'] = elm.string

    def findXFN(self):
        # XFN: collect known relationship values from rel attributes
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
            rels = elm.get('rel', u'').split()
            xfn_rels = []
            for rel in rels:
                if rel in self.known_xfn_relationships:
                    xfn_rels.append(rel)
            if xfn_rels:
                self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
def _parseMicroformats(htmlSource, baseURI, encoding):
    # Microformats extraction requires BeautifulSoup; silently skip
    # microformats parsing when it is not installed.
    if not BeautifulSoup:
        return
    try:
        p = _MicroformatsParser(htmlSource, baseURI, encoding)
    except UnicodeEncodeError:
        # sgmllib throws this exception when performing lookups of tags
        # with non-ASCII characters in them.
        return
    p.vcard = p.findVCards(p.document)
    p.findTags()
    p.findEnclosures()
    p.findXFN()
    return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
class _RelativeURIResolver(_BaseHTMLProcessor):
    # (tag, attribute) pairs whose values are URIs and must be resolved
    # against the document base URI
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    # Without sgmllib there is no HTML parser to run, so pass the
    # markup through unchanged.
    if not _SGML_AVAILABLE:
        return htmlSource

    p = _RelativeURIResolver(baseURI, encoding, _type)
    p.feed(htmlSource)
    return p.output()
def _makeSafeAbsoluteURI(base, rel=None):
    # Join rel onto base and return the result only if its scheme is in
    # ACCEPTABLE_URI_SCHEMES; otherwise return an empty string.
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if not rel:
        scheme = urlparse.urlparse(base)[0]
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return u''
    uri = _urljoin(base, rel)
    if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return uri
2553 class _HTMLSanitizer(_BaseHTMLProcessor):
2554 acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
2555 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
2556 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
2557 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
2558 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
2559 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
2560 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
2561 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
2562 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
2563 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
2564 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
2565 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
2566 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']
2568 acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
2569 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
2570 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
2571 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
2572 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
2573 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
2574 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
2575 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
2576 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
2577 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
2578 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
2579 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
2580 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
2581 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
2582 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
2583 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
2584 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
2585 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
2586 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
2587 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
2588 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
2591 unacceptable_elements_with_end_tag = ['script', 'applet', 'style']
2593 acceptable_css_properties = ['azimuth', 'background-color',
2594 'border-bottom-color', 'border-collapse', 'border-color',
2595 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
2596 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
2597 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
2598 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
2599 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
2600 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
2601 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
2602 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
2603 'white-space', 'width']
2605 # survey of common keywords found in feeds
2606 acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
2607 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
2608 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
2609 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
2610 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
2611 'transparent', 'underline', 'white', 'yellow']
2613 valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
2614 '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
2616 mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
2617 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
2618 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
2619 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
2620 'munderover', 'none', 'semantics']
2622 mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
2623 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
2624 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
2625 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
2626 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
2627 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
2628 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
2629 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
2630 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']
2632 # svgtiny - foreignObject + linearGradient + radialGradient + stop
2633 svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
2634 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
2635 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
2636 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
2637 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
2638 'svg', 'switch', 'text', 'title', 'tspan', 'use']
2640 # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
2641 svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
2642 'arabic-form', 'ascent', 'attributeName', 'attributeType',
2643 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
2644 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
2645 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
2646 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
2647 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
2648 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
2649 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
2650 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
2651 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
2652 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
2653 'overline-position', 'overline-thickness', 'panose-1', 'path',
2654 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
2655 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
2656 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
2657 'stop-color', 'stop-opacity', 'strikethrough-position',
2658 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
2659 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
2660 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
2661 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
2662 'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
2663 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
2664 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
2665 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
2666 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
2672 acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
2673 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
2677 _BaseHTMLProcessor.reset(self)
2678 self.unacceptablestack = 0
2682 def unknown_starttag(self, tag, attrs):
2683 acceptable_attributes = self.acceptable_attributes
2685 if not tag in self.acceptable_elements or self.svgOK:
2686 if tag in self.unacceptable_elements_with_end_tag:
2687 self.unacceptablestack += 1
2689 # add implicit namespaces to html5 inline svg/mathml
2690 if self._type.endswith('html'):
2691 if not dict(attrs).get('xmlns'):
2693 attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
2695 attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
2697 # not otherwise acceptable, perhaps it is MathML or SVG?
2698 if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
2700 if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
2703 # chose acceptable attributes based on tag class, else bail
2704 if self.mathmlOK and tag in self.mathml_elements:
2705 acceptable_attributes = self.mathml_attributes
2706 elif self.svgOK and tag in self.svg_elements:
2707 # for most vocabularies, lowercasing is a good idea. Many
2708 # svg elements, however, are camel case
2709 if not self.svg_attr_map:
2710 lower=[attr.lower() for attr in self.svg_attributes]
2711 mix=[a for a in self.svg_attributes if a not in lower]
2712 self.svg_attributes = lower
2713 self.svg_attr_map = dict([(a.lower(),a) for a in mix])
2715 lower=[attr.lower() for attr in self.svg_elements]
2716 mix=[a for a in self.svg_elements if a not in lower]
2717 self.svg_elements = lower
2718 self.svg_elem_map = dict([(a.lower(),a) for a in mix])
2719 acceptable_attributes = self.svg_attributes
2720 tag = self.svg_elem_map.get(tag,tag)
2721 keymap = self.svg_attr_map
2722 elif not tag in self.acceptable_elements:
2725 # declare xlink namespace, if needed
2726 if self.mathmlOK or self.svgOK:
2727 if filter(lambda (n,v): n.startswith('xlink:'),attrs):
2728 if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
2729 attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
2732 for key, value in self.normalize_attrs(attrs):
2733 if key in acceptable_attributes:
2734 key=keymap.get(key,key)
2735 # make sure the uri uses an acceptable uri scheme
2737 value = _makeSafeAbsoluteURI(value)
2738 clean_attrs.append((key,value))
2740 clean_value = self.sanitize_style(value)
2742 clean_attrs.append((key,clean_value))
2743 _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
2745 def unknown_endtag(self, tag):
2746 if not tag in self.acceptable_elements:
2747 if tag in self.unacceptable_elements_with_end_tag:
2748 self.unacceptablestack -= 1
2749 if self.mathmlOK and tag in self.mathml_elements:
2750 if tag == 'math' and self.mathmlOK:
2752 elif self.svgOK and tag in self.svg_elements:
2753 tag = self.svg_elem_map.get(tag,tag)
2754 if tag == 'svg' and self.svgOK:
2758 _BaseHTMLProcessor.unknown_endtag(self, tag)
2760 def handle_pi(self, text):
2763 def handle_decl(self, text):
2766 def handle_data(self, text):
2767 if not self.unacceptablestack:
2768 _BaseHTMLProcessor.handle_data(self, text)
2770 def sanitize_style(self, style):
2772 style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
2775 if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
2777 # This replaced a regexp that used re.match and was prone to pathological back-tracking.
2778 if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
2782 for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
2785 if prop.lower() in self.acceptable_css_properties:
2786 clean.append(prop + ': ' + value + ';')
2787 elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
2788 for keyword in value.split():
2789 if not keyword in self.acceptable_css_keywords and \
2790 not self.valid_css_values.match(keyword):
2793 clean.append(prop + ': ' + value + ';')
2794 elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
2795 clean.append(prop + ': ' + value + ';')
2797 return ' '.join(clean)
2799 def parse_comment(self, i, report=1):
2800 ret = _BaseHTMLProcessor.parse_comment(self, i, report)
2803 # if ret == -1, this may be a malicious attempt to circumvent
2804 # sanitization, or a page-destroying unclosed comment
2805 match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
2808 # unclosed comment; deliberately fail to handle_data()
2809 return len(self.rawdata)
def _sanitizeHTML(htmlSource, encoding, _type):
    # Strip unsafe markup from htmlSource, optionally running the result
    # through an installed HTML Tidy interface for cleanup.
    if not _SGML_AVAILABLE:
        return htmlSource
    p = _HTMLSanitizer(encoding, _type)
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                pass
        if _tidy:
            utf8 = isinstance(data, unicode)
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            # keep only the contents of <body>, as tidy emits a full document
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        # The default implementation just raises HTTPError.
        # Forget that.
        fp.status = code
        return fp

    def http_error_301(self, req, fp, code, msg, hdrs):
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
                                                            code, msg, hdrs)
        result.status = code
        result.newurl = result.geturl()
        return result
    # The default implementations in urllib2.HTTPRedirectHandler
    # are identical, so hardcoding a http_error_301 call above
    # won't affect anything
    http_error_300 = http_error_301
    http_error_302 = http_error_301
    http_error_303 = http_error_301
    http_error_307 = http_error_301

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        if base64 is None or 'Authorization' not in req.headers \
                          or 'WWW-Authenticate' not in headers:
            return self.http_error_default(req, fp, code, msg, headers)
        auth = _base64decode(req.headers['Authorization'].split(' ')[1])
        user, passw = auth.split(':')
        realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
        self.add_password(realm, host, user, passw)
        retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
        self.reset_retry_count()
        return retry
2897 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
2898 """URL, filename, or string --> stream
2900 This function lets you define parsers that take any input source
2901 (URL, pathname to local or network file, or actual data as a string)
2902 and deal with it in a uniform manner. Returned object is guaranteed
2903 to have all the basic stdio read methods (read, readline, readlines).
2904 Just .close() the object when you're done with it.
2906 If the etag argument is supplied, it will be used as the value of an
2907 If-None-Match request header.
2909 If the modified argument is supplied, it can be a tuple of 9 integers
2910 (as returned by gmtime() in the standard Python time module) or a date
2911 string in any format supported by feedparser. Regardless, it MUST
2912 be in GMT (Greenwich Mean Time). It will be reformatted into an
2913 RFC 1123-compliant date and used as the value of an If-Modified-Since
2916 If the agent argument is supplied, it will be used as the value of a
2917 User-Agent request header.
2919 If the referrer argument is supplied, it will be used as the value of a
2920 Referer[sic] request header.
2922 If handlers is supplied, it is a list of handlers used to build a
2925 if request_headers is supplied it is a dictionary of HTTP request headers
2926 that will override the values generated by FeedParser.
# Anything with a read() method is already a stream -- return it untouched.
2929 if hasattr(url_file_stream_or_string, 'read'):
2930 return url_file_stream_or_string
# '-' conventionally means stdin (the return line is elided in this listing).
2932 if url_file_stream_or_string == '-':
2935 if isinstance(url_file_stream_or_string, basestring) \
2936 and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
2937 # Deal with the feed URI scheme
2938 if url_file_stream_or_string.startswith('feed:http'):
2939 url_file_stream_or_string = url_file_stream_or_string[5:]
2940 elif url_file_stream_or_string.startswith('feed:'):
2941 url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
2944 # test for inline user:password for basic auth
# Strip any user:password@ from the URL and pre-encode it for a
# Basic Authorization header (passed to _build_urllib2_request below).
2947 urltype, rest = urllib.splittype(url_file_stream_or_string)
2948 realhost, rest = urllib.splithost(rest)
2950 user_passwd, realhost = urllib.splituser(realhost)
2952 url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
2953 auth = base64.standard_b64encode(user_passwd).strip()
# Unicode URLs may contain non-ASCII hostnames; convert to IDN form.
2956 if isinstance(url_file_stream_or_string, unicode):
2957 url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
2959 # try to open with urllib2 (to use optional headers)
2960 request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
2961 opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()]))
2962 opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
# NOTE(review): the try/finally around open/close is elided in this listing.
2964 return opener.open(request)
2966 opener.close() # JohnD
2968 # try to open with native open function (if url_file_stream_or_string is a filename)
2970 return open(url_file_stream_or_string, 'rb')
2974 # treat url_file_stream_or_string as string
2975 if isinstance(url_file_stream_or_string, unicode):
2976 return _StringIO(url_file_stream_or_string.encode('utf-8'))
2977 return _StringIO(url_file_stream_or_string)
2979 def _convert_to_idn(url):
2980 """Convert a URL to IDN notation"""
2981 # this function should only be called with a unicode string
2982 # strategy: if the host cannot be encoded in ascii, then
2983 # it'll be necessary to encode it in idn form
2984 parts = list(urlparse.urlsplit(url))
# If the host is pure ASCII this encode succeeds and the URL is returned
# unchanged (the try:/return path is partially elided in this listing).
2986 parts[1].encode('ascii')
2987 except UnicodeEncodeError:
2988 # the url needs to be converted to idn notation
# rsplit on ':' separates an optional trailing port from the hostname.
2989 host = parts[1].rsplit(':', 1)
# Encode each dotted label separately with the 'idna' codec, then rejoin.
2994 for h in host[0].split('.'):
2995 newhost.append(h.encode('idna').decode('utf-8'))
2996 parts[1] = '.'.join(newhost)
# Re-attach the port if one was present (guard line elided in listing).
2998 parts[1] += ':' + port
2999 return urlparse.urlunsplit(parts)
# Build a urllib2.Request with conditional-GET (ETag / If-Modified-Since),
# identification (User-Agent / Referer), compression, and auth headers.
# NOTE(review): the guard lines (e.g. "if etag:", "if modified:", gzip/zlib
# availability checks) are elided in this listing; each add_header below is
# presumably conditional -- confirm against full source.
3003 def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
3004 request = urllib2.Request(url)
3005 request.add_header('User-Agent', agent)
3007 request.add_header('If-None-Match', etag)
3008 if isinstance(modified, basestring):
3009 modified = _parse_date(modified)
3010 elif isinstance(modified, datetime.datetime):
3011 modified = modified.utctimetuple()
3013 # format into an RFC 1123-compliant timestamp. We can't use
3014 # time.strftime() since the %a and %b directives can be affected
3015 # by the current locale, but RFC 2616 states that dates must be
3017 short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
3018 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
3019 request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
3021 request.add_header('Referer', referrer)
# Advertise whichever decompression schemes are available (branches elided).
3023 request.add_header('Accept-encoding', 'gzip, deflate')
3025 request.add_header('Accept-encoding', 'gzip')
3027 request.add_header('Accept-encoding', 'deflate')
3029 request.add_header('Accept-encoding', '')
# auth is a pre-base64-encoded user:password produced by _open_resource.
3031 request.add_header('Authorization', 'Basic %s' % auth)
3033 request.add_header('Accept', ACCEPT_HEADER)
3034 # use this for whatever -- cookies, special headers, etc
3035 # [('Cookie','Something'),('x-special-header','Another Value')]
# Caller-supplied headers are added last so they override the defaults.
3036 for header_name, header_value in request_headers.items():
3037 request.add_header(header_name, header_value)
3038 request.add_header('A-IM', 'feed') # RFC 3229 support
3042 def registerDateHandler(func):
3043 '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
# insert(0, ...) means the most recently registered handler is tried first
# by _parse_date().
3044 _date_handlers.insert(0, func)
3046 # ISO-8601 date parsing routines written by Fazal Majid.
3047 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
3048 # parser is beyond the scope of feedparser and would be a worthwhile addition
3049 # to the Python library.
3050 # A single regular expression cannot parse ISO 8601 date formats into groups
3051 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
3052 # 0301-04-01), so we use templates instead.
3053 # Please note the order in templates is significant because we need a
3055 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
3056 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
3057 '-YY-?MM', '-OOO', '-YY',
# Each template above is expanded into a named-group regex by replacing the
# placeholder tokens (YYYY/YY/MM/DD/OOO/CC) and appending an optional time
# part. NOTE(review): the head of the _iso8601_re comprehension (source
# lines 3058-3062) is elided in this listing.
3063 'YYYY', r'(?P<year>\d{4})').replace(
3064 'YY', r'(?P<year>\d\d)').replace(
3065 'MM', r'(?P<month>[01]\d)').replace(
3066 'DD', r'(?P<day>[0123]\d)').replace(
3067 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
3068 'CC', r'(?P<century>\d\d$)')
3069 + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
3070 + r'(:(?P<second>\d{2}))?'
3071 + r'(\.(?P<fracsecond>\d+))?'
3072 + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
3073 for tmpl in _iso8601_tmpl]
# Pre-compile and keep only the bound match() methods for speed.
3078 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
3083 def _parse_date_iso8601(dateString):
3084 '''Parse a variety of ISO-8601-compatible formats like 20040105'''
# Try each pre-compiled template matcher in order; the first non-trivial
# match wins (failure/continue lines are elided in this listing).
3086 for _iso8601_match in _iso8601_matches:
3087 m = _iso8601_match(dateString)
3092 if m.span() == (0, 0):
3094 params = m.groupdict()
3095 ordinal = params.get('ordinal', 0)
3097 ordinal = int(ordinal)
3100 year = params.get('year', '--')
3101 if not year or year == '--':
3102 year = time.gmtime()[0]
3103 elif len(year) == 2:
3104 # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
3105 year = 100 * int(time.gmtime()[0] / 100) + int(year)
3108 month = params.get('month', '-')
3109 if not month or month == '-':
3110 # ordinals are NOT normalized by mktime, we simulate them
3111 # by setting month=1, day=ordinal
3115 month = time.gmtime()[1]
3117 day = params.get('day', 0)
# Missing day: either this is an ordinal date (day=ordinal) or we default
# to today's day-of-month (intervening branches elided in this listing).
3122 elif params.get('century', 0) or \
3123 params.get('year', 0) or params.get('month', 0):
3126 day = time.gmtime()[2]
3129 # special case of the century - is the first year of the 21st century
3130 # 2000 or 2001 ? The debate goes on...
3131 if 'century' in params.keys():
3132 year = (int(params['century']) - 1) * 100 + 1
3133 # in ISO 8601 most fields are optional
# Normalize absent optional fields to 0 (the assignment line is elided).
3134 for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
3135 if not params.get(field, None):
3137 hour = int(params.get('hour', 0))
3138 minute = int(params.get('minute', 0))
3139 second = int(float(params.get('second', 0)))
3140 # weekday is normalized by mktime(), we can ignore it
3142 daylight_savings_flag = -1
3143 tm = [year, month, day, hour, minute, second, weekday,
3144 ordinal, daylight_savings_flag]
3145 # ISO 8601 time zone adjustments
# Shift hours/minutes by the offset; the sign test between the two pairs
# of adjustments is elided in this listing.
3146 tz = params.get('tz')
3147 if tz and tz != 'Z':
3149 tm[3] += int(params.get('tzhour', 0))
3150 tm[4] += int(params.get('tzmin', 0))
3152 tm[3] -= int(params.get('tzhour', 0))
3153 tm[4] -= int(params.get('tzmin', 0))
3156 # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
3157 # which is guaranteed to normalize d/m/y/h/m/s.
3158 # Many implementations have bugs, but we'll pretend they don't.
3159 return time.localtime(time.mktime(tuple(tm)))
3160 registerDateHandler(_parse_date_iso8601)
3162 # 8-bit date handling routines written by ytrewq1.
# Korean date-marker characters and AM/PM words used by the OnBlog and
# Nate date regexes below.
3163 _korean_year = u'\ub144' # b3e2 in euc-kr
3164 _korean_month = u'\uc6d4' # bff9 in euc-kr
3165 _korean_day = u'\uc77c' # c0cf in euc-kr
3166 _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
3167 _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
# OnBlog: "YYYY<year> MM<month> DD<day> HH:MM:SS"
3169 _korean_onblog_date_re = \
3170 re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
3171 (_korean_year, _korean_month, _korean_day))
# Nate: "YYYY-MM-DD <AM|PM> H:M:S" (12-hour clock, fields may be 1-2 digits)
3172 _korean_nate_date_re = \
3173 re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
3174 (_korean_am, _korean_pm))
3175 def _parse_date_onblog(dateString):
3176 '''Parse a string according to the OnBlog 8-bit date format'''
3177 m = _korean_onblog_date_re.match(dateString)
# (The "if not m: return" guard is elided in this listing.)
# Rebuild as a W3DTF string with the fixed Korean offset +09:00 and
# delegate to _parse_date_w3dtf.
3180 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3181 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3182 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
3183 'zonediff': '+09:00'}
3184 return _parse_date_w3dtf(w3dtfdate)
3185 registerDateHandler(_parse_date_onblog)
3187 def _parse_date_nate(dateString):
3188 '''Parse a string according to the Nate 8-bit date format'''
3189 m = _korean_nate_date_re.match(dateString)
# Convert the 12-hour clock to 24-hour: add 12 when the AM/PM marker is
# the Korean PM word (the +12 and zero-padding lines are elided here).
3192 hour = int(m.group(5))
3194 if (ampm == _korean_pm):
# Reformat as W3DTF with the fixed +09:00 offset and delegate.
3199 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3200 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3201 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
3202 'zonediff': '+09:00'}
3203 return _parse_date_w3dtf(w3dtfdate)
3204 registerDateHandler(_parse_date_nate)
# MS SQL datetime format: "YYYY-MM-DD HH:MM:SS[.fff]" (the _mssql_date_re
# assignment line is elided in this listing; this is its re.compile value).
3207 re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
3208 def _parse_date_mssql(dateString):
3209 '''Parse a string according to the MS SQL date format'''
3210 m = _mssql_date_re.match(dateString)
# Reformat as W3DTF; note the hardcoded +09:00 offset (same as the Korean
# handlers above -- this handler was evidently written for Korean feeds).
3213 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3214 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3215 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
3216 'zonediff': '+09:00'}
3217 return _parse_date_w3dtf(w3dtfdate)
3218 registerDateHandler(_parse_date_mssql)
3220 # Unicode strings for Greek date strings
# Map of Greek month abbreviations (several spelling variants) to English
# three-letter month names, for conversion to RFC 822 dates.
3223 u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
3224 u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
3225 u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
3226 u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
3227 u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
3228 u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
3229 u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
3230 u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
3231 u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
3232 u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
3233 u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
3234 u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
3235 u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
3236 u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
3237 u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
3238 u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
3239 u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
3240 u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
3241 u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
# Map of Greek weekday abbreviations to English three-letter day names
# (the dict-open lines for both maps are elided in this listing).
3246 u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
3247 u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
3248 u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
3249 u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
3250 u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
3251 u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
3252 u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
# RFC-822-like Greek date: "<wday>, DD <month> YYYY HH:MM:SS <tz>"
3255 _greek_date_format_re = \
3256 re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
3258 def _parse_date_greek(dateString):
3259 '''Parse a string according to a Greek 8-bit date format.'''
3260 m = _greek_date_format_re.match(dateString)
# Translate the Greek weekday/month names to English and hand the
# resulting RFC 822 string to _parse_date_rfc822. A KeyError from an
# unknown name is caught by _parse_date()'s handler loop.
3263 wday = _greek_wdays[m.group(1)]
3264 month = _greek_months[m.group(3)]
3265 rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
3266 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
3267 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
3268 'zonediff': m.group(8)}
3269 return _parse_date_rfc822(rfc822date)
3270 registerDateHandler(_parse_date_greek)
3272 # Unicode strings for Hungarian date strings
# Map of Hungarian month names to zero-padded month numbers.
3273 _hungarian_months = \
3275 u'janu\u00e1r': u'01', # e1 in iso-8859-2
3276 u'febru\u00e1ri': u'02', # e1 in iso-8859-2
3277 u'm\u00e1rcius': u'03', # e1 in iso-8859-2
3278 u'\u00e1prilis': u'04', # e1 in iso-8859-2
3279 u'm\u00e1ujus': u'05', # e1 in iso-8859-2
3280 u'j\u00fanius': u'06', # fa in iso-8859-2
3281 u'j\u00falius': u'07', # fa in iso-8859-2
3282 u'augusztus': u'08',
3283 u'szeptember': u'09',
3284 u'okt\u00f3ber': u'10', # f3 in iso-8859-2
# (November/December entries and the closing brace are elided here.)
# Hungarian date: "YYYY-<monthname>-DD?THH?:MM(+|-)HH?:MM"
3289 _hungarian_date_format_re = \
3290 re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
3292 def _parse_date_hungarian(dateString):
3293 '''Parse a string according to a Hungarian 8-bit date format.'''
3294 m = _hungarian_date_format_re.match(dateString)
3295 if not m or m.group(2) not in _hungarian_months:
3297 month = _hungarian_months[m.group(2)]
# (Zero-padding of one-digit day/hour is elided in this listing.)
# Reformat as W3DTF, preserving the original zone offset, and delegate.
3304 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
3305 {'year': m.group(1), 'month': month, 'day': day,\
3306 'hour': hour, 'minute': m.group(5),\
3307 'zonediff': m.group(6)}
3308 return _parse_date_w3dtf(w3dtfdate)
3309 registerDateHandler(_parse_date_hungarian)
3311 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
3312 # Drake and licensed under the Python license. Removed all range checking
3313 # for month, day, hour, minute, and second, since mktime will normalize
3315 def _parse_date_w3dtf(dateString):
# Extract (year, month, day), handling 2-digit years and ordinal
# ("julian") day-of-year dates.
3316 def __extract_date(m):
3317 year = int(m.group('year'))
3319 year = 100 * int(time.gmtime()[0] / 100) + int(year)
3322 julian = m.group('julian')
3324 julian = int(julian)
# Start from an approximate month/day, then iterate using mktime/gmtime
# to converge on the month/day whose day-of-year equals the ordinal
# (the adjustment steps inside the loop are elided in this listing).
3325 month = julian / 30 + 1
3326 day = julian % 30 + 1
3328 while jday != julian:
3329 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
3330 jday = time.gmtime(t)[-2]
3331 diff = abs(jday - julian)
3343 return year, month, day
3344 month = m.group('month')
3350 day = m.group('day')
3355 return year, month, day
# Extract (hours, minutes, seconds); absent components default
# (the default-assignment lines are elided in this listing).
3357 def __extract_time(m):
3360 hours = m.group('hours')
3364 minutes = int(m.group('minutes'))
3365 seconds = m.group('seconds')
3367 seconds = int(seconds)
3370 return hours, minutes, seconds
3372 def __extract_tzd(m):
3373 '''Return the Time Zone Designator as an offset in seconds from UTC.'''
3376 tzd = m.group('tzd')
3381 hours = int(m.group('tzdhours'))
3382 minutes = m.group('tzdminutes')
3384 minutes = int(minutes)
3387 offset = (hours*60 + minutes) * 60
# Build the full datetime regex from date, time, and TZ sub-patterns.
3392 __date_re = ('(?P<year>\d\d\d\d)'
3394 '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
3395 '|(?P<julian>\d\d\d)))?')
3396 __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
3397 __tzd_rx = re.compile(__tzd_re)
3398 __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
3399 '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
3401 __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
3402 __datetime_rx = re.compile(__datetime_re)
3403 m = __datetime_rx.match(dateString)
# Require the match to cover the entire input string.
3404 if (m is None) or (m.group() != dateString):
3406 gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
# Apply the TZ offset and compensate for mktime's local-time assumption.
3409 return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
3410 registerDateHandler(_parse_date_w3dtf)
3412 def _parse_date_rfc822(dateString):
3413 '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
3414 data = dateString.split()
# Normalization passes before handing off to rfc822.parsedate_tz
# (several intermediate lines are elided in this listing).
3417 if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
3423 data[3:] = [s[:i], s[i+1:]]
3426 dateString = " ".join(data)
3427 # Account for the Etc/GMT timezone by stripping 'Etc/'
3428 elif len(data) == 5 and data[4].lower().startswith('etc/'):
3429 data[4] = data[4][4:]
3430 dateString = " ".join(data)
# Date-only input: assume midnight GMT (the guard line is elided).
3432 dateString += ' 00:00:00 GMT'
3433 tm = rfc822.parsedate_tz(dateString)
3435 # Jython doesn't adjust for 2-digit years like CPython does,
3436 # so account for it by shifting the year so that it's in the
3437 # range 1970-2069 (1970 being the year of the Unix epoch).
3439 tm = (tm[0] + (1900, 2000)[tm[0] < 70],) + tm[1:]
3440 return time.gmtime(rfc822.mktime_tz(tm))
3441 # rfc822.py defines several time zones, but we define some extra ones.
3442 # 'ET' is equivalent to 'EST', etc.
# Offsets are in rfc822's convention (hours*100 relative to UTC).
3443 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
3444 rfc822._timezones.update(_additional_timezones)
3445 registerDateHandler(_parse_date_rfc822)
3447 def _parse_date_perforce(aDateString):
3448 """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
3449 # Fri, 2006/09/15 08:19:53 EDT
3450 _my_date_pattern = re.compile( \
3451 r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
3453 m = _my_date_pattern.search(aDateString)
# Rearrange the Perforce fields into an RFC 822 date string and let
# rfc822.parsedate_tz do the heavy lifting (guard line elided here).
3456 dow, year, month, day, hour, minute, second, tz = m.groups()
3457 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
3458 dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
3459 tm = rfc822.parsedate_tz(dateString)
3461 return time.gmtime(rfc822.mktime_tz(tm))
3462 registerDateHandler(_parse_date_perforce)
3464 def _parse_date(dateString):
3465 '''Parses a variety of date formats into a 9-tuple in GMT'''
# Try each registered handler (most recently registered first); a handler
# that raises KeyError/OverflowError/ValueError is skipped. Success and
# fall-through return lines are elided in this listing.
3468 for handler in _date_handlers:
3470 date9tuple = handler(dateString)
3471 except (KeyError, OverflowError, ValueError):
# Defensive: reject handlers that return something other than a 9-tuple.
3475 if len(date9tuple) != 9:
3480 def _getCharacterEncoding(http_headers, xml_data):
3481 '''Get the character encoding of the XML document
3483 http_headers is a dictionary
3484 xml_data is a raw string (not Unicode)
3486 This is so much trickier than it sounds, it's not even funny.
3487 According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
3488 is application/xml, application/*+xml,
3489 application/xml-external-parsed-entity, or application/xml-dtd,
3490 the encoding given in the charset parameter of the HTTP Content-Type
3491 takes precedence over the encoding given in the XML prefix within the
3492 document, and defaults to 'utf-8' if neither are specified. But, if
3493 the HTTP Content-Type is text/xml, text/*+xml, or
3494 text/xml-external-parsed-entity, the encoding given in the XML prefix
3495 within the document is ALWAYS IGNORED and only the encoding given in
3496 the charset parameter of the HTTP Content-Type header should be
3497 respected, and it defaults to 'us-ascii' if not specified.
3499 Furthermore, discussion on the atom-syntax mailing list with the
3500 author of RFC 3023 leads me to the conclusion that any document
3501 served with a Content-Type of text/* and no charset parameter
3502 must be treated as us-ascii. (We now do this.) And also that it
3503 must always be flagged as non-well-formed. (We now do this too.)
3505 If Content-Type is unspecified (input was local file or non-HTTP source)
3506 or unrecognized (server just got it totally wrong), then go by the
3507 encoding given in the XML prefix of the document and default to
3508 'iso-8859-1' as per the HTTP specification (RFC 2616).
3510 Then, assuming we didn't find a character encoding in the HTTP headers
3511 (and the HTTP Content-type allowed us to look in the body), we need
3512 to sniff the first few bytes of the XML data and try to determine
3513 whether the encoding is ASCII-compatible. Section F of the XML
3514 specification shows the way here:
3515 http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3517 If the sniffed encoding is not ASCII-compatible, we need to make it
3518 ASCII compatible so that we can sniff further into the XML declaration
3519 to find the encoding attribute, which will tell us the true encoding.
3521 Of course, none of this guarantees that we will be able to parse the
3522 feed in the declared character encoding (assuming it was declared
3523 correctly, which many are not). CJKCodecs and iconv_codec help a lot;
3524 you should definitely install them if you can.
3525 http://cjkpython.i18n.org/
# Nested helper: split a Content-Type header into (type, charset).
3528 def _parseHTTPContentType(content_type):
3529 '''takes HTTP Content-Type header and returns (content type, charset)
3531 If no charset is specified, returns (content type, '')
3532 If no content type is specified, returns ('', '')
3533 Both return parameters are guaranteed to be lowercase strings
3535 content_type = content_type or ''
3536 content_type, params = cgi.parse_header(content_type)
3537 charset = params.get('charset', '').replace("'", "")
3538 if not isinstance(charset, unicode):
3539 charset = charset.decode('utf-8', 'ignore')
3540 return content_type, charset
3542 sniffed_xml_encoding = u''
3545 http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
3546 # Must sniff for non-ASCII-compatible character encodings before
3547 # searching for XML declaration. This heuristic is defined in
3548 # section F of the XML specification:
3549 # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
# BOM / byte-pattern sniffing table. Each branch records the sniffed
# encoding and re-encodes xml_data to UTF-8 so the XML declaration can be
# searched with an ASCII-compatible regex below.
3551 if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
# EBCDIC "<?xm" signature.
3553 xml_data = _ebcdic_to_ascii(xml_data)
3554 elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
# UTF-16BE without a BOM ("<?" in big-endian 16-bit units).
3556 sniffed_xml_encoding = u'utf-16be'
3557 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
3558 elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
# UTF-16BE BOM (and not a UTF-32BE BOM); strip the 2-byte BOM.
3560 sniffed_xml_encoding = u'utf-16be'
3561 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
3562 elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
# UTF-16LE without a BOM.
3564 sniffed_xml_encoding = u'utf-16le'
3565 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
3566 elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
# UTF-16LE BOM (and not UTF-32LE); strip the 2-byte BOM.
3568 sniffed_xml_encoding = u'utf-16le'
3569 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
3570 elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
# UTF-32BE without a BOM.
3572 sniffed_xml_encoding = u'utf-32be'
3573 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
3574 elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
# UTF-32LE without a BOM.
3576 sniffed_xml_encoding = u'utf-32le'
3577 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
3578 elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
# UTF-32BE BOM; strip the 4-byte BOM.
3580 sniffed_xml_encoding = u'utf-32be'
3581 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
3582 elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
# UTF-32LE BOM; strip the 4-byte BOM.
3584 sniffed_xml_encoding = u'utf-32le'
3585 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
3586 elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
# UTF-8 BOM; strip the 3-byte BOM.
3588 sniffed_xml_encoding = u'utf-8'
3589 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
# Look for encoding="..." in the XML declaration (try/else lines elided).
3593 xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
3594 except UnicodeDecodeError:
3595 xml_encoding_match = None
3596 if xml_encoding_match:
3597 xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
# A declared multi-byte family name (utf-16, ucs-2, ...) is less specific
# than what we sniffed, so prefer the sniffed endianness-specific value.
3598 if sniffed_xml_encoding and (xml_encoding in (u'iso-10646-ucs-2', u'ucs-2', u'csunicode', u'iso-10646-ucs-4', u'ucs-4', u'csucs4', u'utf-16', u'utf-32', u'utf_16', u'utf_32', u'utf16', u'u16')):
3599 xml_encoding = sniffed_xml_encoding
# Apply the RFC 3023 precedence rules described in the docstring.
3600 acceptable_content_type = 0
3601 application_content_types = (u'application/xml', u'application/xml-dtd', u'application/xml-external-parsed-entity')
3602 text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
3603 if (http_content_type in application_content_types) or \
3604 (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')):
3605 acceptable_content_type = 1
3606 true_encoding = http_encoding or xml_encoding or u'utf-8'
3607 elif (http_content_type in text_content_types) or \
3608 (http_content_type.startswith(u'text/')) and http_content_type.endswith(u'+xml'):
3609 acceptable_content_type = 1
3610 true_encoding = http_encoding or u'us-ascii'
3611 elif http_content_type.startswith(u'text/'):
3612 true_encoding = http_encoding or u'us-ascii'
3613 elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
3614 true_encoding = xml_encoding or u'iso-8859-1'
3616 true_encoding = xml_encoding or u'utf-8'
3617 # some feeds claim to be gb2312 but are actually gb18030.
3618 # apparently MSIE and Firefox both do the following switch:
3619 if true_encoding.lower() == u'gb2312':
3620 true_encoding = u'gb18030'
3621 return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
3623 def _toUTF8(data, encoding):
3624 '''Changes an XML data stream on the fly to specify a new encoding
3626 data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
3627 encoding is a string recognized by encodings.aliases
3629 # strip Byte Order Mark (if present)
# When a BOM is found, trust it over the caller-supplied encoding
# (the data-slicing lines that drop each BOM are elided in this listing).
3630 if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
3631 encoding = 'utf-16be'
3633 elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
3634 encoding = 'utf-16le'
3636 elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
3639 elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
3640 encoding = 'utf-32be'
3642 elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
3643 encoding = 'utf-32le'
# Decode from the source encoding, then rewrite (or prepend) the XML
# declaration so it correctly declares utf-8, and re-encode.
3645 newdata = unicode(data, encoding)
3646 declmatch = re.compile('^<\?xml[^>]*?>')
3647 newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
3648 if declmatch.search(newdata):
3649 newdata = declmatch.sub(newdecl, newdata)
3651 newdata = newdecl + u'\n' + newdata
3652 return newdata.encode('utf-8')
3654 def _stripDoctype(data):
3655 '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3657 rss_version may be 'rss091n' or None
3658 stripped_data is the same XML document, minus the DOCTYPE
# Split the document into a prolog ("head", everything before the first
# element) and the rest, so DOCTYPE/ENTITY matching only touches the prolog.
3660 start = re.search(_s2bytes('<\w'), data)
3661 start = start and start.start() or -1
3662 head,data = data[:start+1], data[start+1:]
3664 entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
3665 entity_results=entity_pattern.findall(head)
3666 head = entity_pattern.sub(_s2bytes(''), head)
3667 doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
3668 doctype_results = doctype_pattern.findall(head)
3669 doctype = doctype_results and doctype_results[0] or _s2bytes('')
# A DOCTYPE mentioning "netscape" marks the Netscape flavor of RSS 0.91.
3670 if doctype.lower().count(_s2bytes('netscape')):
3671 version = u'rss091n'
3675 # only allow in 'safe' inline entity definitions
# Re-insert only entity definitions whose replacement text is a character
# reference or contains no '&'/'"' -- guards against entity-expansion
# attacks via hostile DOCTYPEs.
3676 replacement=_s2bytes('')
3677 if len(doctype_results)==1 and entity_results:
3678 safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
3679 safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
3681 replacement=_s2bytes('<!DOCTYPE feed [\n <!ENTITY') + _s2bytes('>\n <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
3682 data = doctype_pattern.sub(replacement, head) + data
# Third return value: dict of the safe entity name -> value pairs.
3684 return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
# NOTE(review): parse() continues past the end of this listing -- the
# annotations below cover only the visible prefix; nothing is inferred
# about the missing tail.
3686 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
3687 '''Parse a feed from a URL, file, stream, or string.
3689 request_headers, if given, is a dict from http header name to value to add
3690 to the request; this overrides internally generated values.
# Normalize mutable-default arguments (None -> fresh containers).
3693 if handlers is None:
3695 if request_headers is None:
3696 request_headers = {}
3697 if response_headers is None:
3698 response_headers = {}
3700 result = FeedParserDict()
3701 result['feed'] = FeedParserDict()
3702 result['entries'] = []
3704 if not isinstance(handlers, list):
3705 handlers = [handlers]
# Open the input; on failure record the exception as a bozo condition
# instead of raising (try/bozo-flag lines are elided in this listing).
3707 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
3709 except Exception, e:
3711 result['bozo_exception'] = e
3715 if hasattr(f, 'headers'):
3716 result['headers'] = dict(f.headers)
3717 # overwrite existing headers using response_headers
3718 if 'headers' in result:
3719 result['headers'].update(response_headers)
3720 elif response_headers:
3721 result['headers'] = copy.deepcopy(response_headers)
3723 # if feed is gzip-compressed, decompress it
3724 if f and data and 'headers' in result:
3725 if gzip and 'gzip' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
3727 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
3728 except (IOError, struct.error), e:
3729 # IOError can occur if the gzip header is bad
3730 # struct.error can occur if the data is damaged
3731 # Some feeds claim to be gzipped but they're not, so
3732 # we get garbage. Ideally, we should re-request the
3733 # feed without the 'Accept-encoding: gzip' header,
3736 result['bozo_exception'] = e
3738 elif zlib and 'deflate' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
3740 data = zlib.decompress(data)
3741 except zlib.error, e:
3743 result['bozo_exception'] = e
3747 if 'headers' in result:
# Surface validator metadata (etag / last-modified) on the result dict.
3748 if 'etag' in result['headers'] or 'ETag' in result['headers']:
3749 etag = result['headers'].get('etag', result['headers'].get('ETag', u''))
3750 if not isinstance(etag, unicode):
3751 etag = etag.decode('utf-8', 'ignore')
3753 result['etag'] = etag
3754 if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
3755 modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
3757 result['modified'] = _parse_date(modified)
3758 if hasattr(f, 'url'):
3759 if not isinstance(f.url, unicode):
3760 result['href'] = f.url.decode('utf-8', 'ignore')
3762 result['href'] = f.url
3763 result['status'] = 200
3764 if hasattr(f, 'status'):
3765 result['status'] = f.status
3766 if hasattr(f, 'close'):
3772 # there are four encodings to keep track of:
3773 # - http_encoding is the encoding declared in the Content-Type HTTP header
3774 # - xml_encoding is the encoding declared in the <?xml declaration
3775 # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
3776 # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
3777 http_headers = result.get('headers', {})
3778 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
3779 _getCharacterEncoding(http_headers, data)
3780 if http_headers and (not acceptable_content_type):
3781 if http_headers.has_key('content-type') or http_headers.has_key('Content-type'):
3782 bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type'))
3784 bozo_message = 'no Content-type specified'
3786 result['bozo_exception'] = NonXMLContentType(bozo_message)
3788 if data is not None:
3789 result['version'], data, entities = _stripDoctype(data)
3791 # ensure that baseuri is an absolute uri using an acceptable URI scheme
3792 contentloc = http_headers.get('content-location', http_headers.get('Content-Location', u''))
3793 href = result.get('href', u'')
3794 baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
3796 baselang = http_headers.get('content-language', http_headers.get('Content-Language', None))
3797 if not isinstance(baselang, unicode) and baselang is not None:
3798 baselang = baselang.decode('utf-8', 'ignore')
3800 # if server sent 304, we're done
3801 if result.get('status', 0) == 304:
3802 result['version'] = u''
3803 result['debug_message'] = 'The feed has not changed since you last checked, ' + \
3804 'so the server sent no data. This is a feature, not a bug!'
3807 # if there was a problem downloading, we're done
3811 # determine character encoding
# Try candidate encodings in priority order until one round-trips through
# _toUTF8 without error; track attempts to avoid retrying duplicates.
3812 use_strict_parser = 0
3814 tried_encodings = []
3815 # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
3816 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
3817 if not proposed_encoding:
3819 if proposed_encoding in tried_encodings:
3821 tried_encodings.append(proposed_encoding)
3823 data = _toUTF8(data, proposed_encoding)
3824 except (UnicodeDecodeError, LookupError):
3827 known_encoding = use_strict_parser = 1
3829 # if no luck and we have auto-detection library, try that
3830 if (not known_encoding) and chardet:
3831 proposed_encoding = chardet.detect(data)['encoding']
3832 if proposed_encoding and (proposed_encoding not in tried_encodings):
3833 tried_encodings.append(proposed_encoding)
3835 data = _toUTF8(data, proposed_encoding)
3836 except (UnicodeDecodeError, LookupError):
3839 known_encoding = use_strict_parser = 1
3840 # if still no luck and we haven't tried utf-8 yet, try that
3841 if (not known_encoding) and (u'utf-8' not in tried_encodings):
3842 proposed_encoding = u'utf-8'
3843 tried_encodings.append(proposed_encoding)
3845 data = _toUTF8(data, proposed_encoding)
3846 except UnicodeDecodeError:
3849 known_encoding = use_strict_parser = 1
3850 # if still no luck and we haven't tried windows-1252 yet, try that
3851 if (not known_encoding) and (u'windows-1252' not in tried_encodings):
3852 proposed_encoding = u'windows-1252'
3853 tried_encodings.append(proposed_encoding)
3855 data = _toUTF8(data, proposed_encoding)
3856 except UnicodeDecodeError:
3859 known_encoding = use_strict_parser = 1
3860 # if still no luck and we haven't tried iso-8859-2 yet, try that.
3861 if (not known_encoding) and (u'iso-8859-2' not in tried_encodings):
3862 proposed_encoding = u'iso-8859-2'
3863 tried_encodings.append(proposed_encoding)
3865 data = _toUTF8(data, proposed_encoding)
3866 except UnicodeDecodeError:
3869 known_encoding = use_strict_parser = 1
3870 # if still no luck, give up
3871 if not known_encoding:
3873 result['bozo_exception'] = CharacterEncodingUnknown( \
3874 'document encoding unknown, I tried ' + \
3875 '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
3876 (result['encoding'], xml_encoding))
3877 result['encoding'] = u''
3878 elif proposed_encoding != result['encoding']:
3880 result['bozo_exception'] = CharacterEncodingOverride( \
3881 'document declared as %s, but parsed as %s' % \
3882 (result['encoding'], proposed_encoding))
3883 result['encoding'] = proposed_encoding
3885 if not _XML_AVAILABLE:
3886 use_strict_parser = 0
3887 if use_strict_parser:
3888 # initialize the SAX parser
3889 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
3890 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
3891 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
3893 # disable downloading external doctype references, if possible
3894 saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
3895 except xml.sax.SAXNotSupportedException:
3897 saxparser.setContentHandler(feedparser)
3898 saxparser.setErrorHandler(feedparser)
3899 source = xml.sax.xmlreader.InputSource()
3900 source.setByteStream(_StringIO(data))
3901 if hasattr(saxparser, '_ns_stack'):
3902 # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
3903 # PyXML doesn't have this problem, and it doesn't have _ns_stack either
3904 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
3906 saxparser.parse(source)
3907 except xml.sax.SAXParseException, e:
3909 result['bozo_exception'] = feedparser.exc or e
3910 use_strict_parser = 0
3911 if not use_strict_parser and _SGML_AVAILABLE:
3912 feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
3913 feedparser.feed(data.decode('utf-8', 'replace'))
3914 result['feed'] = feedparser.feeddata
3915 result['entries'] = feedparser.entries
3916 result['version'] = result['version'] or feedparser.version
3917 result['namespaces'] = feedparser.namespacesInUse