├── README
├── app.yaml
├── extractlinks.py
├── feedparser.py
├── index.yaml
├── main.py
└── templates
    └── index.html

/README:
--------------------------------------------------------------------------------
A tool that extracts and caches feed URLs (for 1 week) for any webpage.
Please do not abuse it; fork it and run it under your own Google App Engine account instead.

--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
runtime: python27
api_version: 1
threadsafe: yes

handlers:
- url: .*
  script: main.app

libraries:
- name: webapp2
  version: "2.5.2"
--------------------------------------------------------------------------------
/extractlinks.py:
--------------------------------------------------------------------------------
from sgmllib import SGMLParser
from urlparse import urlparse
from urlparse import urljoin

import logging


class LinkExtractor(SGMLParser):
    """A simple LinkExtractor class"""

    def set_base_url(self, base_url=None):
        self.base_url = base_url

    def make_absolute_and_add(self, dict_feed=None):
        if 'href' in dict_feed:
            p = urlparse(dict_feed['href'])
            if p.scheme != "":
                self.links.append(dict_feed)
            else:
                dict_feed['href'] = urljoin(self.base_url, dict_feed['href'])
                self.links.append(dict_feed)

    def reset(self):
        SGMLParser.reset(self)
        self.links = []

    def start_link(self, attrs):
        if ('type', 'application/rss+xml') in attrs:
            self.make_absolute_and_add(dict(attrs))
        if ('type', 'application/atom+xml') in attrs:
            self.make_absolute_and_add(dict(attrs))
--------------------------------------------------------------------------------
/feedparser.py:
--------------------------------------------------------------------------------
"""Universal feed parser

Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds

Visit https://code.google.com/p/feedparser/ for the latest version
Visit http://packages.python.org/feedparser/ for the latest documentation

Required: Python 2.4 or later
Recommended: iconv_codec
"""

__version__ = "5.1.3"
__license__ = """
Copyright (c) 2010-2013 Kurt McKee
Copyright (c) 2002-2008 Mark Pilgrim
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <mark@diveintomark.org>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <me@aaronsw.com>",
                    "Kevin Marks <http://epeus.blogspot.com/>",
                    "Sam Ruby <http://intertwingly.net/>",
                    "Ade Oshineye <http://blog.oshineye.com/>",
                    "Martin Pool <http://sourcefrog.net/>",
                    "Kurt McKee <http://kurtmckee.org/>",
                    "Bernd Schlapsi <brot@gmx.info>",]

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1

# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1

# ---------- Python 3 modules (make it work if possible) ----------
try:
    import rfc822
except ImportError:
    from email import _parseaddr as rfc822

try:
    # Python 3.1 introduces bytes.maketrans and simultaneously
    # deprecates string.maketrans; use bytes.maketrans if possible
    _maketrans = bytes.maketrans
except (NameError, AttributeError):
    import string
    _maketrans = string.maketrans

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except ImportError:
    base64 = binascii = None
else:
    # Python 3.1 deprecates decodestring in favor of decodebytes
    _base64decode = getattr(base64, 'decodebytes', base64.decodestring)

# _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3
# _l2bytes: convert a list of ints to bytes if the interpreter is Python 3
try:
    if bytes is str:
        # In Python 2.5 and below, bytes doesn't exist (NameError)
        # In Python 2.6 and above, bytes and str are the same type
        raise NameError
except NameError:
    # Python 2
    def _s2bytes(s):
        return s
    def _l2bytes(l):
        return ''.join(map(chr, l))
else:
    # Python 3
    def _s2bytes(s):
        return bytes(s, 'utf8')
    def _l2bytes(l):
        return bytes(l)

# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
    'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
)
#ACCEPTABLE_URI_SCHEMES = ()

# ---------- required modules (should come with any Python distribution) ----------
import cgi
import codecs
import copy
import datetime
import itertools
import re
import struct
import time
import types
import urllib
import urllib2
import urlparse
import warnings

from htmlentitydefs import name2codepoint, codepoint2name, entitydefs

try:
    from io import BytesIO as _StringIO
except ImportError:
    try:
        from cStringIO import StringIO as _StringIO
    except ImportError:
        from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except ImportError:
    gzip = None
try:
    import zlib
except ImportError:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it. feedparser has
# been tested with the built-in SAX parser and libxml2. On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    from xml.sax.saxutils import escape as _xmlescape
except ImportError:
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        for char, entity in entities:
            data = data.replace(char, entity)
        return data
else:
    try:
        xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    except xml.sax.SAXReaderNotAvailable:
        _XML_AVAILABLE = 0
    else:
        _XML_AVAILABLE = 1

# sgmllib is not available by default in Python 3; if the end user doesn't have
# it available then we'll lose illformed XML parsing and content sanitizing
try:
    import sgmllib
except ImportError:
    # This is probably Python 3, which doesn't include sgmllib anymore
    _SGML_AVAILABLE = 0

    # Mock sgmllib enough to allow subclassing later on
    class sgmllib(object):
        class SGMLParser(object):
            def goahead(self, i):
                pass
            def parse_starttag(self, i):
                pass
else:
    _SGML_AVAILABLE = 1

    # sgmllib defines a number of module-level regular expressions that are
    # insufficient for the XML parsing feedparser needs. Rather than modify
    # the variables directly in sgmllib, they're defined here using the same
    # names, and the compiled code objects of several sgmllib.SGMLParser
    # methods are copied into _BaseHTMLProcessor so that they execute in
    # feedparser's scope instead of sgmllib's scope.
    charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
    tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
    attrfind = re.compile(
        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
    )

    # Unfortunately, these must be copied over to prevent NameError exceptions
    entityref = sgmllib.entityref
    incomplete = sgmllib.incomplete
    interesting = sgmllib.interesting
    shorttag = sgmllib.shorttag
    shorttagopen = sgmllib.shorttagopen
    starttagopen = sgmllib.starttagopen

    class _EndBracketRegEx:
        def __init__(self):
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self, target, index=0):
            match = self.endbracket.match(target, index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
            return None
    class EndBracketMatch:
        def __init__(self, match):
            self.match = match
        def start(self, n):
            return self.match.end(n)
    endbracket = _EndBracketRegEx()


# iconv_codec provides support for more character encodings.
# It's available from http://cjkpython.i18n.org/
try:
    import iconv_codec
except ImportError:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
except ImportError:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

SUPPORTED_VERSIONS = {'': u'unknown',
                      'rss090': u'RSS 0.90',
                      'rss091n': u'RSS 0.91 (Netscape)',
                      'rss091u': u'RSS 0.91 (Userland)',
                      'rss092': u'RSS 0.92',
                      'rss093': u'RSS 0.93',
                      'rss094': u'RSS 0.94',
                      'rss20': u'RSS 2.0',
                      'rss10': u'RSS 1.0',
                      'rss': u'RSS (unknown version)',
                      'atom01': u'Atom 0.1',
                      'atom02': u'Atom 0.2',
                      'atom03': u'Atom 0.3',
                      'atom10': u'Atom 1.0',
                      'atom': u'Atom (unknown version)',
                      'cdf': u'CDF',
                      }

class FeedParserDict(dict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'description_detail': ['summary_detail', 'subtitle_detail'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        if key == 'category':
            try:
                return dict.__getitem__(self, 'tags')[0]['term']
            except IndexError:
                raise KeyError, "object doesn't have key 'category'"
        elif key == 'enclosures':
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
        elif key == 'license':
            for link in dict.__getitem__(self, 'links'):
                if link['rel']==u'license' and 'href' in link:
                    return link['href']
        elif key == 'updated':
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            if not dict.__contains__(self, 'updated') and \
                dict.__contains__(self, 'published'):
                warnings.warn("To avoid breaking existing software while "
                    "fixing issue 310, a temporary mapping has been created "
                    "from `updated` to `published` if `updated` doesn't "
                    "exist. This fallback will be removed in a future version "
                    "of feedparser.", DeprecationWarning)
                return dict.__getitem__(self, 'published')
            return dict.__getitem__(self, 'updated')
        elif key == 'updated_parsed':
            if not dict.__contains__(self, 'updated_parsed') and \
                dict.__contains__(self, 'published_parsed'):
                warnings.warn("To avoid breaking existing software while "
                    "fixing issue 310, a temporary mapping has been created "
                    "from `updated_parsed` to `published_parsed` if "
                    "`updated_parsed` doesn't exist. This fallback will be "
                    "removed in a future version of feedparser.",
                    DeprecationWarning)
                return dict.__getitem__(self, 'published_parsed')
            return dict.__getitem__(self, 'updated_parsed')
        else:
            realkey = self.keymap.get(key, key)
            if isinstance(realkey, list):
                for k in realkey:
                    if dict.__contains__(self, k):
                        return dict.__getitem__(self, k)
            elif dict.__contains__(self, realkey):
                return dict.__getitem__(self, realkey)
        return dict.__getitem__(self, key)

    def __contains__(self, key):
        if key in ('updated', 'updated_parsed'):
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            return dict.__contains__(self, key)
        try:
            self.__getitem__(key)
        except KeyError:
            return False
        else:
            return True

    has_key = __contains__

    def get(self, key, default=None):
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        key = self.keymap.get(key, key)
        if isinstance(key, list):
            key = key[0]
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value):
        if key not in self:
            self[key] = value
            return value
        return self[key]

    def __getattr__(self, key):
        # __getattribute__() is called first; this will be called
        # only if an attribute was not already found
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError, "object has no attribute '%s'" % key

    def __hash__(self):
        return id(self)

_cp1252 = {
    128: unichr(8364), # euro sign
    130: unichr(8218), # single low-9 quotation mark
    131: unichr( 402), # latin small letter f with hook
    132: unichr(8222), # double low-9 quotation mark
    133: unichr(8230), # horizontal ellipsis
    134: unichr(8224), # dagger
    135: unichr(8225), # double dagger
    136: unichr( 710), # modifier letter circumflex accent
    137: unichr(8240), # per mille sign
    138: unichr( 352), # latin capital letter s with caron
    139: unichr(8249), # single left-pointing angle quotation mark
    140: unichr( 338), # latin capital ligature oe
    142: unichr( 381), # latin capital letter z with caron
    145: unichr(8216), # left single quotation mark
    146: unichr(8217), # right single quotation mark
    147: unichr(8220), # left double quotation mark
    148: unichr(8221), # right double quotation mark
    149: unichr(8226), # bullet
    150: unichr(8211), # en dash
    151: unichr(8212), # em dash
    152: unichr( 732), # small tilde
    153: unichr(8482), # trade mark sign
    154: unichr( 353), # latin small letter s with caron
    155: unichr(8250), # single right-pointing angle quotation mark
    156: unichr( 339), # latin small ligature oe
    158: unichr( 382), # latin small letter z with caron
    159: unichr( 376), # latin capital letter y with diaeresis
}

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    #try:
    if not isinstance(uri, unicode):
        uri = uri.decode('utf-8', 'ignore')
    uri = urlparse.urljoin(base, uri)
    if not isinstance(uri, unicode):
        return uri.decode('utf-8', 'ignore')
    return uri
    #except:
    #    uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
    #    return urlparse.urljoin(base, uri)

class _FeedParserMixin:
    namespaces = {
        '': '',
        'http://backend.userland.com/rss': '',
        'http://blogs.law.harvard.edu/tech/rss': '',
        'http://purl.org/rss/1.0/': '',
        'http://my.netscape.com/rdf/simple/0.9/': '',
        'http://example.com/newformat#': '',
        'http://example.com/necho': '',
        'http://purl.org/echo/': '',
        'uri/of/echo/namespace#': '',
        'http://purl.org/pie/': '',
        'http://purl.org/atom/ns#': '',
        'http://www.w3.org/2005/Atom': '',
        'http://purl.org/rss/1.0/modules/rss091#': '',

        'http://webns.net/mvcb/': 'admin',
        'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
        'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
        'http://media.tangent.org/rss/1.0/': 'audio',
        'http://backend.userland.com/blogChannelModule': 'blogChannel',
        'http://web.resource.org/cc/': 'cc',
        'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
        'http://purl.org/rss/1.0/modules/company': 'co',
        'http://purl.org/rss/1.0/modules/content/': 'content',
        'http://my.theinfo.org/changed/1.0/rss/': 'cp',
        'http://purl.org/dc/elements/1.1/': 'dc',
        'http://purl.org/dc/terms/': 'dcterms',
        'http://purl.org/rss/1.0/modules/email/': 'email',
        'http://purl.org/rss/1.0/modules/event/': 'ev',
        'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
        'http://freshmeat.net/rss/fm/': 'fm',
        'http://xmlns.com/foaf/0.1/': 'foaf',
        'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
        'http://www.georss.org/georss': 'georss',
        'http://www.opengis.net/gml': 'gml',
        'http://postneo.com/icbm/': 'icbm',
        'http://purl.org/rss/1.0/modules/image/': 'image',
        'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
        'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
        'http://purl.org/rss/1.0/modules/link/': 'l',
        'http://search.yahoo.com/mrss': 'media',
        # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
        'http://search.yahoo.com/mrss/': 'media',
        'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
        'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
        'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
        'http://purl.org/rss/1.0/modules/reference/': 'ref',
        'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
        'http://purl.org/rss/1.0/modules/search/': 'search',
        'http://purl.org/rss/1.0/modules/slash/': 'slash',
        'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
        'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
        'http://hacks.benhammersley.com/rss/streaming/': 'str',
        'http://purl.org/rss/1.0/modules/subscription/': 'sub',
        'http://purl.org/rss/1.0/modules/syndication/': 'sy',
        'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
        'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
        'http://purl.org/rss/1.0/modules/threading/': 'thr',
        'http://purl.org/rss/1.0/modules/textinput/': 'ti',
        'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
        'http://wellformedweb.org/commentAPI/': 'wfw',
        'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
        'http://www.w3.org/1999/xhtml': 'xhtml',
        'http://www.w3.org/1999/xlink': 'xlink',
        'http://www.w3.org/XML/1998/namespace': 'xml',
        'http://podlove.org/simple-chapters': 'psc',
    }
    _matchnamespaces = {}

    can_be_relative_uri = set(['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'])
    can_contain_relative_uris = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
    can_contain_dangerous_markup = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
    html_types = [u'text/html', u'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0

        # georss
        self.ingeometry = 0

        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or u''
        self.lang = baselang or None
        self.svgOK = 0
        self.title_depth = -1
        self.depth = 0
        self.psc_chapters_counter = 0
        if baselang:
            self.feeddata['language'] = baselang.replace('_','-')

        # A map of the following form:
        #     {
        #         object_that_value_is_set_on: {
        #             property_name: depth_of_node_property_was_extracted_from,
        #             other_property: depth_of_node_property_was_extracted_from,
        #         },
        #     }
        self.property_depth_map = {}

    def _normalize_attributes(self, kv):
        k = kv[0].lower()
        v = k in ('rel', 'type') and kv[1].lower() or kv[1]
        # the sgml parser doesn't handle entities in attributes, nor
        # does it pass the attribute values through as unicode, while
        # strict xml parsers do -- account for this difference
        if isinstance(self, _LooseFeedParser):
            v = v.replace('&amp;', '&')
        if not isinstance(v, unicode):
            v = v.decode('utf-8')
        return (k, v)

    def unknown_starttag(self, tag, attrs):
        # increment depth counter
        self.depth += 1

        # normalize attrs
        attrs = map(self._normalize_attributes, attrs)

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if not isinstance(baseuri, unicode):
            baseuri = baseuri.decode(self.encoding, 'ignore')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javascript:`)
        if self.baseuri:
            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
        else:
            self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang.replace('_','-')
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            if tag.find(':') <> -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns',namespace))
                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
                    attrs.append(('xmlns',namespace))
            if tag == 'svg':
                self.svgOK += 1
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
            if len(attrsD) == 0:
                # No attributes so merge it into the enclosing dictionary
                return self.push(unknown_tag, 1)
            else:
                # Has attributes so create it in its own dictionary
                context = self._getContext()
                context[unknown_tag] = attrsD

    def unknown_endtag(self, tag):
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK:
            self.svgOK -= 1

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            if self.svgOK:
                raise AttributeError()
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

        self.depth -= 1

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack:
            return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack:
            return
        if escape and self.contentparams.get('type') == u'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # CDATA block began but didn't finish
                k = len(self.rawdata)
                return k
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            if k >= 0:
                return k+1
            else:
                # We have an incomplete CDATA block.
                return k

    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = u'text/plain'
        elif contentType == 'html':
            contentType = u'text/html'
        elif contentType == 'xhtml':
            contentType = u'application/xhtml+xml'
        return contentType

    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if not self.version:
            if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
                self.version = u'rss090'
            elif loweruri == 'http://purl.org/rss/1.0/':
                self.version = u'rss10'
            elif loweruri == 'http://www.w3.org/2005/atom':
                self.version = u'atom10'
        if loweruri.find(u'backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = u'http://backend.userland.com/rss'
            loweruri = uri
        if loweruri in self._matchnamespaces:
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or u'', uri)

    def decodeEntities(self, element, data):
        return data

    def strattrs(self, attrs):
        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack:
            return
        if self.elementstack[-1][0] != element:
            return

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            #    <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
                del pieces[-1]
            while pieces and len(pieces)>1 and not pieces[0].strip():
                del pieces[0]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                depth = 0
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                        depth -= 1
                        if depth == 0:
                            break
                    elif piece.startswith('<') and not piece.endswith('/>'):
                        depth += 1
                else:
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, unicode):
                pieces[i] = v.decode('utf-8')

        output = u''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText:
            return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = _base64decode(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass
            except TypeError:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # some feed formats require consumers to guess
        # whether the content is html or plain text
        if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
            if self.lookslikehtml(output):
                self.contentparams['type'] = u'text/html'

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))

        if self.encoding and not isinstance(output, unicode):
            output = output.decode(self.encoding, 'ignore')

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
            try:
                output = output.encode('iso-8859-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                pass

        # map win-1252 extensions to the proper code points
        if isinstance(output, unicode):
            output = output.translate(_cp1252)

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        if element == 'title' and -1 < self.title_depth <= self.depth:
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                if not self.inimage:
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                    # unhandled character references. fix this special case.
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                    if output:
                        self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
                if old_value_depth is None or self.depth <= old_value_depth:
                    self.property_depth_map[self.entries[-1]][element] = self.depth
                    self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        if self.lang:
            self.lang=self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    # a number of elements in a number of RSS variants are nominally plain
    # text, but this is routinely ignored. This is an attempt to detect
    # the most common cases. As false positives often result in silent
    # data loss, this function errs on the conservative side.
    @staticmethod
    def lookslikehtml(s):
        # must have a close tag or an entity reference to qualify
        if not (re.search(r'</(\w+)>', s) or re.search('&#?\w+;', s)):
            return

        # all tags must be in a restricted subset of valid HTML tags
        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
            re.findall(r'</?(\w+)', s)):
            return

        # all entities must have been defined as valid HTML entities
        if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
            return

        return 1

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith(u'text/'):
            return 0
        if self.contentparams['type'].endswith(u'+xml'):
            return 0
        if self.contentparams['type'].endswith(u'/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value, overwrite=False):
        context = self._getContext()
        if overwrite:
            context[key] = value
        else:
            context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': u'rss091u',
                      '0.92': u'rss092',
                      '0.93': u'rss093',
                      '0.94': u'rss094'}
        #If we're here then this is an RSS feed.
        #If we don't have a version or have a version that starts with something
        #other than RSS then there's been a mistake. Correct it.
        if not self.version or not self.version.startswith(u'rss'):
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = u'rss20'
            else:
                self.version = u'rss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)

    def _cdf_common(self, attrsD):
        if 'lastmod' in attrsD:
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if 'href' in attrsD:
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': u'atom01',
                      '0.2': u'atom02',
                      '0.3': u'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = u'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        self.title_depth = -1
        self.push('image', 0)

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        self.title_depth = -1
        self.push('textinput', 0)
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except ValueError:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except ValueError:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inimage and 'image' in self.feeddata:
            context = self.feeddata['image']
        elif self.intextinput:
            context = self.feeddata['textinput']
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()
        context.setdefault('authors', [FeedParserDict()])
        context['authors'][-1][key] = value

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = u'%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author, email = context.get(key), None
            if not author:
                return
            emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
            if emailmatch:
                email = emailmatch.group(0)
                # probably a better way to do the following, but it passes all the tests
                author = author.replace(email, u'')
                author = author.replace(u'()', u'')
                author = author.replace(u'<>', u'')
                author = author.replace(u'&lt;&gt;', u'')
                author = author.strip()
                if author and (author[0] == u'('):
                    author = author[1:]
                if author and (author[-1] == u')'):
                    author = author[:-1]
                author = author.strip()
            if author or email:
                context.setdefault('%s_detail' % key, FeedParserDict())
            if author:
                context['%s_detail' % key]['name'] = author
            if email:
                context['%s_detail' % key]['email'] = email

    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, u'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, u'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights

    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        self.title_depth = -1
        self.psc_chapters_counter = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published
    _start_pubdate = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_issued = _end_published
    _end_issued = _end_published
    _end_pubdate = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_dc_date = _start_updated
    _start_lastbuilddate = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value, overwrite=True)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_dc_date = _end_updated
    _end_lastbuilddate = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)

    # geospatial location, or "where", from georss.org

    def _start_georssgeom(self, attrsD):
        self.push('geometry', 0)
        context = self._getContext()
        context['where'] = FeedParserDict()

    _start_georss_point = _start_georssgeom
    _start_georss_line = _start_georssgeom
    _start_georss_polygon = _start_georssgeom
    _start_georss_box = _start_georssgeom

    def _save_where(self, geometry):
        context = self._getContext()
        context['where'].update(geometry)

    def _end_georss_point(self):
        geometry = _parse_georss_point(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)

    def _end_georss_line(self):
        geometry = _parse_georss_line(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)

    def _end_georss_polygon(self):
        this = self.pop('geometry')
        geometry = _parse_georss_polygon(this)
        if geometry:
            self._save_where(geometry)

    def _end_georss_box(self):
        geometry = _parse_georss_box(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)

    def _start_where(self, attrsD):
        self.push('where', 0)
        context = self._getContext()
        context['where'] = FeedParserDict()
    _start_georss_where = _start_where

    def _parse_srs_attrs(self, attrsD):
        srsName = attrsD.get('srsname')
        try:
            srsDimension = int(attrsD.get('srsdimension', '2'))
        except ValueError:
            srsDimension = 2
        context = self._getContext()
        context['where']['srsName'] = srsName
        context['where']['srsDimension'] = srsDimension

    def _start_gml_point(self, attrsD):
        self._parse_srs_attrs(attrsD)
        self.ingeometry = 1
        self.push('geometry', 0)

    def _start_gml_linestring(self, attrsD):
        self._parse_srs_attrs(attrsD)
        self.ingeometry = 'linestring'
        self.push('geometry', 0)

    def _start_gml_polygon(self, attrsD):
        self._parse_srs_attrs(attrsD)
        self.push('geometry', 0)

    def _start_gml_exterior(self, attrsD):
        self.push('geometry', 0)

    def _start_gml_linearring(self, attrsD):
        self.ingeometry = 'polygon'
        self.push('geometry', 0)

    def _start_gml_pos(self, attrsD):
        self.push('pos', 0)

    def _end_gml_pos(self):
        this = self.pop('pos')
        context = self._getContext()
        srsName = context['where'].get('srsName')
        srsDimension = context['where'].get('srsDimension', 2)
        swap = True
        if srsName and "EPSG" in srsName:
            epsg = int(srsName.split(":")[-1])
            swap = bool(epsg in _geogCS)
geometry = _parse_georss_point(this, swap=swap, dims=srsDimension) 1508 | if geometry: 1509 | self._save_where(geometry) 1510 | 1511 | def _start_gml_poslist(self, attrsD): 1512 | self.push('pos', 0) 1513 | 1514 | def _end_gml_poslist(self): 1515 | this = self.pop('pos') 1516 | context = self._getContext() 1517 | srsName = context['where'].get('srsName') 1518 | srsDimension = context['where'].get('srsDimension', 2) 1519 | swap = True 1520 | if srsName and "EPSG" in srsName: 1521 | epsg = int(srsName.split(":")[-1]) 1522 | swap = bool(epsg in _geogCS) 1523 | geometry = _parse_poslist( 1524 | this, self.ingeometry, swap=swap, dims=srsDimension) 1525 | if geometry: 1526 | self._save_where(geometry) 1527 | 1528 | def _end_geom(self): 1529 | self.ingeometry = 0 1530 | self.pop('geometry') 1531 | _end_gml_point = _end_geom 1532 | _end_gml_linestring = _end_geom 1533 | _end_gml_linearring = _end_geom 1534 | _end_gml_exterior = _end_geom 1535 | _end_gml_polygon = _end_geom 1536 | 1537 | def _end_where(self): 1538 | self.pop('where') 1539 | _end_georss_where = _end_where 1540 | 1541 | # end geospatial 1542 | 1543 | def _start_cc_license(self, attrsD): 1544 | context = self._getContext() 1545 | value = self._getAttribute(attrsD, 'rdf:resource') 1546 | attrsD = FeedParserDict() 1547 | attrsD['rel'] = u'license' 1548 | if value: 1549 | attrsD['href']=value 1550 | context.setdefault('links', []).append(attrsD) 1551 | 1552 | def _start_creativecommons_license(self, attrsD): 1553 | self.push('license', 1) 1554 | _start_creativeCommons_license = _start_creativecommons_license 1555 | 1556 | def _end_creativecommons_license(self): 1557 | value = self.pop('license') 1558 | context = self._getContext() 1559 | attrsD = FeedParserDict() 1560 | attrsD['rel'] = u'license' 1561 | if value: 1562 | attrsD['href'] = value 1563 | context.setdefault('links', []).append(attrsD) 1564 | del context['license'] 1565 | _end_creativeCommons_license = _end_creativecommons_license 1566 | 1567 | def _addTag(self, term, scheme, label): 1568 | context = self._getContext() 1569 | tags = context.setdefault('tags', []) 1570 | if (not term) and (not scheme) and (not label): 1571 | return 1572 | value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) 1573 | if value not in tags: 1574 | tags.append(value) 1575 | 1576 | def _start_category(self, attrsD): 1577 | term = attrsD.get('term') 1578 | scheme = attrsD.get('scheme', attrsD.get('domain')) 1579 | label = attrsD.get('label') 1580 | self._addTag(term, scheme, label) 1581 | self.push('category', 1) 1582 | _start_dc_subject = _start_category 1583 | _start_keywords = _start_category 1584 | 1585 | def _start_media_category(self, attrsD): 1586 | attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema') 1587 | self._start_category(attrsD) 1588 | 1589 | def _end_itunes_keywords(self): 1590 | for term in self.pop('itunes_keywords').split(','): 1591 | if term.strip(): 1592 | self._addTag(term.strip(), u'http://www.itunes.com/', None) 1593 | 1594 | def _start_itunes_category(self, attrsD): 1595 | self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None) 1596 | self.push('category', 1) 1597 | 1598 | def _end_category(self): 1599 | value = self.pop('category') 1600 | if not value: 1601 | return 1602 | context = self._getContext() 1603 | tags = context['tags'] 1604 | if value and len(tags) and not tags[-1]['term']: 1605 | tags[-1]['term'] = value 1606 | else: 1607 | self._addTag(value, None, None) 1608 | _end_dc_subject = _end_category 1609 | 
_end_keywords = _end_category 1610 | _end_itunes_category = _end_category 1611 | _end_media_category = _end_category 1612 | 1613 | def _start_cloud(self, attrsD): 1614 | self._getContext()['cloud'] = FeedParserDict(attrsD) 1615 | 1616 | def _start_link(self, attrsD): 1617 | attrsD.setdefault('rel', u'alternate') 1618 | if attrsD['rel'] == u'self': 1619 | attrsD.setdefault('type', u'application/atom+xml') 1620 | else: 1621 | attrsD.setdefault('type', u'text/html') 1622 | context = self._getContext() 1623 | attrsD = self._itsAnHrefDamnIt(attrsD) 1624 | if 'href' in attrsD: 1625 | attrsD['href'] = self.resolveURI(attrsD['href']) 1626 | expectingText = self.infeed or self.inentry or self.insource 1627 | context.setdefault('links', []) 1628 | if not (self.inentry and self.inimage): 1629 | context['links'].append(FeedParserDict(attrsD)) 1630 | if 'href' in attrsD: 1631 | expectingText = 0 1632 | if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): 1633 | context['link'] = attrsD['href'] 1634 | else: 1635 | self.push('link', expectingText) 1636 | 1637 | def _end_link(self): 1638 | value = self.pop('link') 1639 | 1640 | def _start_guid(self, attrsD): 1641 | self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') 1642 | self.push('id', 1) 1643 | _start_id = _start_guid 1644 | 1645 | def _end_guid(self): 1646 | value = self.pop('id') 1647 | self._save('guidislink', self.guidislink and 'link' not in self._getContext()) 1648 | if self.guidislink: 1649 | # guid acts as link, but only if 'ispermalink' is not present or is 'true', 1650 | # and only if the item doesn't already have a link element 1651 | self._save('link', value) 1652 | _end_id = _end_guid 1653 | 1654 | def _start_title(self, attrsD): 1655 | if self.svgOK: 1656 | return self.unknown_starttag('title', attrsD.items()) 1657 | self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource) 1658 | _start_dc_title = _start_title 1659 | _start_media_title = _start_title 1660 | 1661 | def _end_title(self): 1662 | if self.svgOK: 1663 | return 1664 | value = self.popContent('title') 1665 | if not value: 1666 | return 1667 | self.title_depth = self.depth 1668 | _end_dc_title = _end_title 1669 | 1670 | def _end_media_title(self): 1671 | title_depth = self.title_depth 1672 | self._end_title() 1673 | self.title_depth = title_depth 1674 | 1675 | def _start_description(self, attrsD): 1676 | context = self._getContext() 1677 | if 'summary' in context: 1678 | self._summaryKey = 'content' 1679 | self._start_content(attrsD) 1680 | else: 1681 | self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource) 1682 | _start_dc_description = _start_description 1683 | _start_media_description = _start_description 1684 | 1685 | def _start_abstract(self, attrsD): 1686 | self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource) 1687 | 1688 | def _end_description(self): 1689 | if self._summaryKey == 'content': 1690 | self._end_content() 1691 | else: 1692 | value = self.popContent('description') 1693 | self._summaryKey = None 1694 | _end_abstract = _end_description 1695 | _end_dc_description = _end_description 1696 | _end_media_description = _end_description 1697 | 1698 | def _start_info(self, attrsD): 1699 | self.pushContent('info', attrsD, u'text/plain', 1) 1700 | _start_feedburner_browserfriendly = _start_info 1701 | 1702 | def _end_info(self): 1703 | self.popContent('info') 1704 | 
_end_feedburner_browserfriendly = _end_info 1705 | 1706 | def _start_generator(self, attrsD): 1707 | if attrsD: 1708 | attrsD = self._itsAnHrefDamnIt(attrsD) 1709 | if 'href' in attrsD: 1710 | attrsD['href'] = self.resolveURI(attrsD['href']) 1711 | self._getContext()['generator_detail'] = FeedParserDict(attrsD) 1712 | self.push('generator', 1) 1713 | 1714 | def _end_generator(self): 1715 | value = self.pop('generator') 1716 | context = self._getContext() 1717 | if 'generator_detail' in context: 1718 | context['generator_detail']['name'] = value 1719 | 1720 | def _start_admin_generatoragent(self, attrsD): 1721 | self.push('generator', 1) 1722 | value = self._getAttribute(attrsD, 'rdf:resource') 1723 | if value: 1724 | self.elementstack[-1][2].append(value) 1725 | self.pop('generator') 1726 | self._getContext()['generator_detail'] = FeedParserDict({'href': value}) 1727 | 1728 | def _start_admin_errorreportsto(self, attrsD): 1729 | self.push('errorreportsto', 1) 1730 | value = self._getAttribute(attrsD, 'rdf:resource') 1731 | if value: 1732 | self.elementstack[-1][2].append(value) 1733 | self.pop('errorreportsto') 1734 | 1735 | def _start_summary(self, attrsD): 1736 | context = self._getContext() 1737 | if 'summary' in context: 1738 | self._summaryKey = 'content' 1739 | self._start_content(attrsD) 1740 | else: 1741 | self._summaryKey = 'summary' 1742 | self.pushContent(self._summaryKey, attrsD, u'text/plain', 1) 1743 | _start_itunes_summary = _start_summary 1744 | 1745 | def _end_summary(self): 1746 | if self._summaryKey == 'content': 1747 | self._end_content() 1748 | else: 1749 | self.popContent(self._summaryKey or 'summary') 1750 | self._summaryKey = None 1751 | _end_itunes_summary = _end_summary 1752 | 1753 | def _start_enclosure(self, attrsD): 1754 | attrsD = self._itsAnHrefDamnIt(attrsD) 1755 | context = self._getContext() 1756 | attrsD['rel'] = u'enclosure' 1757 | context.setdefault('links', []).append(FeedParserDict(attrsD)) 1758 | 1759 | def _start_source(self, attrsD): 1760 | if 'url' in attrsD: 1761 | # This means that we're processing a source element from an RSS 2.0 feed 1762 | self.sourcedata['href'] = attrsD[u'url'] 1763 | self.push('source', 1) 1764 | self.insource = 1 1765 | self.title_depth = -1 1766 | 1767 | def _end_source(self): 1768 | self.insource = 0 1769 | value = self.pop('source') 1770 | if value: 1771 | self.sourcedata['title'] = value 1772 | self._getContext()['source'] = copy.deepcopy(self.sourcedata) 1773 | self.sourcedata.clear() 1774 | 1775 | def _start_content(self, attrsD): 1776 | self.pushContent('content', attrsD, u'text/plain', 1) 1777 | src = attrsD.get('src') 1778 | if src: 1779 | self.contentparams['src'] = src 1780 | self.push('content', 1) 1781 | 1782 | def _start_body(self, attrsD): 1783 | self.pushContent('content', attrsD, u'application/xhtml+xml', 1) 1784 | _start_xhtml_body = _start_body 1785 | 1786 | def _start_content_encoded(self, attrsD): 1787 | self.pushContent('content', attrsD, u'text/html', 1) 1788 | _start_fullitem = _start_content_encoded 1789 | 1790 | def _end_content(self): 1791 | copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types) 1792 | value = self.popContent('content') 1793 | if copyToSummary: 1794 | self._save('summary', value) 1795 | 1796 | _end_body = _end_content 1797 | _end_xhtml_body = _end_content 1798 | _end_content_encoded = _end_content 1799 | _end_fullitem = _end_content 1800 | 1801 | def _start_itunes_image(self, attrsD): 1802 | self.push('itunes_image', 0) 1803 | 
if attrsD.get('href'): 1804 | self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) 1805 | elif attrsD.get('url'): 1806 | self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')}) 1807 | _start_itunes_link = _start_itunes_image 1808 | 1809 | def _end_itunes_block(self): 1810 | value = self.pop('itunes_block', 0) 1811 | self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 1812 | 1813 | def _end_itunes_explicit(self): 1814 | value = self.pop('itunes_explicit', 0) 1815 | # Convert 'yes' -> True, 'clean' to False, and any other value to None 1816 | # False and None both evaluate as False, so the difference can be ignored 1817 | # by applications that only need to know if the content is explicit. 1818 | self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] 1819 | 1820 | def _start_media_group(self, attrsD): 1821 | # don't do anything, but don't break the enclosed tags either 1822 | pass 1823 | 1824 | def _start_media_credit(self, attrsD): 1825 | context = self._getContext() 1826 | context.setdefault('media_credit', []) 1827 | context['media_credit'].append(attrsD) 1828 | self.push('credit', 1) 1829 | 1830 | def _end_media_credit(self): 1831 | credit = self.pop('credit') 1832 | if credit != None and len(credit.strip()) != 0: 1833 | context = self._getContext() 1834 | context['media_credit'][-1]['content'] = credit 1835 | 1836 | def _start_media_restriction(self, attrsD): 1837 | context = self._getContext() 1838 | context.setdefault('media_restriction', attrsD) 1839 | self.push('restriction', 1) 1840 | 1841 | def _end_media_restriction(self): 1842 | restriction = self.pop('restriction') 1843 | if restriction != None and len(restriction.strip()) != 0: 1844 | context = self._getContext() 1845 | context['media_restriction']['content'] = restriction 1846 | 1847 | def _start_media_license(self, attrsD): 1848 | context = self._getContext() 1849 | context.setdefault('media_license', attrsD) 1850 | self.push('license', 1) 1851 | 1852 | def _end_media_license(self): 1853 | license = self.pop('license') 1854 | if license != None and len(license.strip()) != 0: 1855 | context = self._getContext() 1856 | context['media_license']['content'] = license 1857 | 1858 | def _start_media_content(self, attrsD): 1859 | context = self._getContext() 1860 | context.setdefault('media_content', []) 1861 | context['media_content'].append(attrsD) 1862 | 1863 | def _start_media_thumbnail(self, attrsD): 1864 | context = self._getContext() 1865 | context.setdefault('media_thumbnail', []) 1866 | self.push('url', 1) # new 1867 | context['media_thumbnail'].append(attrsD) 1868 | 1869 | def _end_media_thumbnail(self): 1870 | url = self.pop('url') 1871 | context = self._getContext() 1872 | if url != None and len(url.strip()) != 0: 1873 | if 'url' not in context['media_thumbnail'][-1]: 1874 | context['media_thumbnail'][-1]['url'] = url 1875 | 1876 | def _start_media_player(self, attrsD): 1877 | self.push('media_player', 0) 1878 | self._getContext()['media_player'] = FeedParserDict(attrsD) 1879 | 1880 | def _end_media_player(self): 1881 | value = self.pop('media_player') 1882 | context = self._getContext() 1883 | context['media_player']['content'] = value 1884 | 1885 | def _start_newlocation(self, attrsD): 1886 | self.push('newlocation', 1) 1887 | 1888 | def _end_newlocation(self): 1889 | url = self.pop('newlocation') 1890 | context = self._getContext() 1891 | # don't set newlocation if the context isn't right 1892 | if context is not 
self.feeddata: 1893 | return 1894 | context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip()) 1895 | 1896 | def _start_psc_chapters(self, attrsD): 1897 | version = self._getAttribute(attrsD, 'version') 1898 | if version == '1.1' and self.psc_chapters_counter == 0: 1899 | self.psc_chapters_counter += 1 1900 | attrsD['chapters'] = [] 1901 | self._getContext()['psc_chapters'] = FeedParserDict(attrsD) 1902 | 1903 | def _end_psc_chapters(self): 1904 | version = self._getContext()['psc_chapters']['version'] 1905 | if version == '1.1': 1906 | self.psc_chapters_counter += 1 1907 | 1908 | def _start_psc_chapter(self, attrsD): 1909 | if self.psc_chapters_counter == 1: 1910 | start = self._getAttribute(attrsD, 'start') 1911 | attrsD['start_parsed'] = _parse_psc_chapter_start(start) 1912 | 1913 | context = self._getContext()['psc_chapters'] 1914 | context['chapters'].append(FeedParserDict(attrsD)) 1915 | 1916 | 1917 | if _XML_AVAILABLE: 1918 | class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): 1919 | def __init__(self, baseuri, baselang, encoding): 1920 | xml.sax.handler.ContentHandler.__init__(self) 1921 | _FeedParserMixin.__init__(self, baseuri, baselang, encoding) 1922 | self.bozo = 0 1923 | self.exc = None 1924 | self.decls = {} 1925 | 1926 | def startPrefixMapping(self, prefix, uri): 1927 | if not uri: 1928 | return 1929 | # Jython uses '' instead of None; standardize on None 1930 | prefix = prefix or None 1931 | self.trackNamespace(prefix, uri) 1932 | if prefix and uri == 'http://www.w3.org/1999/xlink': 1933 | self.decls['xmlns:' + prefix] = uri 1934 | 1935 | def startElementNS(self, name, qname, attrs): 1936 | namespace, localname = name 1937 | lowernamespace = str(namespace or '').lower() 1938 | if lowernamespace.find(u'backend.userland.com/rss') <> -1: 1939 | # match any backend.userland.com namespace 1940 | namespace = u'http://backend.userland.com/rss' 1941 | lowernamespace = namespace 1942 | if qname and qname.find(':') > 0: 1943 | givenprefix = qname.split(':')[0] 1944 | else: 1945 | givenprefix = None 1946 | prefix = self._matchnamespaces.get(lowernamespace, givenprefix) 1947 | if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse: 1948 | raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix 1949 | localname = str(localname).lower() 1950 | 1951 | # qname implementation is horribly broken in Python 2.1 (it 1952 | # doesn't report any), and slightly broken in Python 2.2 (it 1953 | # doesn't report the xml: namespace). So we match up namespaces 1954 | # with a known list first, and then possibly override them with 1955 | # the qnames the SAX parser gives us (if indeed it gives us any 1956 | # at all). Thanks to MatejC for helping me test this and 1957 | # tirelessly telling me that it didn't work yet. 
1958 | attrsD, self.decls = self.decls, {} 1959 | if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': 1960 | attrsD['xmlns']=namespace 1961 | if localname=='svg' and namespace=='http://www.w3.org/2000/svg': 1962 | attrsD['xmlns']=namespace 1963 | 1964 | if prefix: 1965 | localname = prefix.lower() + ':' + localname 1966 | elif namespace and not qname: #Expat 1967 | for name,value in self.namespacesInUse.items(): 1968 | if name and value == namespace: 1969 | localname = name + ':' + localname 1970 | break 1971 | 1972 | for (namespace, attrlocalname), attrvalue in attrs.items(): 1973 | lowernamespace = (namespace or '').lower() 1974 | prefix = self._matchnamespaces.get(lowernamespace, '') 1975 | if prefix: 1976 | attrlocalname = prefix + ':' + attrlocalname 1977 | attrsD[str(attrlocalname).lower()] = attrvalue 1978 | for qname in attrs.getQNames(): 1979 | attrsD[str(qname).lower()] = attrs.getValueByQName(qname) 1980 | localname = str(localname).lower() 1981 | self.unknown_starttag(localname, attrsD.items()) 1982 | 1983 | def characters(self, text): 1984 | self.handle_data(text) 1985 | 1986 | def endElementNS(self, name, qname): 1987 | namespace, localname = name 1988 | lowernamespace = str(namespace or '').lower() 1989 | if qname and qname.find(':') > 0: 1990 | givenprefix = qname.split(':')[0] 1991 | else: 1992 | givenprefix = '' 1993 | prefix = self._matchnamespaces.get(lowernamespace, givenprefix) 1994 | if prefix: 1995 | localname = prefix + ':' + localname 1996 | elif namespace and not qname: #Expat 1997 | for name,value in self.namespacesInUse.items(): 1998 | if name and value == namespace: 1999 | localname = name + ':' + localname 2000 | break 2001 | localname = str(localname).lower() 2002 | self.unknown_endtag(localname) 2003 | 2004 | def error(self, exc): 2005 | self.bozo = 1 2006 | self.exc = exc 2007 | 2008 | # drv_libxml2 calls warning() in some cases 2009 | warning = error 2010 | 2011 | def fatalError(self, exc): 2012 | self.error(exc) 2013 | raise exc 2014 | 2015 | class _BaseHTMLProcessor(sgmllib.SGMLParser): 2016 | special = re.compile('''[<>'"]''') 2017 | bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") 2018 | elements_no_end_tag = set([ 2019 | 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 2020 | 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 2021 | 'source', 'track', 'wbr' 2022 | ]) 2023 | 2024 | def __init__(self, encoding, _type): 2025 | self.encoding = encoding 2026 | self._type = _type 2027 | sgmllib.SGMLParser.__init__(self) 2028 | 2029 | def reset(self): 2030 | self.pieces = [] 2031 | sgmllib.SGMLParser.reset(self) 2032 | 2033 | def _shorttag_replace(self, match): 2034 | tag = match.group(1) 2035 | if tag in self.elements_no_end_tag: 2036 | return '<' + tag + ' />' 2037 | else: 2038 | return '<' + tag + '>' 2039 | 2040 | # By declaring these methods and overriding their compiled code 2041 | # with the code from sgmllib, the original code will execute in 2042 | # feedparser's scope instead of sgmllib's. This means that the 2043 | # `tagfind` and `charref` regular expressions will be found as 2044 | # they're declared above, not as they're declared in sgmllib. 
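    # In other words (Python 2 semantics): a function's global namespace is
    # bound to the module the function was *defined* in, while its `func_code`
    # can be swapped freely, so roughly:
    #
    #     def goahead(self, i): pass
    #     goahead.func_code = sgmllib.SGMLParser.goahead.func_code
    #
    # makes goahead() run sgmllib's bytecode while free names like `tagfind`
    # still resolve against this module.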
2045 |     def goahead(self, i):
2046 |         pass
2047 |     goahead.func_code = sgmllib.SGMLParser.goahead.func_code
2048 | 
2049 |     def __parse_starttag(self, i):
2050 |         pass
2051 |     __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
2052 | 
2053 |     def parse_starttag(self,i):
2054 |         j = self.__parse_starttag(i)
2055 |         if self._type == 'application/xhtml+xml':
2056 |             if j>2 and self.rawdata[j-2:j]=='/>':
2057 |                 self.unknown_endtag(self.lasttag)
2058 |         return j
2059 | 
2060 |     def feed(self, data):
2061 |         data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
2062 |         data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
2063 |         data = data.replace('&#39;', "'")
2064 |         data = data.replace('&#34;', '"')
2065 |         try:
2066 |             bytes
2067 |             if bytes is str:
2068 |                 raise NameError
2069 |             self.encoding = self.encoding + u'_INVALID_PYTHON_3'
2070 |         except NameError:
2071 |             if self.encoding and isinstance(data, unicode):
2072 |                 data = data.encode(self.encoding)
2073 |         sgmllib.SGMLParser.feed(self, data)
2074 |         sgmllib.SGMLParser.close(self)
2075 | 
2076 |     def normalize_attrs(self, attrs):
2077 |         if not attrs:
2078 |             return attrs
2079 |         # utility method to be called by descendants
2080 |         attrs = dict([(k.lower(), v) for k, v in attrs]).items()
2081 |         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
2082 |         attrs.sort()
2083 |         return attrs
2084 | 
2085 |     def unknown_starttag(self, tag, attrs):
2086 |         # called for each start tag
2087 |         # attrs is a list of (attr, value) tuples
2088 |         # e.g. for <pre class="screen">, tag='pre', attrs=[('class', 'screen')]
2089 |         uattrs = []
2090 |         strattrs=''
2091 |         if attrs:
2092 |             for key, value in attrs:
2093 |                 value=value.replace('&gt;','>').replace('&lt;','<').replace('&quot;','"')
2094 |                 value = self.bare_ampersand.sub("&amp;", value)
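                # e.g. 'A & B &amp; C' becomes 'A &amp; B &amp; C'; the
                # negative lookahead in bare_ampersand leaves existing
                # character and entity references untouched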
2095 |                 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
2096 |                 if not isinstance(value, unicode):
2097 |                     value = value.decode(self.encoding, 'ignore')
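                # e.g. with a utf-8 feed, the raw bytes 'caf\xc3\xa9' decode
                # to u'caf\xe9'; 'ignore' drops bytes invalid in self.encoding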
2098 |                 try:
2099 |                     # Currently, in Python 3 the key is already a str, and cannot be decoded again
2100 |                     uattrs.append((unicode(key, self.encoding), value))
2101 |                 except TypeError:
2102 |                     uattrs.append((key, value))
2103 |             strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
2104 |             if self.encoding:
2105 |                 try:
2106 |                     strattrs = strattrs.encode(self.encoding)
2107 |                 except (UnicodeEncodeError, LookupError):
2108 |                     pass
2109 |         if tag in self.elements_no_end_tag:
2110 |             self.pieces.append('<%s%s />' % (tag, strattrs))
2111 |         else:
2112 |             self.pieces.append('<%s%s>' % (tag, strattrs))
2113 | 
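    # Illustrative sketch (Python 2; not from the original test suite) of the
    # round-trip behavior these handlers produce together:
    #
    #     >>> p = _BaseHTMLProcessor('utf-8', 'text/html')
    #     >>> p.feed('<p class="x">one &amp; two<br></p>')
    #     >>> p.output()
    #     '<p class="x">one &amp; two<br /></p>'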
2114 |     def unknown_endtag(self, tag):
2115 |         # called for each end tag, e.g. for </pre>, tag will be 'pre'
2116 |         # Reconstruct the original end tag.
2117 |         if tag not in self.elements_no_end_tag:
2118 |             self.pieces.append("</%s>" % tag)
2119 | 
2120 |     def handle_charref(self, ref):
2121 |         # called for each character reference, e.g. for '&#160;', ref will be '160'
2122 |         # Reconstruct the original character reference.
2123 |         ref = ref.lower()
2124 |         if ref.startswith('x'):
2125 |             value = int(ref[1:], 16)
2126 |         else:
2127 |             value = int(ref)
2128 | 
2129 |         if value in _cp1252:
2130 |             self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
2131 |         else:
2132 |             self.pieces.append('&#%s;' % ref)
2133 | 
2134 |     def handle_entityref(self, ref):
2135 |         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
2136 |         # Reconstruct the original entity reference.
2137 |         if ref in name2codepoint or ref == 'apos':
2138 |             self.pieces.append('&%s;' % ref)
2139 |         else:
2140 |             self.pieces.append('&amp;%s' % ref)
2141 | 
2142 |     def handle_data(self, text):
2143 |         # called for each block of plain text, i.e. outside of any tag and
2144 |         # not containing any character or entity references
2145 |         # Store the original text verbatim.
2146 |         self.pieces.append(text)
2147 | 
2148 |     def handle_comment(self, text):
2149 |         # called for each HTML comment, e.g. <!-- insert Python code here -->
2150 |         # Reconstruct the original comment.
2151 |         self.pieces.append('<!--%s-->' % text)
2152 | 
2153 |     def handle_pi(self, text):
2154 |         # called for each processing instruction, e.g. <?instruction>
2155 |         # Reconstruct original processing instruction.
2156 |         self.pieces.append('<?%s>' % text)
2157 | 
2158 |     def handle_decl(self, text):
2159 |         # called for the DOCTYPE, if present, e.g.
2160 |         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
2161 |         #     "http://www.w3.org/TR/html4/loose.dtd">
2162 |         # Reconstruct original DOCTYPE
2163 |         self.pieces.append('<!%s>' % text)
2164 | 
2165 |     _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
2166 |     def _scan_name(self, i, declstartpos):
2167 |         rawdata = self.rawdata
2168 |         n = len(rawdata)
2169 |         if i == n:
2170 |             return None, -1
2171 |         m = self._new_declname_match(rawdata, i)
2172 |         if m:
2173 |             s = m.group()
2174 |             name = s.strip()
2175 |             if (i + len(s)) == n:
2176 |                 return None, -1 # end of buffer
2177 |             return name.lower(), m.end()
2178 |         else:
2179 |             self.handle_data(rawdata)
2180 |             # self.updatepos(declstartpos, i)
2181 |             return None, -1
2182 | 
2183 |     def convert_charref(self, name):
2184 |         return '&#%s;' % name
2185 | 
2186 |     def convert_entityref(self, name):
2187 |         return '&%s;' % name
2188 | 
2189 |     def output(self):
2190 |         '''Return processed HTML as a single string'''
2191 |         return ''.join([str(p) for p in self.pieces])
2192 | 
2193 |     def parse_declaration(self, i):
2194 |         try:
2195 |             return sgmllib.SGMLParser.parse_declaration(self, i)
2196 |         except sgmllib.SGMLParseError:
2197 |             # escape the doctype declaration and continue parsing
2198 |             self.handle_data('&lt;')
2199 |             return i+1
2200 | 
2201 | class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
2202 |     def __init__(self, baseuri, baselang, encoding, entities):
2203 |         sgmllib.SGMLParser.__init__(self)
2204 |         _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
2205 |         _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
2206 |         self.entities=entities
2207 | 
2208 |     def decodeEntities(self, element, data):
2209 |         data = data.replace('&#60;', '&lt;')
2210 |         data = data.replace('&#x3c;', '&lt;')
2211 |         data = data.replace('&#x3C;', '&lt;')
2212 |         data = data.replace('&#62;', '&gt;')
2213 |         data = data.replace('&#x3e;', '&gt;')
2214 |         data = data.replace('&#x3E;', '&gt;')
2215 |         data = data.replace('&#38;', '&amp;')
2216 |         data = data.replace('&#x26;', '&amp;')
2217 |         data = data.replace('&#34;', '&quot;')
2218 |         data = data.replace('&#x22;', '&quot;')
2219 |         data = data.replace('&#39;', '&apos;')
2220 |         data = data.replace('&#x27;', '&apos;')
2221 |         if not self.contentparams.get('type', u'xml').endswith(u'xml'):
2222 |             data = data.replace('&lt;', '<')
2223 |             data = data.replace('&gt;', '>')
2224 |             data = data.replace('&amp;', '&')
2225 |             data = data.replace('&quot;', '"')
2226 |             data = data.replace('&apos;', "'")
2227 |         return data
2228 | 
2229 |     def strattrs(self, attrs):
2230 |         return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
2231 | 
2232 | class _RelativeURIResolver(_BaseHTMLProcessor):
2233 |     relative_uris = set([('a', 'href'),
2234 |                          ('applet', 'codebase'),
2235 |                          ('area', 'href'),
2236 |                          ('blockquote', 'cite'),
2237 |                          ('body', 'background'),
2238 |                          ('del', 'cite'),
2239 |                          ('form', 'action'),
2240 |                          ('frame', 'longdesc'),
2241 |                          ('frame', 'src'),
2242 |                          ('iframe', 'longdesc'),
2243 |                          ('iframe', 'src'),
2244 |                          ('head', 'profile'),
2245 |                          ('img', 'longdesc'),
2246 |                          ('img', 'src'),
2247 |                          ('img', 'usemap'),
2248 |                          ('input', 'src'),
2249 |                          ('input', 'usemap'),
2250 |                          ('ins', 'cite'),
2251 |                          ('link', 'href'),
2252 |                          ('object', 'classid'),
2253 |                          ('object', 'codebase'),
2254 |                          ('object', 'data'),
2255 |                          ('object', 'usemap'),
2256 |                          ('q', 'cite'),
2257 |                          ('script', 'src'),
2258 |                          ('video', 'poster')])
2259 | 
2260 |     def __init__(self, baseuri, encoding, _type):
2261 |         _BaseHTMLProcessor.__init__(self, encoding, _type)
2262 |         self.baseuri = baseuri
2263 | 
2264 |     def resolveURI(self, uri):
2265 |         return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
2266 | 
2267 |     def unknown_starttag(self, tag, attrs):
2268 |         attrs = self.normalize_attrs(attrs)
2269 |         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
2270 |         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
2271 | 
2272 | def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
2273 |     if not _SGML_AVAILABLE:
2274 |         return htmlSource
2275 | 
2276 |     p = _RelativeURIResolver(baseURI, encoding, _type)
2277 |     p.feed(htmlSource)
2278 |     return p.output()
2279 | 
2280 | def _makeSafeAbsoluteURI(base, rel=None):
2281 |     # bail if ACCEPTABLE_URI_SCHEMES is empty
2282 |     if not ACCEPTABLE_URI_SCHEMES:
2283 |         try:
2284 |             return _urljoin(base, rel or u'')
2285 |         except ValueError:
2286 |             return u''
2287 |     if not base:
2288 |         return rel or u''
2289 |     if not rel:
2290 |         try:
2291 |             scheme = urlparse.urlparse(base)[0]
2292 |         except ValueError:
2293 |             return u''
2294 |         if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
2295 |             return base
2296 |         return u''
2297 |     try:
2298 |         uri = _urljoin(base, rel)
2299 |     except ValueError:
2300 |         return u''
2301 |     if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
2302 |         return u''
2303 |     return uri
2304 | 
2305 | class _HTMLSanitizer(_BaseHTMLProcessor):
2306 |     acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
2307 |         'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
2308 |         'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
2309 |         'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
2310 |         'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
2311 |         'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
2312 |         'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
2313 |         'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
2314 |         'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
2315 |         'p', 'pre', 'progress', 'q', 's', 'samp', 'section',
'select', 2316 | 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 2317 | 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 2318 | 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']) 2319 | 2320 | acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey', 2321 | 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis', 2322 | 'background', 'balance', 'bgcolor', 'bgproperties', 'border', 2323 | 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', 2324 | 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', 2325 | 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', 2326 | 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data', 2327 | 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 2328 | 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', 2329 | 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', 2330 | 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', 2331 | 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', 2332 | 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', 2333 | 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', 2334 | 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', 2335 | 'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel', 2336 | 'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 2337 | 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 2338 | 'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 2339 | 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap', 2340 | 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml', 2341 | 'width', 'wrap', 'xml:lang']) 2342 | 2343 | unacceptable_elements_with_end_tag = set(['script', 'applet', 'style']) 2344 | 2345 | acceptable_css_properties = set(['azimuth', 'background-color', 2346 | 'border-bottom-color', 'border-collapse', 'border-color', 2347 | 'border-left-color', 'border-right-color', 'border-top-color', 'clear', 2348 | 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', 2349 | 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', 2350 | 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', 2351 | 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', 2352 | 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', 2353 | 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', 2354 | 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', 2355 | 'white-space', 'width']) 2356 | 2357 | # survey of common keywords found in feeds 2358 | acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue', 2359 | 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', 2360 | 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', 2361 | 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', 2362 | 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', 2363 | 'transparent', 'underline', 'white', 'yellow']) 2364 | 2365 | valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + 2366 | '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') 2367 | 2368 | mathml_elements = set(['annotation', 'annotation-xml', 'maction', 'math', 2369 | 'merror', 'mfenced', 'mfrac', 
'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 2370 | 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 2371 | 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 2372 | 'munderover', 'none', 'semantics']) 2373 | 2374 | mathml_attributes = set(['actiontype', 'align', 'columnalign', 'columnalign', 2375 | 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', 2376 | 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', 2377 | 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 2378 | 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 2379 | 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', 2380 | 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', 2381 | 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', 2382 | 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']) 2383 | 2384 | # svgtiny - foreignObject + linearGradient + radialGradient + stop 2385 | svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion', 2386 | 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', 2387 | 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 2388 | 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', 2389 | 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 2390 | 'svg', 'switch', 'text', 'title', 'tspan', 'use']) 2391 | 2392 | # svgtiny + class + opacity + offset + xmlns + xmlns:xlink 2393 | svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic', 2394 | 'arabic-form', 'ascent', 'attributeName', 'attributeType', 2395 | 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', 2396 | 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', 2397 | 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', 2398 | 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', 2399 | 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 2400 | 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 2401 | 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 2402 | 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid', 2403 | 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max', 2404 | 'min', 'name', 'offset', 'opacity', 'orient', 'origin', 2405 | 'overline-position', 'overline-thickness', 'panose-1', 'path', 2406 | 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY', 2407 | 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', 2408 | 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 2409 | 'stop-color', 'stop-opacity', 'strikethrough-position', 2410 | 'strikethrough-thickness', 'stroke', 'stroke-dasharray', 2411 | 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', 2412 | 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage', 2413 | 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', 2414 | 'underline-position', 'underline-thickness', 'unicode', 'unicode-range', 2415 | 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width', 2416 | 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', 2417 | 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', 2418 | 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 2419 | 'y2', 'zoomAndPan']) 2420 | 2421 | svg_attr_map = None 2422 | 
svg_elem_map = None 2423 | 2424 | acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule', 2425 | 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', 2426 | 'stroke-opacity']) 2427 | 2428 | def reset(self): 2429 | _BaseHTMLProcessor.reset(self) 2430 | self.unacceptablestack = 0 2431 | self.mathmlOK = 0 2432 | self.svgOK = 0 2433 | 2434 | def unknown_starttag(self, tag, attrs): 2435 | acceptable_attributes = self.acceptable_attributes 2436 | keymap = {} 2437 | if not tag in self.acceptable_elements or self.svgOK: 2438 | if tag in self.unacceptable_elements_with_end_tag: 2439 | self.unacceptablestack += 1 2440 | 2441 | # add implicit namespaces to html5 inline svg/mathml 2442 | if self._type.endswith('html'): 2443 | if not dict(attrs).get('xmlns'): 2444 | if tag=='svg': 2445 | attrs.append( ('xmlns','http://www.w3.org/2000/svg') ) 2446 | if tag=='math': 2447 | attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') ) 2448 | 2449 | # not otherwise acceptable, perhaps it is MathML or SVG? 2450 | if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs: 2451 | self.mathmlOK += 1 2452 | if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs: 2453 | self.svgOK += 1 2454 | 2455 | # chose acceptable attributes based on tag class, else bail 2456 | if self.mathmlOK and tag in self.mathml_elements: 2457 | acceptable_attributes = self.mathml_attributes 2458 | elif self.svgOK and tag in self.svg_elements: 2459 | # for most vocabularies, lowercasing is a good idea. Many 2460 | # svg elements, however, are camel case 2461 | if not self.svg_attr_map: 2462 | lower=[attr.lower() for attr in self.svg_attributes] 2463 | mix=[a for a in self.svg_attributes if a not in lower] 2464 | self.svg_attributes = lower 2465 | self.svg_attr_map = dict([(a.lower(),a) for a in mix]) 2466 | 2467 | lower=[attr.lower() for attr in self.svg_elements] 2468 | mix=[a for a in self.svg_elements if a not in lower] 2469 | self.svg_elements = lower 2470 | self.svg_elem_map = dict([(a.lower(),a) for a in mix]) 2471 | acceptable_attributes = self.svg_attributes 2472 | tag = self.svg_elem_map.get(tag,tag) 2473 | keymap = self.svg_attr_map 2474 | elif not tag in self.acceptable_elements: 2475 | return 2476 | 2477 | # declare xlink namespace, if needed 2478 | if self.mathmlOK or self.svgOK: 2479 | if filter(lambda (n,v): n.startswith('xlink:'),attrs): 2480 | if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs: 2481 | attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink')) 2482 | 2483 | clean_attrs = [] 2484 | for key, value in self.normalize_attrs(attrs): 2485 | if key in acceptable_attributes: 2486 | key=keymap.get(key,key) 2487 | # make sure the uri uses an acceptable uri scheme 2488 | if key == u'href': 2489 | value = _makeSafeAbsoluteURI(value) 2490 | clean_attrs.append((key,value)) 2491 | elif key=='style': 2492 | clean_value = self.sanitize_style(value) 2493 | if clean_value: 2494 | clean_attrs.append((key,clean_value)) 2495 | _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) 2496 | 2497 | def unknown_endtag(self, tag): 2498 | if not tag in self.acceptable_elements: 2499 | if tag in self.unacceptable_elements_with_end_tag: 2500 | self.unacceptablestack -= 1 2501 | if self.mathmlOK and tag in self.mathml_elements: 2502 | if tag == 'math' and self.mathmlOK: 2503 | self.mathmlOK -= 1 2504 | elif self.svgOK and tag in self.svg_elements: 2505 | tag = self.svg_elem_map.get(tag,tag) 2506 | if tag == 'svg' and self.svgOK: 2507 | self.svgOK -= 1 2508 | else: 
2509 |             return
2510 |         _BaseHTMLProcessor.unknown_endtag(self, tag)
2511 | 
2512 |     def handle_pi(self, text):
2513 |         pass
2514 | 
2515 |     def handle_decl(self, text):
2516 |         pass
2517 | 
2518 |     def handle_data(self, text):
2519 |         if not self.unacceptablestack:
2520 |             _BaseHTMLProcessor.handle_data(self, text)
2521 | 
2522 |     def sanitize_style(self, style):
2523 |         # disallow urls
2524 |         style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
2525 | 
2526 |         # gauntlet
2527 |         if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
2528 |             return ''
2529 |         # This replaced a regexp that used re.match and was prone to pathological back-tracking.
2530 |         if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
2531 |             return ''
2532 | 
2533 |         clean = []
2534 |         for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
2535 |             if not value:
2536 |                 continue
2537 |             if prop.lower() in self.acceptable_css_properties:
2538 |                 clean.append(prop + ': ' + value + ';')
2539 |             elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
2540 |                 for keyword in value.split():
2541 |                     if not keyword in self.acceptable_css_keywords and \
2542 |                         not self.valid_css_values.match(keyword):
2543 |                         break
2544 |                 else:
2545 |                     clean.append(prop + ': ' + value + ';')
2546 |             elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
2547 |                 clean.append(prop + ': ' + value + ';')
2548 | 
2549 |         return ' '.join(clean)
2550 | 
2551 |     def parse_comment(self, i, report=1):
2552 |         ret = _BaseHTMLProcessor.parse_comment(self, i, report)
2553 |         if ret >= 0:
2554 |             return ret
2555 |         # if ret == -1, this may be a malicious attempt to circumvent
2556 |         # sanitization, or a page-destroying unclosed comment
2557 |         match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
2558 |         if match:
2559 |             return match.end()
2560 |         # unclosed comment; deliberately fail to handle_data()
2561 |         return len(self.rawdata)
2562 | 
2563 | 
2564 | def _sanitizeHTML(htmlSource, encoding, _type):
2565 |     if not _SGML_AVAILABLE:
2566 |         return htmlSource
2567 |     p = _HTMLSanitizer(encoding, _type)
2568 |     htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
2569 |     p.feed(htmlSource)
2570 |     data = p.output()
2571 |     data = data.strip().replace('\r\n', '\n')
2572 |     return data
2617 | def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
2618 |     """URL, filename, or string --> stream
2619 | 
2620 |     This function lets you define parsers that take any input source
2621 |     (URL, pathname to local or network file, or actual data as a string)
2622 |     and deal with it in a uniform manner. Returned object is guaranteed
2623 |     to have all the basic stdio read methods (read, readline, readlines).
2624 |     Just .close() the object when you're done with it.
2625 | 
2626 |     If the etag argument is supplied, it will be used as the value of an
2627 |     If-None-Match request header.
2628 | 
2629 |     If the modified argument is supplied, it can be a tuple of 9 integers
2630 |     (as returned by gmtime() in the standard Python time module) or a date
2631 |     string in any format supported by feedparser. Regardless, it MUST
2632 |     be in GMT (Greenwich Mean Time). It will be reformatted into an
2633 |     RFC 1123-compliant date and used as the value of an If-Modified-Since
2634 |     request header.
2635 | 
2636 |     If the agent argument is supplied, it will be used as the value of a
2637 |     User-Agent request header.
2638 | 
2639 |     If the referrer argument is supplied, it will be used as the value of a
2640 |     Referer[sic] request header.
2641 | 
2642 |     If handlers is supplied, it is a list of handlers used to build a
2643 |     urllib2 opener.
2644 | 
2645 |     If request_headers is supplied it is a dictionary of HTTP request headers
2646 |     that will override the values generated by FeedParser.
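
    For example, parse() forwards these arguments here, so a (hypothetical)
    call exercising etag and request_headers would look like:

        import feedparser
        d = feedparser.parse('http://example.org/feed.xml',
                             etag='"abc123"',
                             request_headers={'Accept-Language': 'en'})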
2647 | """ 2648 | 2649 | if hasattr(url_file_stream_or_string, 'read'): 2650 | return url_file_stream_or_string 2651 | 2652 | if isinstance(url_file_stream_or_string, basestring) \ 2653 | and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): 2654 | # Deal with the feed URI scheme 2655 | if url_file_stream_or_string.startswith('feed:http'): 2656 | url_file_stream_or_string = url_file_stream_or_string[5:] 2657 | elif url_file_stream_or_string.startswith('feed:'): 2658 | url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:] 2659 | if not agent: 2660 | agent = USER_AGENT 2661 | # Test for inline user:password credentials for HTTP basic auth 2662 | auth = None 2663 | if base64 and not url_file_stream_or_string.startswith('ftp:'): 2664 | urltype, rest = urllib.splittype(url_file_stream_or_string) 2665 | realhost, rest = urllib.splithost(rest) 2666 | if realhost: 2667 | user_passwd, realhost = urllib.splituser(realhost) 2668 | if user_passwd: 2669 | url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) 2670 | auth = base64.standard_b64encode(user_passwd).strip() 2671 | 2672 | # iri support 2673 | if isinstance(url_file_stream_or_string, unicode): 2674 | url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string) 2675 | 2676 | # try to open with urllib2 (to use optional headers) 2677 | request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers) 2678 | opener = urllib2.build_opener(*tuple(handlers + [_FeedURLHandler()])) 2679 | opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent 2680 | try: 2681 | return opener.open(request) 2682 | finally: 2683 | opener.close() # JohnD 2684 | 2685 | # try to open with native open function (if url_file_stream_or_string is a filename) 2686 | try: 2687 | return open(url_file_stream_or_string, 'rb') 2688 | except (IOError, UnicodeEncodeError, TypeError): 2689 | # if url_file_stream_or_string is a unicode object that 2690 | # cannot be converted to the encoding returned by 2691 | # sys.getfilesystemencoding(), a UnicodeEncodeError 2692 | # will be thrown 2693 | # If url_file_stream_or_string is a string that contains NULL 2694 | # (such as an XML document encoded in UTF-32), TypeError will 2695 | # be thrown. 
2696 | pass 2697 | 2698 | # treat url_file_stream_or_string as string 2699 | if isinstance(url_file_stream_or_string, unicode): 2700 | return _StringIO(url_file_stream_or_string.encode('utf-8')) 2701 | return _StringIO(url_file_stream_or_string) 2702 | 2703 | def _convert_to_idn(url): 2704 | """Convert a URL to IDN notation""" 2705 | # this function should only be called with a unicode string 2706 | # strategy: if the host cannot be encoded in ascii, then 2707 | # it'll be necessary to encode it in idn form 2708 | parts = list(urlparse.urlsplit(url)) 2709 | try: 2710 | parts[1].encode('ascii') 2711 | except UnicodeEncodeError: 2712 | # the url needs to be converted to idn notation 2713 | host = parts[1].rsplit(':', 1) 2714 | newhost = [] 2715 | port = u'' 2716 | if len(host) == 2: 2717 | port = host.pop() 2718 | for h in host[0].split('.'): 2719 | newhost.append(h.encode('idna').decode('utf-8')) 2720 | parts[1] = '.'.join(newhost) 2721 | if port: 2722 | parts[1] += ':' + port 2723 | return urlparse.urlunsplit(parts) 2724 | else: 2725 | return url 2726 | 2727 | def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers): 2728 | request = urllib2.Request(url) 2729 | request.add_header('User-Agent', agent) 2730 | if etag: 2731 | request.add_header('If-None-Match', etag) 2732 | if isinstance(modified, basestring): 2733 | modified = _parse_date(modified) 2734 | elif isinstance(modified, datetime.datetime): 2735 | modified = modified.utctimetuple() 2736 | if modified: 2737 | # format into an RFC 1123-compliant timestamp. We can't use 2738 | # time.strftime() since the %a and %b directives can be affected 2739 | # by the current locale, but RFC 2616 states that dates must be 2740 | # in English. 2741 | short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] 2742 | months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] 2743 | request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) 2744 | if referrer: 2745 | request.add_header('Referer', referrer) 2746 | if gzip and zlib: 2747 | request.add_header('Accept-encoding', 'gzip, deflate') 2748 | elif gzip: 2749 | request.add_header('Accept-encoding', 'gzip') 2750 | elif zlib: 2751 | request.add_header('Accept-encoding', 'deflate') 2752 | else: 2753 | request.add_header('Accept-encoding', '') 2754 | if auth: 2755 | request.add_header('Authorization', 'Basic %s' % auth) 2756 | if ACCEPT_HEADER: 2757 | request.add_header('Accept', ACCEPT_HEADER) 2758 | # use this for whatever -- cookies, special headers, etc 2759 | # [('Cookie','Something'),('x-special-header','Another Value')] 2760 | for header_name, header_value in request_headers.items(): 2761 | request.add_header(header_name, header_value) 2762 | request.add_header('A-IM', 'feed') # RFC 3229 support 2763 | return request 2764 | 2765 | def _parse_psc_chapter_start(start): 2766 | FORMAT = r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$' 2767 | 2768 | m = re.compile(FORMAT).match(start) 2769 | if m is None: 2770 | return None 2771 | 2772 | _, h, m, s, _, ms = m.groups() 2773 | h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0)) 2774 | return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000) 2775 | 2776 | _date_handlers = [] 2777 | def registerDateHandler(func): 2778 | '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' 2779 | _date_handlers.insert(0, 
func)
2780 | 
2781 | # ISO-8601 date parsing routines written by Fazal Majid.
2782 | # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2783 | # parser is beyond the scope of feedparser and would be a worthwhile addition
2784 | # to the Python library.
2785 | # A single regular expression cannot parse ISO 8601 date formats into groups
2786 | # as the standard is highly irregular (for instance is 030104 2003-01-04 or
2787 | # 0301-04-01), so we use templates instead.
2788 | # Please note the order in templates is significant because we need a
2789 | # greedy match.
2790 | _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
2791 |                  'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
2792 |                  '-YY-?MM', '-OOO', '-YY',
2793 |                  '--MM-?DD', '--MM',
2794 |                  '---DD',
2795 |                  'CC', '']
2796 | _iso8601_re = [
2797 |     tmpl.replace(
2798 |     'YYYY', r'(?P<year>\d{4})').replace(
2799 |     'YY', r'(?P<year>\d\d)').replace(
2800 |     'MM', r'(?P<month>[01]\d)').replace(
2801 |     'DD', r'(?P<day>[0123]\d)').replace(
2802 |     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
2803 |     'CC', r'(?P<century>\d\d$)')
2804 |     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
2805 |     + r'(:(?P<second>\d{2}))?'
2806 |     + r'(\.(?P<fracsecond>\d+))?'
2807 |     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
2808 |     for tmpl in _iso8601_tmpl]
2809 | try:
2810 |     del tmpl
2811 | except NameError:
2812 |     pass
2813 | _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
2814 | try:
2815 |     del regex
2816 | except NameError:
2817 |     pass
2818 | 
2819 | def _parse_date_iso8601(dateString):
2820 |     '''Parse a variety of ISO-8601-compatible formats like 20040105'''
2821 |     m = None
2822 |     for _iso8601_match in _iso8601_matches:
2823 |         m = _iso8601_match(dateString)
2824 |         if m:
2825 |             break
2826 |     if not m:
2827 |         return
2828 |     if m.span() == (0, 0):
2829 |         return
2830 |     params = m.groupdict()
2831 |     ordinal = params.get('ordinal', 0)
2832 |     if ordinal:
2833 |         ordinal = int(ordinal)
2834 |     else:
2835 |         ordinal = 0
2836 |     year = params.get('year', '--')
2837 |     if not year or year == '--':
2838 |         year = time.gmtime()[0]
2839 |     elif len(year) == 2:
2840 |         # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
2841 |         year = 100 * int(time.gmtime()[0] / 100) + int(year)
2842 |     else:
2843 |         year = int(year)
2844 |     month = params.get('month', '-')
2845 |     if not month or month == '-':
2846 |         # ordinals are NOT normalized by mktime, we simulate them
2847 |         # by setting month=1, day=ordinal
2848 |         if ordinal:
2849 |             month = 1
2850 |         else:
2851 |             month = time.gmtime()[1]
2852 |     month = int(month)
2853 |     day = params.get('day', 0)
2854 |     if not day:
2855 |         # see above
2856 |         if ordinal:
2857 |             day = ordinal
2858 |         elif params.get('century', 0) or \
2859 |             params.get('year', 0) or params.get('month', 0):
2860 |             day = 1
2861 |         else:
2862 |             day = time.gmtime()[2]
2863 |     else:
2864 |         day = int(day)
2865 |     # special case of the century - is the first year of the 21st century
2866 |     # 2000 or 2001 ? The debate goes on...
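    # (the code below answers 2001: e.g. CC='21' yields (21-1)*100+1 = 2001)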
2867 | if 'century' in params: 2868 | year = (int(params['century']) - 1) * 100 + 1 2869 | # in ISO 8601 most fields are optional 2870 | for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: 2871 | if not params.get(field, None): 2872 | params[field] = 0 2873 | hour = int(params.get('hour', 0)) 2874 | minute = int(params.get('minute', 0)) 2875 | second = int(float(params.get('second', 0))) 2876 | # weekday is normalized by mktime(), we can ignore it 2877 | weekday = 0 2878 | daylight_savings_flag = -1 2879 | tm = [year, month, day, hour, minute, second, weekday, 2880 | ordinal, daylight_savings_flag] 2881 | # ISO 8601 time zone adjustments 2882 | tz = params.get('tz') 2883 | if tz and tz != 'Z': 2884 | if tz[0] == '-': 2885 | tm[3] += int(params.get('tzhour', 0)) 2886 | tm[4] += int(params.get('tzmin', 0)) 2887 | elif tz[0] == '+': 2888 | tm[3] -= int(params.get('tzhour', 0)) 2889 | tm[4] -= int(params.get('tzmin', 0)) 2890 | else: 2891 | return None 2892 | # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) 2893 | # which is guaranteed to normalize d/m/y/h/m/s. 2894 | # Many implementations have bugs, but we'll pretend they don't. 2895 | return time.localtime(time.mktime(tuple(tm))) 2896 | registerDateHandler(_parse_date_iso8601) 2897 | 2898 | # 8-bit date handling routines written by ytrewq1. 2899 | _korean_year = u'\ub144' # b3e2 in euc-kr 2900 | _korean_month = u'\uc6d4' # bff9 in euc-kr 2901 | _korean_day = u'\uc77c' # c0cf in euc-kr 2902 | _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr 2903 | _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr 2904 | 2905 | _korean_onblog_date_re = \ 2906 | re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ 2907 | (_korean_year, _korean_month, _korean_day)) 2908 | _korean_nate_date_re = \ 2909 | re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ 2910 | (_korean_am, _korean_pm)) 2911 | def _parse_date_onblog(dateString): 2912 | '''Parse a string according to the OnBlog 8-bit date format''' 2913 | m = _korean_onblog_date_re.match(dateString) 2914 | if not m: 2915 | return 2916 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ 2917 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ 2918 | 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ 2919 | 'zonediff': '+09:00'} 2920 | return _parse_date_w3dtf(w3dtfdate) 2921 | registerDateHandler(_parse_date_onblog) 2922 | 2923 | def _parse_date_nate(dateString): 2924 | '''Parse a string according to the Nate 8-bit date format''' 2925 | m = _korean_nate_date_re.match(dateString) 2926 | if not m: 2927 | return 2928 | hour = int(m.group(5)) 2929 | ampm = m.group(4) 2930 | if (ampm == _korean_pm): 2931 | hour += 12 2932 | hour = str(hour) 2933 | if len(hour) == 1: 2934 | hour = '0' + hour 2935 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ 2936 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ 2937 | 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ 2938 | 'zonediff': '+09:00'} 2939 | return _parse_date_w3dtf(w3dtfdate) 2940 | registerDateHandler(_parse_date_nate) 2941 | 2942 | # Unicode strings for Greek date strings 2943 | _greek_months = \ 2944 | { \ 2945 | u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 2946 | u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 2947 | u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 2948 | u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 2949 | 
u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 2950 | u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 2951 | u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 2952 | u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 2953 | u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 2954 | u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 2955 | u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 2956 | u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 2957 | u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 2958 | u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 2959 | u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 2960 | u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 2961 | u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 2962 | u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 2963 | u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 2964 | } 2965 | 2966 | _greek_wdays = \ 2967 | { \ 2968 | u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 2969 | u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 2970 | u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 2971 | u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 2972 | u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 2973 | u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 2974 | u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 2975 | } 2976 | 2977 | _greek_date_format_re = \ 2978 | re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') 2979 | 2980 | def _parse_date_greek(dateString): 2981 | '''Parse a string according to a Greek 8-bit date format.''' 2982 | m = _greek_date_format_re.match(dateString) 2983 | if not m: 2984 | return 2985 | wday = _greek_wdays[m.group(1)] 2986 | month = _greek_months[m.group(3)] 2987 | rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ 2988 | {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ 2989 | 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ 2990 | 'zonediff': m.group(8)} 2991 | return _parse_date_rfc822(rfc822date) 2992 | registerDateHandler(_parse_date_greek) 2993 | 2994 | # Unicode strings for Hungarian date strings 2995 | _hungarian_months = \ 2996 | { \ 2997 | u'janu\u00e1r': u'01', # e1 in iso-8859-2 2998 | u'febru\u00e1ri': u'02', # e1 in iso-8859-2 2999 | u'm\u00e1rcius': u'03', # e1 in iso-8859-2 3000 | u'\u00e1prilis': u'04', # e1 in iso-8859-2 3001 | u'm\u00e1ujus': u'05', # e1 in iso-8859-2 3002 | u'j\u00fanius': u'06', # fa in iso-8859-2 3003 | u'j\u00falius': u'07', # fa in iso-8859-2 3004 | u'augusztus': u'08', 3005 | u'szeptember': u'09', 3006 | u'okt\u00f3ber': u'10', # f3 in iso-8859-2 3007 | u'november': u'11', 3008 | u'december': u'12', 3009 | } 3010 | 3011 | _hungarian_date_format_re = \ 3012 | re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') 3013 | 3014 | def _parse_date_hungarian(dateString): 3015 | '''Parse a string according to a Hungarian 8-bit date format.''' 3016 | m = _hungarian_date_format_re.match(dateString) 3017 | if not m or m.group(2) not in _hungarian_months: 3018 | return None 3019 | month = _hungarian_months[m.group(2)] 3020 | day = m.group(3) 3021 | if len(day) == 1: 3022 | day = '0' + day 3023 | hour = m.group(4) 3024 | if len(hour) == 1: 3025 | hour = '0' + hour 3026 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ 3027 | {'year': m.group(1), 'month': month, 'day': day,\ 3028 | 
'hour': hour, 'minute': m.group(5),\ 3029 | 'zonediff': m.group(6)} 3030 | return _parse_date_w3dtf(w3dtfdate) 3031 | registerDateHandler(_parse_date_hungarian) 3032 | 3033 | timezonenames = { 3034 | 'ut': 0, 'gmt': 0, 'z': 0, 3035 | 'adt': -3, 'ast': -4, 'at': -4, 3036 | 'edt': -4, 'est': -5, 'et': -5, 3037 | 'cdt': -5, 'cst': -6, 'ct': -6, 3038 | 'mdt': -6, 'mst': -7, 'mt': -7, 3039 | 'pdt': -7, 'pst': -8, 'pt': -8, 3040 | 'a': -1, 'n': 1, 3041 | 'm': -12, 'y': 12, 3042 | } 3043 | # W3 date and time format parser 3044 | # http://www.w3.org/TR/NOTE-datetime 3045 | # Also supports MSSQL-style datetimes as defined at: 3046 | # http://msdn.microsoft.com/en-us/library/ms186724.aspx 3047 | # (basically, allow a space as a date/time/timezone separator) 3048 | def _parse_date_w3dtf(datestr): 3049 | if not datestr.strip(): 3050 | return None 3051 | parts = datestr.lower().split('t') 3052 | if len(parts) == 1: 3053 | # This may be a date only, or may be an MSSQL-style date 3054 | parts = parts[0].split() 3055 | if len(parts) == 1: 3056 | # Treat this as a date only 3057 | parts.append('00:00:00z') 3058 | elif len(parts) > 2: 3059 | return None 3060 | date = parts[0].split('-', 2) 3061 | if not date or len(date[0]) != 4: 3062 | return None 3063 | # Ensure that `date` has 3 elements. Using '1' sets the default 3064 | # month to January and the default day to the 1st of the month. 3065 | date.extend(['1'] * (3 - len(date))) 3066 | try: 3067 | year, month, day = [int(i) for i in date] 3068 | except ValueError: 3069 | # `date` may have more than 3 elements or may contain 3070 | # non-integer strings. 3071 | return None 3072 | if parts[1].endswith('z'): 3073 | parts[1] = parts[1][:-1] 3074 | parts.append('z') 3075 | # Append the numeric timezone offset, if any, to parts. 3076 | # If this is an MSSQL-style date then parts[2] already contains 3077 | # the timezone information, so `append()` will not affect it. 3078 | # Add 1 to each value so that if `find()` returns -1 it will be 3079 | # treated as False. 3080 | loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1 3081 | loc = loc - 1 3082 | parts.append(parts[1][loc:]) 3083 | parts[1] = parts[1][:loc] 3084 | time = parts[1].split(':', 2) 3085 | # Ensure that time has 3 elements. Using '0' means that the 3086 | # minutes and seconds, if missing, will default to 0. 
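    # (e.g. an hour-only time ['10'] becomes ['10', '0', '0'], i.e. 10:00:00.)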
3087 | time.extend(['0'] * (3 - len(time))) 3088 | tzhour = 0 3089 | tzmin = 0 3090 | if parts[2][:1] in ('-', '+'): 3091 | try: 3092 | tzhour = int(parts[2][1:3]) 3093 | tzmin = int(parts[2][4:]) 3094 | except ValueError: 3095 | return None 3096 | if parts[2].startswith('-'): 3097 | tzhour = tzhour * -1 3098 | tzmin = tzmin * -1 3099 | else: 3100 | tzhour = timezonenames.get(parts[2], 0) 3101 | try: 3102 | hour, minute, second = [int(float(i)) for i in time] 3103 | except ValueError: 3104 | return None 3105 | # Create the datetime object and timezone delta objects 3106 | try: 3107 | stamp = datetime.datetime(year, month, day, hour, minute, second) 3108 | except ValueError: 3109 | return None 3110 | delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour) 3111 | # Return the date and timestamp in a UTC 9-tuple 3112 | try: 3113 | return (stamp - delta).utctimetuple() 3114 | except (OverflowError, ValueError): 3115 | # IronPython throws ValueErrors instead of OverflowErrors 3116 | return None 3117 | 3118 | registerDateHandler(_parse_date_w3dtf) 3119 | 3120 | def _parse_date_rfc822(date): 3121 | """Parse RFC 822 dates and times 3122 | http://tools.ietf.org/html/rfc822#section-5 3123 | 3124 | There are some formatting differences that are accounted for: 3125 | 1. Years may be two or four digits. 3126 | 2. The month and day can be swapped. 3127 | 3. Additional timezone names are supported. 3128 | 4. A default time and timezone are assumed if only a date is present. 3129 | """ 3130 | daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']) 3131 | months = { 3132 | 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 3133 | 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12, 3134 | } 3135 | 3136 | parts = date.lower().split() 3137 | if len(parts) < 5: 3138 | # Assume that the time and timezone are missing 3139 | parts.extend(('00:00:00', '0000')) 3140 | # Remove the day name 3141 | if parts[0][:3] in daynames: 3142 | parts = parts[1:] 3143 | if len(parts) < 5: 3144 | # If there are still fewer than five parts, there's not enough 3145 | # information to interpret this 3146 | return None 3147 | try: 3148 | day = int(parts[0]) 3149 | except ValueError: 3150 | # Check if the day and month are swapped 3151 | if months.get(parts[0][:3]): 3152 | try: 3153 | day = int(parts[1]) 3154 | except ValueError: 3155 | return None 3156 | else: 3157 | parts[1] = parts[0] 3158 | else: 3159 | return None 3160 | month = months.get(parts[1][:3]) 3161 | if not month: 3162 | return None 3163 | try: 3164 | year = int(parts[2]) 3165 | except ValueError: 3166 | return None 3167 | # Normalize two-digit years: 3168 | # Anything in the 90's is interpreted as 1990 and on 3169 | # Anything 89 or less is interpreted as 2089 or before 3170 | if len(parts[2]) <= 2: 3171 | year += (1900, 2000)[year < 90] 3172 | timeparts = parts[3].split(':') 3173 | timeparts = timeparts + ([0] * (3 - len(timeparts))) 3174 | try: 3175 | (hour, minute, second) = map(int, timeparts) 3176 | except ValueError: 3177 | return None 3178 | tzhour = 0 3179 | tzmin = 0 3180 | # Strip 'Etc/' from the timezone 3181 | if parts[4].startswith('etc/'): 3182 | parts[4] = parts[4][4:] 3183 | # Normalize timezones that start with 'gmt': 3184 | # GMT-05:00 => -0500 3185 | # GMT => GMT 3186 | if parts[4].startswith('gmt'): 3187 | parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt' 3188 | # Handle timezones like '-0500', '+0500', and 'EST' 3189 | if parts[4] and parts[4][0] in ('-', '+'): 3190 | try: 3191 | tzhour = int(parts[4][1:3]) 3192 | 
            tzmin = int(parts[4][3:])
3193 |         except ValueError:
3194 |             return None
3195 |         if parts[4].startswith('-'):
3196 |             tzhour = tzhour * -1
3197 |             tzmin = tzmin * -1
3198 |     else:
3199 |         tzhour = timezonenames.get(parts[4], 0)
3200 |     # Create the datetime object and timezone delta objects
3201 |     try:
3202 |         stamp = datetime.datetime(year, month, day, hour, minute, second)
3203 |     except ValueError:
3204 |         return None
3205 |     delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
3206 |     # Return the date and timestamp in a UTC 9-tuple
3207 |     try:
3208 |         return (stamp - delta).utctimetuple()
3209 |     except (OverflowError, ValueError):
3210 |         # IronPython throws ValueErrors instead of OverflowErrors
3211 |         return None
3212 | registerDateHandler(_parse_date_rfc822)
3213 | 
3214 | _months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
3215 |            'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
3216 | def _parse_date_asctime(dt):
3217 |     """Parse asctime-style dates"""
3218 |     dayname, month, day, remainder = dt.split(None, 3)
3219 |     # Convert month and day into zero-padded integers
3220 |     month = '%02i ' % (_months.index(month.lower()) + 1)
3221 |     day = '%02i ' % (int(day),)
3222 |     dt = month + day + remainder
3223 |     return time.strptime(dt, '%m %d %H:%M:%S %Y')[:-1] + (0, )
3224 | registerDateHandler(_parse_date_asctime)
3225 | 
3226 | def _parse_date_perforce(aDateString):
3227 |     """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
3228 |     # Fri, 2006/09/15 08:19:53 EDT
3229 |     _my_date_pattern = re.compile( \
3230 |         r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
3231 | 
3232 |     m = _my_date_pattern.search(aDateString)
3233 |     if m is None:
3234 |         return None
3235 |     dow, year, month, day, hour, minute, second, tz = m.groups()
3236 |     months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
3237 |     dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
3238 |     tm = rfc822.parsedate_tz(dateString)
3239 |     if tm:
3240 |         return time.gmtime(rfc822.mktime_tz(tm))
3241 | registerDateHandler(_parse_date_perforce)
3242 | 
3243 | def _parse_date(dateString):
3244 |     '''Parses a variety of date formats into a 9-tuple in GMT'''
3245 |     if not dateString:
3246 |         return None
3247 |     for handler in _date_handlers:
3248 |         try:
3249 |             date9tuple = handler(dateString)
3250 |         except (KeyError, OverflowError, ValueError):
3251 |             continue
3252 |         if not date9tuple:
3253 |             continue
3254 |         if len(date9tuple) != 9:
3255 |             continue
3256 |         return date9tuple
3257 |     return None
3258 | 
3259 | # Each marker represents some of the characters of the opening XML
3260 | # processing instruction ('<?xm') in the specified encoding.
3261 | EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
3262 | UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
3263 | UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
3264 | UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
3265 | UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
3266 | 
3267 | ZERO_BYTES = _l2bytes([0x00, 0x00])
3268 | 
3269 | # Match the opening XML declaration.
3270 | # Example: <?xml version="1.0" encoding="utf-8"?>
3271 | RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
3272 | 
3273 | # Capture the value of the XML processing instruction's encoding attribute.
3274 | # Example: <?xml version="1.0" encoding="utf-8"?>
3275 | RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
3276 | 
3277 | def convert_to_utf8(http_headers, data):
3278 |     '''Detect and convert the character encoding to UTF-8.
3279 | 
3280 |     http_headers is a dictionary
3281 |     data is a raw string (not Unicode)'''
3282 | 
3283 |     # This is so much trickier than it sounds, it's not even funny.
3284 | # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type 3285 | # is application/xml, application/*+xml, 3286 | # application/xml-external-parsed-entity, or application/xml-dtd, 3287 | # the encoding given in the charset parameter of the HTTP Content-Type 3288 | # takes precedence over the encoding given in the XML prefix within the 3289 | # document, and defaults to 'utf-8' if neither are specified. But, if 3290 | # the HTTP Content-Type is text/xml, text/*+xml, or 3291 | # text/xml-external-parsed-entity, the encoding given in the XML prefix 3292 | # within the document is ALWAYS IGNORED and only the encoding given in 3293 | # the charset parameter of the HTTP Content-Type header should be 3294 | # respected, and it defaults to 'us-ascii' if not specified. 3295 | 3296 | # Furthermore, discussion on the atom-syntax mailing list with the 3297 | # author of RFC 3023 leads me to the conclusion that any document 3298 | # served with a Content-Type of text/* and no charset parameter 3299 | # must be treated as us-ascii. (We now do this.) And also that it 3300 | # must always be flagged as non-well-formed. (We now do this too.) 3301 | 3302 | # If Content-Type is unspecified (input was local file or non-HTTP source) 3303 | # or unrecognized (server just got it totally wrong), then go by the 3304 | # encoding given in the XML prefix of the document and default to 3305 | # 'iso-8859-1' as per the HTTP specification (RFC 2616). 3306 | 3307 | # Then, assuming we didn't find a character encoding in the HTTP headers 3308 | # (and the HTTP Content-type allowed us to look in the body), we need 3309 | # to sniff the first few bytes of the XML data and try to determine 3310 | # whether the encoding is ASCII-compatible. Section F of the XML 3311 | # specification shows the way here: 3312 | # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info 3313 | 3314 | # If the sniffed encoding is not ASCII-compatible, we need to make it 3315 | # ASCII compatible so that we can sniff further into the XML declaration 3316 | # to find the encoding attribute, which will tell us the true encoding. 3317 | 3318 | # Of course, none of this guarantees that we will be able to parse the 3319 | # feed in the declared character encoding (assuming it was declared 3320 | # correctly, which many are not). iconv_codec can help a lot; 3321 | # you should definitely install it if you can. 3322 | # http://cjkpython.i18n.org/ 3323 | 3324 | bom_encoding = u'' 3325 | xml_encoding = u'' 3326 | rfc3023_encoding = u'' 3327 | 3328 | # Look at the first few bytes of the document to guess what 3329 | # its encoding may be. We only need to decode enough of the 3330 | # document that we can use an ASCII-compatible regular 3331 | # expression to search for an XML encoding declaration. 3332 | # The heuristic follows the XML specification, section F: 3333 | # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info 3334 | # Check for BOMs first. 
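    # (e.g. data starting with '\xef\xbb\xbf' carries a UTF-8 BOM: the three
    # BOM bytes are stripped below and bom_encoding is set to u'utf-8'.)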
3335 |     if data[:4] == codecs.BOM_UTF32_BE:
3336 |         bom_encoding = u'utf-32be'
3337 |         data = data[4:]
3338 |     elif data[:4] == codecs.BOM_UTF32_LE:
3339 |         bom_encoding = u'utf-32le'
3340 |         data = data[4:]
3341 |     elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
3342 |         bom_encoding = u'utf-16be'
3343 |         data = data[2:]
3344 |     elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
3345 |         bom_encoding = u'utf-16le'
3346 |         data = data[2:]
3347 |     elif data[:3] == codecs.BOM_UTF8:
3348 |         bom_encoding = u'utf-8'
3349 |         data = data[3:]
3350 |     # Check for the characters '<?xm' in several encodings.
3351 |     elif data[:4] == EBCDIC_MARKER:
3352 |         bom_encoding = u'cp037'
3353 |     elif data[:4] == UTF16BE_MARKER:
3354 |         bom_encoding = u'utf-16be'
3355 |     elif data[:4] == UTF16LE_MARKER:
3356 |         bom_encoding = u'utf-16le'
3357 |     elif data[:4] == UTF32BE_MARKER:
3358 |         bom_encoding = u'utf-32be'
3359 |     elif data[:4] == UTF32LE_MARKER:
3360 |         bom_encoding = u'utf-32le'
3361 | 
3362 |     # Examine the first several bytes of the document for an XML declaration
3363 |     # and the document's encoding.
3364 |     tempdata = data
3365 |     try:
3366 |         if bom_encoding:
3367 |             tempdata = data.decode(bom_encoding).encode('utf-8')
3368 |     except (UnicodeDecodeError, LookupError):
3369 |         # feedparser recognizes UTF-32 encodings that aren't
3370 |         # available in Python 2.1 and 2.2, so it's possible to
3371 |         # encounter a LookupError during decoding.
3372 |         xml_encoding_match = None
3373 |     else:
3374 |         xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
3375 | 
3376 |     if xml_encoding_match:
3377 |         xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
3378 |         # Normalize the xml_encoding if necessary.
3379 |         if bom_encoding and (xml_encoding in (
3380 |             u'u16', u'utf-16', u'utf16', u'utf_16',
3381 |             u'u32', u'utf-32', u'utf32', u'utf_32',
3382 |             u'iso-10646-ucs-2', u'iso-10646-ucs-4',
3383 |             u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
3384 |         )):
3385 |             xml_encoding = bom_encoding
3386 | 
3387 |     # Find the HTTP Content-Type and, hopefully, a character
3388 |     # encoding provided by the server. The Content-Type is used
3389 |     # to choose the "correct" encoding among the BOM encoding,
3390 |     # XML declaration encoding, and HTTP encoding, following the
3391 |     # heuristic defined in RFC 3023.
3392 |     http_content_type = http_headers.get('content-type', '')
3393 |     http_content_type, params = cgi.parse_header(http_content_type)
3394 |     http_encoding = params.get('charset', '').replace("'", "")
3395 |     if not isinstance(http_encoding, unicode):
3396 |         http_encoding = http_encoding.decode('utf-8', 'ignore')
3397 | 
3398 |     acceptable_content_type = 0
3399 |     application_content_types = (u'application/xml', u'application/xml-dtd',
3400 |                                  u'application/xml-external-parsed-entity')
3401 |     text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
3402 |     if (http_content_type in application_content_types) or \
3403 |        (http_content_type.startswith(u'application/') and
3404 |         http_content_type.endswith(u'+xml')):
3405 |         acceptable_content_type = 1
3406 |         rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
3407 |     elif (http_content_type in text_content_types) or \
3408 |          (http_content_type.startswith(u'text/') and
3409 |           http_content_type.endswith(u'+xml')):
3410 |         acceptable_content_type = 1
3411 |         rfc3023_encoding = http_encoding or u'us-ascii'
3412 |     elif http_content_type.startswith(u'text/'):
3413 |         rfc3023_encoding = http_encoding or u'us-ascii'
3414 |     elif http_headers and 'content-type' not in http_headers:
3415 |         rfc3023_encoding = xml_encoding or u'iso-8859-1'
3416 |     else:
3417 |         rfc3023_encoding = xml_encoding or u'utf-8'
3418 | 
3419 |     # gb18030 is a superset of gb2312, so always choose gb18030
3420 |     # for greater compatibility.
3421 |     if rfc3023_encoding.lower() == u'gb2312':
3422 |         rfc3023_encoding = u'gb18030'
3423 |     if xml_encoding.lower() == u'gb2312':
3424 |         xml_encoding = u'gb18030'
3425 | 
3426 |     # there are four encodings to keep track of:
3427 |     # - http_encoding is the encoding declared in the Content-Type HTTP header
3428 |     # - xml_encoding is the encoding declared in the <?xml declaration
3429 |     # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
3430 |     # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
3431 |     error = None
3432 | 
3433 |     if http_headers and (not acceptable_content_type):
3434 |         if 'content-type' in http_headers:
3435 |             msg = '%s is not an XML media type' % http_headers['content-type']
3436 |         else:
3437 |             msg = 'no Content-type specified'
3438 |         error = NonXMLContentType(msg)
3439 | 
3440 |     # determine character encoding
3441 |     known_encoding = 0
3442 |     chardet_encoding = None
3443 |     tried_encodings = []
3444 |     if chardet:
3445 |         chardet_encoding = unicode(chardet.detect(data)['encoding'] or '', 'ascii', 'ignore')
3446 |     # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
3447 |     for proposed_encoding in (rfc3023_encoding, xml_encoding,
3448 |                               bom_encoding, chardet_encoding,
3449 |                               u'utf-8', u'windows-1252', u'iso-8859-2'):
3450 |         if not proposed_encoding:
3451 |             continue
3452 |         if proposed_encoding in tried_encodings:
3453 |             continue
3454 |         tried_encodings.append(proposed_encoding)
3455 |         try:
3456 |             data = data.decode(proposed_encoding)
3457 |         except (UnicodeDecodeError, LookupError):
3458 |             pass
3459 |         else:
3460 |             known_encoding = 1
3461 |             # Update the encoding in the opening XML processing instruction.
3462 |             new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
3463 |             if RE_XML_DECLARATION.search(data):
3464 |                 data = RE_XML_DECLARATION.sub(new_declaration, data)
3465 |             else:
3466 |                 data = new_declaration + u'\n' + data
3467 |             data = data.encode('utf-8')
3468 |             break
3469 |     # if still no luck, give up
3470 |     if not known_encoding:
3471 |         error = CharacterEncodingUnknown(
3472 |             'document encoding unknown, I tried ' +
3473 |             '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
3474 |             (rfc3023_encoding, xml_encoding))
3475 |         rfc3023_encoding = u''
3476 |     elif proposed_encoding != rfc3023_encoding:
3477 |         error = CharacterEncodingOverride(
3478 |             'document declared as %s, but parsed as %s' %
3479 |             (rfc3023_encoding, proposed_encoding))
3480 |         rfc3023_encoding = proposed_encoding
3481 | 
3482 |     return data, rfc3023_encoding, error
3483 | 
3484 | # Match XML entity declarations.
3485 | # Example: <!ENTITY copyright "(C)">
3486 | RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
3487 | 
3488 | # Match XML DOCTYPE declarations.
3489 | # Example: <!DOCTYPE feed [ ]>
3490 | RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
3491 | 
3492 | # Match safe entity declarations.
3493 | # This will allow hexadecimal character references through,
3494 | # as well as text, but not arbitrary nested entities.
3495 | # Example: <!ENTITY cubed "&#179;">
3496 | # Example: <!ENTITY copyright "(C)">
3497 | # Forbidden: <!ENTITY explode1 "&explode2;&explode2;">
3498 | RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
3499 | 
3500 | def replace_doctype(data):
3501 |     '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
3502 | 
3503 |     rss_version may be 'rss091n' or None
3504 |     stripped_data is the same XML document with a replaced DOCTYPE
3505 |     '''
3506 | 
3507 |     # Divide the document into two groups by finding the location
3508 |     # of the first element that doesn't begin with '<?' or '<!'.
3509 |     start = re.search(_s2bytes('<\w'), data)
3510 |     start = start and start.start() or -1
3511 |     head, data = data[:start+1], data[start+1:]
3512 | 
3513 |     # Save and then remove all of the ENTITY declarations.
3514 |     entity_results = RE_ENTITY_PATTERN.findall(head)
3515 |     head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
3516 | 
3517 |     # Find the DOCTYPE declaration and check the feed type.
3518 |     doctype_results = RE_DOCTYPE_PATTERN.findall(head)
3519 |     doctype = doctype_results and doctype_results[0] or _s2bytes('')
3520 |     if _s2bytes('netscape') in doctype.lower():
3521 |         version = u'rss091n'
3522 |     else:
3523 |         version = None
3524 | 
3525 |     # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
3526 |     replacement = _s2bytes('')
3527 |     if len(doctype_results) == 1 and entity_results:
3528 |         match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
3529 |         safe_entities = filter(match_safe_entities, entity_results)
3530 |         if safe_entities:
3531 |             replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
3532 |                         + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
3533 |                         + _s2bytes('>\n]>')
3534 |     data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
3535 | 
3536 |     # Precompute the safe entities for the loose parser.
3537 |     safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
3538 |                       for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
3539 |     return version, data, safe_entities
3540 | 
3541 | 
3542 | # GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates'
3543 | # items, or None in the case of a parsing error.
3544 | 
3545 | def _parse_poslist(value, geom_type, swap=True, dims=2):
3546 |     if geom_type == 'linestring':
3547 |         return _parse_georss_line(value, swap, dims)
3548 |     elif geom_type == 'polygon':
3549 |         ring = _parse_georss_line(value, swap, dims)
3550 |         return {'type': u'Polygon', 'coordinates': (ring['coordinates'],)}
3551 |     else:
3552 |         return None
3553 | 
3554 | def _gen_georss_coords(value, swap=True, dims=2):
3555 |     # A generator of (lon, lat) pairs from a string of encoded GeoRSS
3556 |     # coordinates. Converts to floats and swaps order.
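    # (e.g. '45.256 -71.92' yields (-71.92, 45.256) with the default swap=True,
    # converting GeoRSS "lat lon" order into (lon, lat) pairs.)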
3557 |     latlons = itertools.imap(float, value.strip().replace(',', ' ').split())
3558 |     nxt = latlons.next
3559 |     while True:
3560 |         t = [nxt(), nxt()][::swap and -1 or 1]
3561 |         if dims == 3:
3562 |             t.append(nxt())
3563 |         yield tuple(t)
3564 | 
3565 | def _parse_georss_point(value, swap=True, dims=2):
3566 |     # A point contains a single latitude-longitude pair, separated by
3567 |     # whitespace. We'll also handle comma separators.
3568 |     try:
3569 |         coords = list(_gen_georss_coords(value, swap, dims))
3570 |         return {u'type': u'Point', u'coordinates': coords[0]}
3571 |     except (IndexError, ValueError):
3572 |         return None
3573 | 
3574 | def _parse_georss_line(value, swap=True, dims=2):
3575 |     # A line contains a space separated list of latitude-longitude pairs in
3576 |     # WGS84 coordinate reference system, with each pair separated by
3577 |     # whitespace. There must be at least two pairs.
3578 |     try:
3579 |         coords = list(_gen_georss_coords(value, swap, dims))
3580 |         return {u'type': u'LineString', u'coordinates': coords}
3581 |     except (IndexError, ValueError):
3582 |         return None
3583 | 
3584 | def _parse_georss_polygon(value, swap=True, dims=2):
3585 |     # A polygon contains a space separated list of latitude-longitude pairs,
3586 |     # with each pair separated by whitespace. There must be at least four
3587 |     # pairs, with the last being identical to the first (so a polygon has a
3588 |     # minimum of three actual points).
3589 |     try:
3590 |         ring = list(_gen_georss_coords(value, swap, dims))
3591 |     except (IndexError, ValueError):
3592 |         return None
3593 |     if len(ring) < 4:
3594 |         return None
3595 |     return {u'type': u'Polygon', u'coordinates': (ring,)}
3596 | 
3597 | def _parse_georss_box(value, swap=True, dims=2):
3598 |     # A bounding box is a rectangular region, often used to define the extents
3599 |     # of a map or a rough area of interest. A box contains two space-separated
3600 |     # latitude-longitude pairs, with each pair separated by whitespace. The
3601 |     # first pair is the lower corner, the second is the upper corner.
3602 |     try:
3603 |         coords = list(_gen_georss_coords(value, swap, dims))
3604 |         return {u'type': u'Box', u'coordinates': tuple(coords)}
3605 |     except (IndexError, ValueError):
3606 |         return None
3607 | 
3608 | # end geospatial parsers
3609 | 
3610 | 
3611 | def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
3612 |     '''Parse a feed from a URL, file, stream, or string.
3613 | 
3614 |     request_headers, if given, is a dict from http header name to value to add
3615 |     to the request; this overrides internally generated values.
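
    etag and modified, if supplied, are sent along with the request so the
    server can answer with HTTP 304 Not Modified when the feed is unchanged.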
3616 | ''' 3617 | 3618 | if handlers is None: 3619 | handlers = [] 3620 | if request_headers is None: 3621 | request_headers = {} 3622 | if response_headers is None: 3623 | response_headers = {} 3624 | 3625 | result = FeedParserDict() 3626 | result['feed'] = FeedParserDict() 3627 | result['entries'] = [] 3628 | result['bozo'] = 0 3629 | if not isinstance(handlers, list): 3630 | handlers = [handlers] 3631 | try: 3632 | f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers) 3633 | data = f.read() 3634 | except Exception, e: 3635 | result['bozo'] = 1 3636 | result['bozo_exception'] = e 3637 | data = None 3638 | f = None 3639 | 3640 | if hasattr(f, 'headers'): 3641 | result['headers'] = dict(f.headers) 3642 | # overwrite existing headers using response_headers 3643 | if 'headers' in result: 3644 | result['headers'].update(response_headers) 3645 | elif response_headers: 3646 | result['headers'] = copy.deepcopy(response_headers) 3647 | 3648 | # lowercase all of the HTTP headers for comparisons per RFC 2616 3649 | if 'headers' in result: 3650 | http_headers = dict((k.lower(), v) for k, v in result['headers'].items()) 3651 | else: 3652 | http_headers = {} 3653 | 3654 | # if feed is gzip-compressed, decompress it 3655 | if f and data and http_headers: 3656 | if gzip and 'gzip' in http_headers.get('content-encoding', ''): 3657 | try: 3658 | data = gzip.GzipFile(fileobj=_StringIO(data)).read() 3659 | except (IOError, struct.error), e: 3660 | # IOError can occur if the gzip header is bad. 3661 | # struct.error can occur if the data is damaged. 3662 | result['bozo'] = 1 3663 | result['bozo_exception'] = e 3664 | if isinstance(e, struct.error): 3665 | # A gzip header was found but the data is corrupt. 3666 | # Ideally, we should re-request the feed without the 3667 | # 'Accept-encoding: gzip' header, but we don't. 3668 | data = None 3669 | elif zlib and 'deflate' in http_headers.get('content-encoding', ''): 3670 | try: 3671 | data = zlib.decompress(data) 3672 | except zlib.error, e: 3673 | try: 3674 | # The data may have no headers and no checksum. 3675 | data = zlib.decompress(data, -15) 3676 | except zlib.error, e: 3677 | result['bozo'] = 1 3678 | result['bozo_exception'] = e 3679 | 3680 | # save HTTP headers 3681 | if http_headers: 3682 | if 'etag' in http_headers: 3683 | etag = http_headers.get('etag', u'') 3684 | if not isinstance(etag, unicode): 3685 | etag = etag.decode('utf-8', 'ignore') 3686 | if etag: 3687 | result['etag'] = etag 3688 | if 'last-modified' in http_headers: 3689 | modified = http_headers.get('last-modified', u'') 3690 | if modified: 3691 | result['modified'] = modified 3692 | result['modified_parsed'] = _parse_date(modified) 3693 | if hasattr(f, 'url'): 3694 | if not isinstance(f.url, unicode): 3695 | result['href'] = f.url.decode('utf-8', 'ignore') 3696 | else: 3697 | result['href'] = f.url 3698 | result['status'] = 200 3699 | if hasattr(f, 'status'): 3700 | result['status'] = f.status 3701 | if hasattr(f, 'close'): 3702 | f.close() 3703 | 3704 | if data is None: 3705 | return result 3706 | 3707 | # Stop processing if the server sent HTTP 304 Not Modified. 3708 | if getattr(f, 'code', 0) == 304: 3709 | result['version'] = u'' 3710 | result['debug_message'] = 'The feed has not changed since you last checked, ' + \ 3711 | 'so the server sent no data. This is a feature, not a bug!' 
3712 | return result 3713 | 3714 | data, result['encoding'], error = convert_to_utf8(http_headers, data) 3715 | use_strict_parser = result['encoding'] and True or False 3716 | if error is not None: 3717 | result['bozo'] = 1 3718 | result['bozo_exception'] = error 3719 | 3720 | result['version'], data, entities = replace_doctype(data) 3721 | 3722 | # Ensure that baseuri is an absolute URI using an acceptable URI scheme. 3723 | contentloc = http_headers.get('content-location', u'') 3724 | href = result.get('href', u'') 3725 | baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href 3726 | 3727 | baselang = http_headers.get('content-language', None) 3728 | if not isinstance(baselang, unicode) and baselang is not None: 3729 | baselang = baselang.decode('utf-8', 'ignore') 3730 | 3731 | if not _XML_AVAILABLE: 3732 | use_strict_parser = 0 3733 | if use_strict_parser: 3734 | # initialize the SAX parser 3735 | feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8') 3736 | saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) 3737 | saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) 3738 | try: 3739 | # disable downloading external doctype references, if possible 3740 | saxparser.setFeature(xml.sax.handler.feature_external_ges, 0) 3741 | except xml.sax.SAXNotSupportedException: 3742 | pass 3743 | saxparser.setContentHandler(feedparser) 3744 | saxparser.setErrorHandler(feedparser) 3745 | source = xml.sax.xmlreader.InputSource() 3746 | source.setByteStream(_StringIO(data)) 3747 | try: 3748 | saxparser.parse(source) 3749 | except xml.sax.SAXException, e: 3750 | result['bozo'] = 1 3751 | result['bozo_exception'] = feedparser.exc or e 3752 | use_strict_parser = 0 3753 | if not use_strict_parser and _SGML_AVAILABLE: 3754 | feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities) 3755 | feedparser.feed(data.decode('utf-8', 'replace')) 3756 | result['feed'] = feedparser.feeddata 3757 | result['entries'] = feedparser.entries 3758 | result['version'] = result['version'] or feedparser.version 3759 | result['namespaces'] = feedparser.namespacesInUse 3760 | return result 3761 | 3762 | # The list of EPSG codes for geographic (latitude/longitude) coordinate 3763 | # systems to support decoding of GeoRSS GML profiles. 
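# (e.g. 4326 in the list below is WGS 84, the latitude/longitude system used by GPS.)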
3764 | _geogCS = [ 3765 | 3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008, 3766 | 4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022, 3767 | 4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036, 3768 | 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081, 3769 | 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132, 3770 | 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145, 3771 | 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158, 3772 | 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171, 3773 | 4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185, 3774 | 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200, 3775 | 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213, 3776 | 4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227, 3777 | 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240, 3778 | 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253, 3779 | 4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266, 3780 | 4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279, 3781 | 4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293, 3782 | 4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307, 3783 | 4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322, 3784 | 4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603, 3785 | 4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616, 3786 | 4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629, 3787 | 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642, 3788 | 4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665, 3789 | 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678, 3790 | 4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691, 3791 | 4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704, 3792 | 4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717, 3793 | 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730, 3794 | 4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743, 3795 | 4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756, 3796 | 4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804, 3797 | 4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818, 3798 | 4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979 ] -------------------------------------------------------------------------------- /index.yaml: -------------------------------------------------------------------------------- 1 | indexes: 2 | 3 | # AUTOGENERATED 4 | 5 | # This index.yaml is automatically updated whenever the dev_appserver 6 | # detects that a new type of query is run. If you want to manage the 7 | # index.yaml file manually, remove the above marker line (the line 8 | # saying "# AUTOGENERATED"). If you want to manage some indexes 9 | # manually, move them above the marker line. 
The index.yaml file is
10 | # automatically uploaded to the admin console when you next deploy
11 | # your application using appcfg.py.
12 | 
-------------------------------------------------------------------------------- /main.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2007 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | import os
19 | import logging
20 | import webapp2
21 | import json
22 | 
23 | from google.appengine.api import memcache
24 | from google.appengine.api import urlfetch
25 | from google.appengine.ext.webapp import template
26 | 
27 | import extractlinks
28 | from extractlinks import LinkExtractor
29 | import feedparser
30 | import re
31 | import urlparse
32 | 
33 | class MainHandler(webapp2.RequestHandler):
34 | 
35 |     def render_json(self, obj):
36 |         self.response.headers["Content-Type"] = 'text/javascript'
37 |         if self.request.get("callback"):
38 |             self.response.write(self.request.get("callback") + "(" + json.dumps(obj) + ")")
39 |         else:
40 |             self.response.write(json.dumps(obj))
41 | 
42 |     # Correct feed URLs using the rel="self" link and attach any hubs.
43 |     def extend_feed(self, feed, links):
44 |         feed_self = next((l for l in links if l['rel'] == 'self'), None)
45 |         if feed_self is not None:
46 |             feed['href'] = feed_self['href']
47 |             feed['type'] = feed_self['type']
48 |         feed['hubs'] = [l for l in links if l['rel'] == 'hub']
49 | 
50 |     def get(self):
51 |         # Clean up the URL first and remove any fragment.
52 |         site_url = urlparse.urldefrag(self.request.get("url"))[0]
53 |         force = (self.request.get("force").lower()) in ['true', '1']
54 |         extend = (self.request.get("extend").lower()) in ['true', '1']
55 |         feeds = []  # default value
56 | 
57 |         if site_url:
58 |             feeds = memcache.get(site_url + "." + str(extend))
59 |             if feeds is not None and not force:
60 |                 # Cached result is still fresh; serve it as-is.
61 |                 logging.debug("Memcache hit.")
62 |                 self.render_json(feeds)
63 |             else:
64 |                 logging.debug("Memcache miss.")
65 |                 try:
66 |                     result = urlfetch.fetch(url=site_url, deadline=10)
67 |                     parser = LinkExtractor()
68 |                     parser.set_base_url(site_url)
69 |                     parser.feed(result.content)
70 |                     if parser.links:
71 |                         feeds = parser.links
72 |                     else:
73 |                         feeds = []
74 | 
75 |                     if not feeds:
76 |                         # No feed links in the markup; check whether the URL itself is a feed.
77 |                         data = feedparser.parse(result.content)
78 |                         if data.bozo == 0:
79 |                             feed = {'title': data.feed.get('title', ''), 'rel': 'self', 'type': 'application/atom+xml', 'href': site_url}
80 |                             links = data.feed.get('links', [])
81 |                             if extend:
82 |                                 self.extend_feed(feed, links)
83 |                             feeds = [feed]
84 |                     else:
85 |                         if extend:
86 |                             for f in feeds:
87 |                                 data = feedparser.parse(f['href'])
88 |                                 links = data.feed.get('links', [])
89 |                                 self.extend_feed(f, links)
90 | 
91 |                 except Exception:
92 |                     feeds = []
93 | 
94 |                 if not memcache.set(site_url + "." + str(extend), feeds, 86400):
95 |                     logging.error("Memcache set failed.")
96 |                 else:
97 |                     logging.debug("Memcache set.")
98 |                 self.render_json(feeds)
99 | 
100 |         else:
101 |             self.response.write(template.render(os.path.join(os.path.dirname(__file__), 'templates', "index.html"), {}))
102 | 
103 | app = webapp2.WSGIApplication([('/', MainHandler)], debug=True)
104 | 
-------------------------------------------------------------------------------- /templates/index.html: --------------------------------------------------------------------------------

Feediscovery

Let the music play!

Query:

GET http://feediscovery.appspot.com/?url=http://blog.superfeedr.com

You can also add a callback parameter.

Response:

[{"href":"http://blog.superfeedr.com/atom.xml","title":"Superfeedr' thoughts","rel":"alternate","type":"application/atom+xml"}]

Brought to you by Superfeedr | Learn more | Make it better!

--------------------------------------------------------------------------------