├── README
├── app.yaml
├── extractlinks.py
├── feedparser.py
├── index.yaml
├── main.py
└── templates
    └── index.html


/README:
--------------------------------------------------------------------------------
1 | A tool that extracts and caches feed urls (for 1 week) for any webpage.
2 | Please do not abuse, or just fork and run it under your own Google App Engine account.
3 | 
4 | 


--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
 1 | runtime: python27
 2 | api_version: 1
 3 | threadsafe: yes
 4 | 
 5 | handlers:
 6 | - url: .*
 7 |   script: main.app
 8 | 
 9 | libraries:
10 | - name: webapp2
11 |   version: "2.5.2"
12 | 


--------------------------------------------------------------------------------
/extractlinks.py:
--------------------------------------------------------------------------------
 1 | from sgmllib import SGMLParser
 2 | from urlparse import urlparse
 3 | from urlparse import urljoin
 4 | 
 5 | import logging
 6 | 
 7 | 
 8 | class LinkExtractor(SGMLParser):
 9 |     """A simple LinkExtractor class"""
10 | 
11 |     def set_base_url(self, base_url=None):
12 |         self.base_url = base_url
13 | 
14 |     def make_absolute_and_add(self, dict_feed=None):
15 |         if 'href' in dict_feed:
16 |             p = urlparse(dict_feed['href'])
17 |             if p.scheme != "":
18 |                 self.links.append(dict_feed)
19 |             else:
20 |                 dict_feed['href'] = urljoin(self.base_url, dict_feed['href'])
21 |                 self.links.append(dict_feed)
22 | 
23 |     def reset(self):
24 |         SGMLParser.reset(self)
25 |         self.links = []
26 | 
27 |     def start_link(self, attrs):
28 |         if('type', 'application/rss+xml') in attrs:
29 |           self.make_absolute_and_add(dict(attrs))
30 |         if('type', 'application/atom+xml') in attrs:
31 |           self.make_absolute_and_add(dict(attrs))
32 | 


--------------------------------------------------------------------------------
/feedparser.py:
--------------------------------------------------------------------------------
   1 | """Universal feed parser
   2 | 
   3 | Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
   4 | 
   5 | Visit https://code.google.com/p/feedparser/ for the latest version
   6 | Visit http://packages.python.org/feedparser/ for the latest documentation
   7 | 
   8 | Required: Python 2.4 or later
   9 | Recommended: iconv_codec <http://cjkpython.i18n.org/>
  10 | """
  11 | 
  12 | __version__ = "5.1.3"
  13 | __license__ = """
  14 | Copyright (c) 2010-2013 Kurt McKee <contactme@kurtmckee.org>
  15 | Copyright (c) 2002-2008 Mark Pilgrim
  16 | All rights reserved.
  17 | 
  18 | Redistribution and use in source and binary forms, with or without modification,
  19 | are permitted provided that the following conditions are met:
  20 | 
  21 | * Redistributions of source code must retain the above copyright notice,
  22 |   this list of conditions and the following disclaimer.
  23 | * Redistributions in binary form must reproduce the above copyright notice,
  24 |   this list of conditions and the following disclaimer in the documentation
  25 |   and/or other materials provided with the distribution.
  26 | 
  27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
  28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37 | POSSIBILITY OF SUCH DAMAGE."""
  38 | __author__ = "Mark Pilgrim <http://diveintomark.org/>"
  39 | __contributors__ = ["Jason Diamond <http://injektilo.org/>",
  40 |                     "John Beimler <http://john.beimler.org/>",
  41 |                     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
  42 |                     "Aaron Swartz <http://aaronsw.com/>",
  43 |                     "Kevin Marks <http://epeus.blogspot.com/>",
  44 |                     "Sam Ruby <http://intertwingly.net/>",
  45 |                     "Ade Oshineye <http://blog.oshineye.com/>",
  46 |                     "Martin Pool <http://sourcefrog.net/>",
  47 |                     "Kurt McKee <http://kurtmckee.org/>",
  48 |                     "Bernd Schlapsi <https://github.com/brot>",]
  49 | 
  50 | # HTTP "User-Agent" header to send to servers when downloading feeds.
  51 | # If you are embedding feedparser in a larger application, you should
  52 | # change this to your application name and URL.
  53 | USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__
  54 | 
  55 | # HTTP "Accept" header to send to servers when downloading feeds.  If you don't
  56 | # want to send an Accept header, set this to None.
  57 | ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
  58 | 
  59 | # List of preferred XML parsers, by SAX driver name.  These will be tried first,
  60 | # but if they're not installed, Python will keep searching through its own list
  61 | # of pre-installed parsers until it finds one that supports everything we need.
  62 | PREFERRED_XML_PARSERS = ["drv_libxml2"]
  63 | 
  64 | # If you want feedparser to automatically resolve all relative URIs, set this
  65 | # to 1.
  66 | RESOLVE_RELATIVE_URIS = 1
  67 | 
  68 | # If you want feedparser to automatically sanitize all potentially unsafe
  69 | # HTML content, set this to 1.
  70 | SANITIZE_HTML = 1
  71 | 
  72 | # ---------- Python 3 modules (make it work if possible) ----------
  73 | try:
  74 |     import rfc822
  75 | except ImportError:
  76 |     from email import _parseaddr as rfc822
  77 | 
  78 | try:
  79 |     # Python 3.1 introduces bytes.maketrans and simultaneously
  80 |     # deprecates string.maketrans; use bytes.maketrans if possible
  81 |     _maketrans = bytes.maketrans
  82 | except (NameError, AttributeError):
  83 |     import string
  84 |     _maketrans = string.maketrans
  85 | 
  86 | # base64 support for Atom feeds that contain embedded binary data
  87 | try:
  88 |     import base64, binascii
  89 | except ImportError:
  90 |     base64 = binascii = None
  91 | else:
  92 |     # Python 3.1 deprecates decodestring in favor of decodebytes
  93 |     _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
  94 | 
  95 | # _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3
  96 | # _l2bytes: convert a list of ints to bytes if the interpreter is Python 3
  97 | try:
  98 |     if bytes is str:
  99 |         # In Python 2.5 and below, bytes doesn't exist (NameError)
 100 |         # In Python 2.6 and above, bytes and str are the same type
 101 |         raise NameError
 102 | except NameError:
 103 |     # Python 2
 104 |     def _s2bytes(s):
 105 |         return s
 106 |     def _l2bytes(l):
 107 |         return ''.join(map(chr, l))
 108 | else:
 109 |     # Python 3
 110 |     def _s2bytes(s):
 111 |         return bytes(s, 'utf8')
 112 |     def _l2bytes(l):
 113 |         return bytes(l)
 114 | 
 115 | # If you want feedparser to allow all URL schemes, set this to ()
 116 | # List culled from Python's urlparse documentation at:
 117 | #   http://docs.python.org/library/urlparse.html
 118 | # as well as from "URI scheme" at Wikipedia:
 119 | #   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
 120 | # Many more will likely need to be added!
 121 | ACCEPTABLE_URI_SCHEMES = (
 122 |     'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
 123 |     'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
 124 |     'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
 125 |     'wais',
 126 |     # Additional common-but-unofficial schemes
 127 |     'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
 128 |     'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
 129 | )
 130 | #ACCEPTABLE_URI_SCHEMES = ()
 131 | 
 132 | # ---------- required modules (should come with any Python distribution) ----------
 133 | import cgi
 134 | import codecs
 135 | import copy
 136 | import datetime
 137 | import itertools
 138 | import re
 139 | import struct
 140 | import time
 141 | import types
 142 | import urllib
 143 | import urllib2
 144 | import urlparse
 145 | import warnings
 146 | 
 147 | from htmlentitydefs import name2codepoint, codepoint2name, entitydefs
 148 | 
 149 | try:
 150 |     from io import BytesIO as _StringIO
 151 | except ImportError:
 152 |     try:
 153 |         from cStringIO import StringIO as _StringIO
 154 |     except ImportError:
 155 |         from StringIO import StringIO as _StringIO
 156 | 
 157 | # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
 158 | 
 159 | # gzip is included with most Python distributions, but may not be available if you compiled your own
 160 | try:
 161 |     import gzip
 162 | except ImportError:
 163 |     gzip = None
 164 | try:
 165 |     import zlib
 166 | except ImportError:
 167 |     zlib = None
 168 | 
 169 | # If a real XML parser is available, feedparser will attempt to use it.  feedparser has
 170 | # been tested with the built-in SAX parser and libxml2.  On platforms where the
 171 | # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
 172 | # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
 173 | try:
 174 |     import xml.sax
 175 |     from xml.sax.saxutils import escape as _xmlescape
 176 | except ImportError:
 177 |     _XML_AVAILABLE = 0
 178 |     def _xmlescape(data,entities={}):
 179 |         data = data.replace('&', '&amp;')
 180 |         data = data.replace('>', '&gt;')
 181 |         data = data.replace('<', '&lt;')
 182 |         for char, entity in entities:
 183 |             data = data.replace(char, entity)
 184 |         return data
 185 | else:
 186 |     try:
 187 |         xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
 188 |     except xml.sax.SAXReaderNotAvailable:
 189 |         _XML_AVAILABLE = 0
 190 |     else:
 191 |         _XML_AVAILABLE = 1
 192 | 
 193 | # sgmllib is not available by default in Python 3; if the end user doesn't have
 194 | # it available then we'll lose illformed XML parsing and content santizing
 195 | try:
 196 |     import sgmllib
 197 | except ImportError:
 198 |     # This is probably Python 3, which doesn't include sgmllib anymore
 199 |     _SGML_AVAILABLE = 0
 200 | 
 201 |     # Mock sgmllib enough to allow subclassing later on
 202 |     class sgmllib(object):
 203 |         class SGMLParser(object):
 204 |             def goahead(self, i):
 205 |                 pass
 206 |             def parse_starttag(self, i):
 207 |                 pass
 208 | else:
 209 |     _SGML_AVAILABLE = 1
 210 | 
 211 |     # sgmllib defines a number of module-level regular expressions that are
 212 |     # insufficient for the XML parsing feedparser needs. Rather than modify
 213 |     # the variables directly in sgmllib, they're defined here using the same
 214 |     # names, and the compiled code objects of several sgmllib.SGMLParser
 215 |     # methods are copied into _BaseHTMLProcessor so that they execute in
 216 |     # feedparser's scope instead of sgmllib's scope.
 217 |     charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
 218 |     tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
 219 |     attrfind = re.compile(
 220 |         r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
 221 |         r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
 222 |     )
 223 | 
 224 |     # Unfortunately, these must be copied over to prevent NameError exceptions
 225 |     entityref = sgmllib.entityref
 226 |     incomplete = sgmllib.incomplete
 227 |     interesting = sgmllib.interesting
 228 |     shorttag = sgmllib.shorttag
 229 |     shorttagopen = sgmllib.shorttagopen
 230 |     starttagopen = sgmllib.starttagopen
 231 | 
 232 |     class _EndBracketRegEx:
 233 |         def __init__(self):
 234 |             # Overriding the built-in sgmllib.endbracket regex allows the
 235 |             # parser to find angle brackets embedded in element attributes.
 236 |             self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
 237 |         def search(self, target, index=0):
 238 |             match = self.endbracket.match(target, index)
 239 |             if match is not None:
 240 |                 # Returning a new object in the calling thread's context
 241 |                 # resolves a thread-safety.
 242 |                 return EndBracketMatch(match)
 243 |             return None
 244 |     class EndBracketMatch:
 245 |         def __init__(self, match):
 246 |             self.match = match
 247 |         def start(self, n):
 248 |             return self.match.end(n)
 249 |     endbracket = _EndBracketRegEx()
 250 | 
 251 | 
 252 | # iconv_codec provides support for more character encodings.
 253 | # It's available from http://cjkpython.i18n.org/
 254 | try:
 255 |     import iconv_codec
 256 | except ImportError:
 257 |     pass
 258 | 
 259 | # chardet library auto-detects character encodings
 260 | # Download from http://chardet.feedparser.org/
 261 | try:
 262 |     import chardet
 263 | except ImportError:
 264 |     chardet = None
 265 | 
 266 | # ---------- don't touch these ----------
 267 | class ThingsNobodyCaresAboutButMe(Exception): pass
 268 | class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
 269 | class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
 270 | class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
 271 | class UndeclaredNamespace(Exception): pass
 272 | 
 273 | SUPPORTED_VERSIONS = {'': u'unknown',
 274 |                       'rss090': u'RSS 0.90',
 275 |                       'rss091n': u'RSS 0.91 (Netscape)',
 276 |                       'rss091u': u'RSS 0.91 (Userland)',
 277 |                       'rss092': u'RSS 0.92',
 278 |                       'rss093': u'RSS 0.93',
 279 |                       'rss094': u'RSS 0.94',
 280 |                       'rss20': u'RSS 2.0',
 281 |                       'rss10': u'RSS 1.0',
 282 |                       'rss': u'RSS (unknown version)',
 283 |                       'atom01': u'Atom 0.1',
 284 |                       'atom02': u'Atom 0.2',
 285 |                       'atom03': u'Atom 0.3',
 286 |                       'atom10': u'Atom 1.0',
 287 |                       'atom': u'Atom (unknown version)',
 288 |                       'cdf': u'CDF',
 289 |                       }
 290 | 
 291 | class FeedParserDict(dict):
 292 |     keymap = {'channel': 'feed',
 293 |               'items': 'entries',
 294 |               'guid': 'id',
 295 |               'date': 'updated',
 296 |               'date_parsed': 'updated_parsed',
 297 |               'description': ['summary', 'subtitle'],
 298 |               'description_detail': ['summary_detail', 'subtitle_detail'],
 299 |               'url': ['href'],
 300 |               'modified': 'updated',
 301 |               'modified_parsed': 'updated_parsed',
 302 |               'issued': 'published',
 303 |               'issued_parsed': 'published_parsed',
 304 |               'copyright': 'rights',
 305 |               'copyright_detail': 'rights_detail',
 306 |               'tagline': 'subtitle',
 307 |               'tagline_detail': 'subtitle_detail'}
 308 |     def __getitem__(self, key):
 309 |         if key == 'category':
 310 |             try:
 311 |                 return dict.__getitem__(self, 'tags')[0]['term']
 312 |             except IndexError:
 313 |                 raise KeyError, "object doesn't have key 'category'"
 314 |         elif key == 'enclosures':
 315 |             norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
 316 |             return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
 317 |         elif key == 'license':
 318 |             for link in dict.__getitem__(self, 'links'):
 319 |                 if link['rel']==u'license' and 'href' in link:
 320 |                     return link['href']
 321 |         elif key == 'updated':
 322 |             # Temporarily help developers out by keeping the old
 323 |             # broken behavior that was reported in issue 310.
 324 |             # This fix was proposed in issue 328.
 325 |             if not dict.__contains__(self, 'updated') and \
 326 |                 dict.__contains__(self, 'published'):
 327 |                 warnings.warn("To avoid breaking existing software while "
 328 |                     "fixing issue 310, a temporary mapping has been created "
 329 |                     "from `updated` to `published` if `updated` doesn't "
 330 |                     "exist. This fallback will be removed in a future version "
 331 |                     "of feedparser.", DeprecationWarning)
 332 |                 return dict.__getitem__(self, 'published')
 333 |             return dict.__getitem__(self, 'updated')
 334 |         elif key == 'updated_parsed':
 335 |             if not dict.__contains__(self, 'updated_parsed') and \
 336 |                 dict.__contains__(self, 'published_parsed'):
 337 |                 warnings.warn("To avoid breaking existing software while "
 338 |                     "fixing issue 310, a temporary mapping has been created "
 339 |                     "from `updated_parsed` to `published_parsed` if "
 340 |                     "`updated_parsed` doesn't exist. This fallback will be "
 341 |                     "removed in a future version of feedparser.",
 342 |                     DeprecationWarning)
 343 |                 return dict.__getitem__(self, 'published_parsed')
 344 |             return dict.__getitem__(self, 'updated_parsed')
 345 |         else:
 346 |             realkey = self.keymap.get(key, key)
 347 |             if isinstance(realkey, list):
 348 |                 for k in realkey:
 349 |                     if dict.__contains__(self, k):
 350 |                         return dict.__getitem__(self, k)
 351 |             elif dict.__contains__(self, realkey):
 352 |                 return dict.__getitem__(self, realkey)
 353 |         return dict.__getitem__(self, key)
 354 | 
 355 |     def __contains__(self, key):
 356 |         if key in ('updated', 'updated_parsed'):
 357 |             # Temporarily help developers out by keeping the old
 358 |             # broken behavior that was reported in issue 310.
 359 |             # This fix was proposed in issue 328.
 360 |             return dict.__contains__(self, key)
 361 |         try:
 362 |             self.__getitem__(key)
 363 |         except KeyError:
 364 |             return False
 365 |         else:
 366 |             return True
 367 | 
 368 |     has_key = __contains__
 369 | 
 370 |     def get(self, key, default=None):
 371 |         try:
 372 |             return self.__getitem__(key)
 373 |         except KeyError:
 374 |             return default
 375 | 
 376 |     def __setitem__(self, key, value):
 377 |         key = self.keymap.get(key, key)
 378 |         if isinstance(key, list):
 379 |             key = key[0]
 380 |         return dict.__setitem__(self, key, value)
 381 | 
 382 |     def setdefault(self, key, value):
 383 |         if key not in self:
 384 |             self[key] = value
 385 |             return value
 386 |         return self[key]
 387 | 
 388 |     def __getattr__(self, key):
 389 |         # __getattribute__() is called first; this will be called
 390 |         # only if an attribute was not already found
 391 |         try:
 392 |             return self.__getitem__(key)
 393 |         except KeyError:
 394 |             raise AttributeError, "object has no attribute '%s'" % key
 395 | 
 396 |     def __hash__(self):
 397 |         return id(self)
 398 | 
 399 | _cp1252 = {
 400 |     128: unichr(8364), # euro sign
 401 |     130: unichr(8218), # single low-9 quotation mark
 402 |     131: unichr( 402), # latin small letter f with hook
 403 |     132: unichr(8222), # double low-9 quotation mark
 404 |     133: unichr(8230), # horizontal ellipsis
 405 |     134: unichr(8224), # dagger
 406 |     135: unichr(8225), # double dagger
 407 |     136: unichr( 710), # modifier letter circumflex accent
 408 |     137: unichr(8240), # per mille sign
 409 |     138: unichr( 352), # latin capital letter s with caron
 410 |     139: unichr(8249), # single left-pointing angle quotation mark
 411 |     140: unichr( 338), # latin capital ligature oe
 412 |     142: unichr( 381), # latin capital letter z with caron
 413 |     145: unichr(8216), # left single quotation mark
 414 |     146: unichr(8217), # right single quotation mark
 415 |     147: unichr(8220), # left double quotation mark
 416 |     148: unichr(8221), # right double quotation mark
 417 |     149: unichr(8226), # bullet
 418 |     150: unichr(8211), # en dash
 419 |     151: unichr(8212), # em dash
 420 |     152: unichr( 732), # small tilde
 421 |     153: unichr(8482), # trade mark sign
 422 |     154: unichr( 353), # latin small letter s with caron
 423 |     155: unichr(8250), # single right-pointing angle quotation mark
 424 |     156: unichr( 339), # latin small ligature oe
 425 |     158: unichr( 382), # latin small letter z with caron
 426 |     159: unichr( 376), # latin capital letter y with diaeresis
 427 | }
 428 | 
 429 | _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
 430 | def _urljoin(base, uri):
 431 |     uri = _urifixer.sub(r'\1\3', uri)
 432 |     #try:
 433 |     if not isinstance(uri, unicode):
 434 |         uri = uri.decode('utf-8', 'ignore')
 435 |     uri = urlparse.urljoin(base, uri)
 436 |     if not isinstance(uri, unicode):
 437 |         return uri.decode('utf-8', 'ignore')
 438 |     return uri
 439 |     #except:
 440 |     #    uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
 441 |     #    return urlparse.urljoin(base, uri)
 442 | 
 443 | class _FeedParserMixin:
 444 |     namespaces = {
 445 |         '': '',
 446 |         'http://backend.userland.com/rss': '',
 447 |         'http://blogs.law.harvard.edu/tech/rss': '',
 448 |         'http://purl.org/rss/1.0/': '',
 449 |         'http://my.netscape.com/rdf/simple/0.9/': '',
 450 |         'http://example.com/newformat#': '',
 451 |         'http://example.com/necho': '',
 452 |         'http://purl.org/echo/': '',
 453 |         'uri/of/echo/namespace#': '',
 454 |         'http://purl.org/pie/': '',
 455 |         'http://purl.org/atom/ns#': '',
 456 |         'http://www.w3.org/2005/Atom': '',
 457 |         'http://purl.org/rss/1.0/modules/rss091#': '',
 458 | 
 459 |         'http://webns.net/mvcb/':                                'admin',
 460 |         'http://purl.org/rss/1.0/modules/aggregation/':          'ag',
 461 |         'http://purl.org/rss/1.0/modules/annotate/':             'annotate',
 462 |         'http://media.tangent.org/rss/1.0/':                     'audio',
 463 |         'http://backend.userland.com/blogChannelModule':         'blogChannel',
 464 |         'http://web.resource.org/cc/':                           'cc',
 465 |         'http://backend.userland.com/creativeCommonsRssModule':  'creativeCommons',
 466 |         'http://purl.org/rss/1.0/modules/company':               'co',
 467 |         'http://purl.org/rss/1.0/modules/content/':              'content',
 468 |         'http://my.theinfo.org/changed/1.0/rss/':                'cp',
 469 |         'http://purl.org/dc/elements/1.1/':                      'dc',
 470 |         'http://purl.org/dc/terms/':                             'dcterms',
 471 |         'http://purl.org/rss/1.0/modules/email/':                'email',
 472 |         'http://purl.org/rss/1.0/modules/event/':                'ev',
 473 |         'http://rssnamespace.org/feedburner/ext/1.0':            'feedburner',
 474 |         'http://freshmeat.net/rss/fm/':                          'fm',
 475 |         'http://xmlns.com/foaf/0.1/':                            'foaf',
 476 |         'http://www.w3.org/2003/01/geo/wgs84_pos#':              'geo',
 477 |         'http://www.georss.org/georss':                          'georss',
 478 |         'http://www.opengis.net/gml':                            'gml',
 479 |         'http://postneo.com/icbm/':                              'icbm',
 480 |         'http://purl.org/rss/1.0/modules/image/':                'image',
 481 |         'http://www.itunes.com/DTDs/PodCast-1.0.dtd':            'itunes',
 482 |         'http://example.com/DTDs/PodCast-1.0.dtd':               'itunes',
 483 |         'http://purl.org/rss/1.0/modules/link/':                 'l',
 484 |         'http://search.yahoo.com/mrss':                          'media',
 485 |         # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
 486 |         'http://search.yahoo.com/mrss/':                         'media',
 487 |         'http://madskills.com/public/xml/rss/module/pingback/':  'pingback',
 488 |         'http://prismstandard.org/namespaces/1.2/basic/':        'prism',
 489 |         'http://www.w3.org/1999/02/22-rdf-syntax-ns#':           'rdf',
 490 |         'http://www.w3.org/2000/01/rdf-schema#':                 'rdfs',
 491 |         'http://purl.org/rss/1.0/modules/reference/':            'ref',
 492 |         'http://purl.org/rss/1.0/modules/richequiv/':            'reqv',
 493 |         'http://purl.org/rss/1.0/modules/search/':               'search',
 494 |         'http://purl.org/rss/1.0/modules/slash/':                'slash',
 495 |         'http://schemas.xmlsoap.org/soap/envelope/':             'soap',
 496 |         'http://purl.org/rss/1.0/modules/servicestatus/':        'ss',
 497 |         'http://hacks.benhammersley.com/rss/streaming/':         'str',
 498 |         'http://purl.org/rss/1.0/modules/subscription/':         'sub',
 499 |         'http://purl.org/rss/1.0/modules/syndication/':          'sy',
 500 |         'http://schemas.pocketsoap.com/rss/myDescModule/':       'szf',
 501 |         'http://purl.org/rss/1.0/modules/taxonomy/':             'taxo',
 502 |         'http://purl.org/rss/1.0/modules/threading/':            'thr',
 503 |         'http://purl.org/rss/1.0/modules/textinput/':            'ti',
 504 |         'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
 505 |         'http://wellformedweb.org/commentAPI/':                  'wfw',
 506 |         'http://purl.org/rss/1.0/modules/wiki/':                 'wiki',
 507 |         'http://www.w3.org/1999/xhtml':                          'xhtml',
 508 |         'http://www.w3.org/1999/xlink':                          'xlink',
 509 |         'http://www.w3.org/XML/1998/namespace':                  'xml',
 510 |         'http://podlove.org/simple-chapters':                    'psc',
 511 |     }
 512 |     _matchnamespaces = {}
 513 | 
 514 |     can_be_relative_uri = set(['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'])
 515 |     can_contain_relative_uris = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
 516 |     can_contain_dangerous_markup = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
 517 |     html_types = [u'text/html', u'application/xhtml+xml']
 518 | 
 519 |     def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
 520 |         if not self._matchnamespaces:
 521 |             for k, v in self.namespaces.items():
 522 |                 self._matchnamespaces[k.lower()] = v
 523 |         self.feeddata = FeedParserDict() # feed-level data
 524 |         self.encoding = encoding # character encoding
 525 |         self.entries = [] # list of entry-level data
 526 |         self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
 527 |         self.namespacesInUse = {} # dictionary of namespaces defined by the feed
 528 | 
 529 |         # the following are used internally to track state;
 530 |         # this is really out of control and should be refactored
 531 |         self.infeed = 0
 532 |         self.inentry = 0
 533 |         self.incontent = 0
 534 |         self.intextinput = 0
 535 |         self.inimage = 0
 536 |         self.inauthor = 0
 537 |         self.incontributor = 0
 538 |         self.inpublisher = 0
 539 |         self.insource = 0
 540 |         
 541 |         # georss
 542 |         self.ingeometry = 0
 543 |         
 544 |         self.sourcedata = FeedParserDict()
 545 |         self.contentparams = FeedParserDict()
 546 |         self._summaryKey = None
 547 |         self.namespacemap = {}
 548 |         self.elementstack = []
 549 |         self.basestack = []
 550 |         self.langstack = []
 551 |         self.baseuri = baseuri or u''
 552 |         self.lang = baselang or None
 553 |         self.svgOK = 0
 554 |         self.title_depth = -1
 555 |         self.depth = 0
 556 |         self.psc_chapters_counter = 0
 557 |         if baselang:
 558 |             self.feeddata['language'] = baselang.replace('_','-')
 559 | 
 560 |         # A map of the following form:
 561 |         #     {
 562 |         #         object_that_value_is_set_on: {
 563 |         #             property_name: depth_of_node_property_was_extracted_from,
 564 |         #             other_property: depth_of_node_property_was_extracted_from,
 565 |         #         },
 566 |         #     }
 567 |         self.property_depth_map = {}
 568 | 
 569 |     def _normalize_attributes(self, kv):
 570 |         k = kv[0].lower()
 571 |         v = k in ('rel', 'type') and kv[1].lower() or kv[1]
 572 |         # the sgml parser doesn't handle entities in attributes, nor
 573 |         # does it pass the attribute values through as unicode, while
 574 |         # strict xml parsers do -- account for this difference
 575 |         if isinstance(self, _LooseFeedParser):
 576 |             v = v.replace('&amp;', '&')
 577 |             if not isinstance(v, unicode):
 578 |                 v = v.decode('utf-8')
 579 |         return (k, v)
 580 | 
 581 |     def unknown_starttag(self, tag, attrs):
 582 |         # increment depth counter
 583 |         self.depth += 1
 584 | 
 585 |         # normalize attrs
 586 |         attrs = map(self._normalize_attributes, attrs)
 587 | 
 588 |         # track xml:base and xml:lang
 589 |         attrsD = dict(attrs)
 590 |         baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
 591 |         if not isinstance(baseuri, unicode):
 592 |             baseuri = baseuri.decode(self.encoding, 'ignore')
 593 |         # ensure that self.baseuri is always an absolute URI that
 594 |         # uses a whitelisted URI scheme (e.g. not `javscript:`)
 595 |         if self.baseuri:
 596 |             self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
 597 |         else:
 598 |             self.baseuri = _urljoin(self.baseuri, baseuri)
 599 |         lang = attrsD.get('xml:lang', attrsD.get('lang'))
 600 |         if lang == '':
 601 |             # xml:lang could be explicitly set to '', we need to capture that
 602 |             lang = None
 603 |         elif lang is None:
 604 |             # if no xml:lang is specified, use parent lang
 605 |             lang = self.lang
 606 |         if lang:
 607 |             if tag in ('feed', 'rss', 'rdf:RDF'):
 608 |                 self.feeddata['language'] = lang.replace('_','-')
 609 |         self.lang = lang
 610 |         self.basestack.append(self.baseuri)
 611 |         self.langstack.append(lang)
 612 | 
 613 |         # track namespaces
 614 |         for prefix, uri in attrs:
 615 |             if prefix.startswith('xmlns:'):
 616 |                 self.trackNamespace(prefix[6:], uri)
 617 |             elif prefix == 'xmlns':
 618 |                 self.trackNamespace(None, uri)
 619 | 
 620 |         # track inline content
 621 |         if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
 622 |             if tag in ('xhtml:div', 'div'):
 623 |                 return # typepad does this 10/2007
 624 |             # element declared itself as escaped markup, but it isn't really
 625 |             self.contentparams['type'] = u'application/xhtml+xml'
 626 |         if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
 627 |             if tag.find(':') <> -1:
 628 |                 prefix, tag = tag.split(':', 1)
 629 |                 namespace = self.namespacesInUse.get(prefix, '')
 630 |                 if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
 631 |                     attrs.append(('xmlns',namespace))
 632 |                 if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
 633 |                     attrs.append(('xmlns',namespace))
 634 |             if tag == 'svg':
 635 |                 self.svgOK += 1
 636 |             return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
 637 | 
 638 |         # match namespaces
 639 |         if tag.find(':') <> -1:
 640 |             prefix, suffix = tag.split(':', 1)
 641 |         else:
 642 |             prefix, suffix = '', tag
 643 |         prefix = self.namespacemap.get(prefix, prefix)
 644 |         if prefix:
 645 |             prefix = prefix + '_'
 646 | 
 647 |         # special hack for better tracking of empty textinput/image elements in illformed feeds
 648 |         if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
 649 |             self.intextinput = 0
 650 |         if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
 651 |             self.inimage = 0
 652 | 
 653 |         # call special handler (if defined) or default handler
 654 |         methodname = '_start_' + prefix + suffix
 655 |         try:
 656 |             method = getattr(self, methodname)
 657 |             return method(attrsD)
 658 |         except AttributeError:
 659 |             # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
 660 |             unknown_tag = prefix + suffix
 661 |             if len(attrsD) == 0:
 662 |                 # No attributes so merge it into the encosing dictionary
 663 |                 return self.push(unknown_tag, 1)
 664 |             else:
 665 |                 # Has attributes so create it in its own dictionary
 666 |                 context = self._getContext()
 667 |                 context[unknown_tag] = attrsD
 668 | 
 669 |     def unknown_endtag(self, tag):
 670 |         # match namespaces
 671 |         if tag.find(':') <> -1:
 672 |             prefix, suffix = tag.split(':', 1)
 673 |         else:
 674 |             prefix, suffix = '', tag
 675 |         prefix = self.namespacemap.get(prefix, prefix)
 676 |         if prefix:
 677 |             prefix = prefix + '_'
 678 |         if suffix == 'svg' and self.svgOK:
 679 |             self.svgOK -= 1
 680 | 
 681 |         # call special handler (if defined) or default handler
 682 |         methodname = '_end_' + prefix + suffix
 683 |         try:
 684 |             if self.svgOK:
 685 |                 raise AttributeError()
 686 |             method = getattr(self, methodname)
 687 |             method()
 688 |         except AttributeError:
 689 |             self.pop(prefix + suffix)
 690 | 
 691 |         # track inline content
 692 |         if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
 693 |             # element declared itself as escaped markup, but it isn't really
 694 |             if tag in ('xhtml:div', 'div'):
 695 |                 return # typepad does this 10/2007
 696 |             self.contentparams['type'] = u'application/xhtml+xml'
 697 |         if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
 698 |             tag = tag.split(':')[-1]
 699 |             self.handle_data('</%s>' % tag, escape=0)
 700 | 
 701 |         # track xml:base and xml:lang going out of scope
 702 |         if self.basestack:
 703 |             self.basestack.pop()
 704 |             if self.basestack and self.basestack[-1]:
 705 |                 self.baseuri = self.basestack[-1]
 706 |         if self.langstack:
 707 |             self.langstack.pop()
 708 |             if self.langstack: # and (self.langstack[-1] is not None):
 709 |                 self.lang = self.langstack[-1]
 710 | 
 711 |         self.depth -= 1
 712 | 
 713 |     def handle_charref(self, ref):
 714 |         # called for each character reference, e.g. for '&#160;', ref will be '160'
 715 |         if not self.elementstack:
 716 |             return
 717 |         ref = ref.lower()
 718 |         if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
 719 |             text = '&#%s;' % ref
 720 |         else:
 721 |             if ref[0] == 'x':
 722 |                 c = int(ref[1:], 16)
 723 |             else:
 724 |                 c = int(ref)
 725 |             text = unichr(c).encode('utf-8')
 726 |         self.elementstack[-1][2].append(text)
 727 | 
 728 |     def handle_entityref(self, ref):
 729 |         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
 730 |         if not self.elementstack:
 731 |             return
 732 |         if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
 733 |             text = '&%s;' % ref
 734 |         elif ref in self.entities:
 735 |             text = self.entities[ref]
 736 |             if text.startswith('&#') and text.endswith(';'):
 737 |                 return self.handle_entityref(text)
 738 |         else:
 739 |             try:
 740 |                 name2codepoint[ref]
 741 |             except KeyError:
 742 |                 text = '&%s;' % ref
 743 |             else:
 744 |                 text = unichr(name2codepoint[ref]).encode('utf-8')
 745 |         self.elementstack[-1][2].append(text)
 746 | 
 747 |     def handle_data(self, text, escape=1):
 748 |         # called for each block of plain text, i.e. outside of any tag and
 749 |         # not containing any character or entity references
 750 |         if not self.elementstack:
 751 |             return
 752 |         if escape and self.contentparams.get('type') == u'application/xhtml+xml':
 753 |             text = _xmlescape(text)
 754 |         self.elementstack[-1][2].append(text)
 755 | 
 756 |     def handle_comment(self, text):
 757 |         # called for each comment, e.g. <!-- insert message here -->
 758 |         pass
 759 | 
 760 |     def handle_pi(self, text):
 761 |         # called for each processing instruction, e.g. <?instruction>
 762 |         pass
 763 | 
 764 |     def handle_decl(self, text):
 765 |         pass
 766 | 
 767 |     def parse_declaration(self, i):
 768 |         # override internal declaration handler to handle CDATA blocks
 769 |         if self.rawdata[i:i+9] == '<![CDATA[':
 770 |             k = self.rawdata.find(']]>', i)
 771 |             if k == -1:
 772 |                 # CDATA block began but didn't finish
 773 |                 k = len(self.rawdata)
 774 |                 return k
 775 |             self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
 776 |             return k+3
 777 |         else:
 778 |             k = self.rawdata.find('>', i)
 779 |             if k >= 0:
 780 |                 return k+1
 781 |             else:
 782 |                 # We have an incomplete CDATA block.
 783 |                 return k
 784 | 
 785 |     def mapContentType(self, contentType):
 786 |         contentType = contentType.lower()
 787 |         if contentType == 'text' or contentType == 'plain':
 788 |             contentType = u'text/plain'
 789 |         elif contentType == 'html':
 790 |             contentType = u'text/html'
 791 |         elif contentType == 'xhtml':
 792 |             contentType = u'application/xhtml+xml'
 793 |         return contentType
 794 | 
 795 |     def trackNamespace(self, prefix, uri):
 796 |         loweruri = uri.lower()
 797 |         if not self.version:
 798 |             if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
 799 |                 self.version = u'rss090'
 800 |             elif loweruri == 'http://purl.org/rss/1.0/':
 801 |                 self.version = u'rss10'
 802 |             elif loweruri == 'http://www.w3.org/2005/atom':
 803 |                 self.version = u'atom10'
 804 |         if loweruri.find(u'backend.userland.com/rss') <> -1:
 805 |             # match any backend.userland.com namespace
 806 |             uri = u'http://backend.userland.com/rss'
 807 |             loweruri = uri
 808 |         if loweruri in self._matchnamespaces:
 809 |             self.namespacemap[prefix] = self._matchnamespaces[loweruri]
 810 |             self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
 811 |         else:
 812 |             self.namespacesInUse[prefix or ''] = uri
 813 | 
 814 |     def resolveURI(self, uri):
 815 |         return _urljoin(self.baseuri or u'', uri)
 816 | 
 817 |     def decodeEntities(self, element, data):
 818 |         return data
 819 | 
 820 |     def strattrs(self, attrs):
 821 |         return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
 822 | 
 823 |     def push(self, element, expectingText):
 824 |         self.elementstack.append([element, expectingText, []])
 825 | 
 826 |     def pop(self, element, stripWhitespace=1):
 827 |         if not self.elementstack:
 828 |             return
 829 |         if self.elementstack[-1][0] != element:
 830 |             return
 831 | 
 832 |         element, expectingText, pieces = self.elementstack.pop()
 833 | 
 834 |         if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
 835 |             # remove enclosing child element, but only if it is a <div> and
 836 |             # only if all the remaining content is nested underneath it.
 837 |             # This means that the divs would be retained in the following:
 838 |             #    <div>foo</div><div>bar</div>
 839 |             while pieces and len(pieces)>1 and not pieces[-1].strip():
 840 |                 del pieces[-1]
 841 |             while pieces and len(pieces)>1 and not pieces[0].strip():
 842 |                 del pieces[0]
 843 |             if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
 844 |                 depth = 0
 845 |                 for piece in pieces[:-1]:
 846 |                     if piece.startswith('</'):
 847 |                         depth -= 1
 848 |                         if depth == 0:
 849 |                             break
 850 |                     elif piece.startswith('<') and not piece.endswith('/>'):
 851 |                         depth += 1
 852 |                 else:
 853 |                     pieces = pieces[1:-1]
 854 | 
 855 |         # Ensure each piece is a str for Python 3
 856 |         for (i, v) in enumerate(pieces):
 857 |             if not isinstance(v, unicode):
 858 |                 pieces[i] = v.decode('utf-8')
 859 | 
 860 |         output = u''.join(pieces)
 861 |         if stripWhitespace:
 862 |             output = output.strip()
 863 |         if not expectingText:
 864 |             return output
 865 | 
 866 |         # decode base64 content
 867 |         if base64 and self.contentparams.get('base64', 0):
 868 |             try:
 869 |                 output = _base64decode(output)
 870 |             except binascii.Error:
 871 |                 pass
 872 |             except binascii.Incomplete:
 873 |                 pass
 874 |             except TypeError:
 875 |                 # In Python 3, base64 takes and outputs bytes, not str
 876 |                 # This may not be the most correct way to accomplish this
 877 |                 output = _base64decode(output.encode('utf-8')).decode('utf-8')
 878 | 
 879 |         # resolve relative URIs
 880 |         if (element in self.can_be_relative_uri) and output:
 881 |             output = self.resolveURI(output)
 882 | 
 883 |         # decode entities within embedded markup
 884 |         if not self.contentparams.get('base64', 0):
 885 |             output = self.decodeEntities(element, output)
 886 | 
 887 |         # some feed formats require consumers to guess
 888 |         # whether the content is html or plain text
 889 |         if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
 890 |             if self.lookslikehtml(output):
 891 |                 self.contentparams['type'] = u'text/html'
 892 | 
 893 |         # remove temporary cruft from contentparams
 894 |         try:
 895 |             del self.contentparams['mode']
 896 |         except KeyError:
 897 |             pass
 898 |         try:
 899 |             del self.contentparams['base64']
 900 |         except KeyError:
 901 |             pass
 902 | 
 903 |         is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
 904 |         # resolve relative URIs within embedded markup
 905 |         if is_htmlish and RESOLVE_RELATIVE_URIS:
 906 |             if element in self.can_contain_relative_uris:
 907 |                 output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))
 908 | 
 909 |         # sanitize embedded markup
 910 |         if is_htmlish and SANITIZE_HTML:
 911 |             if element in self.can_contain_dangerous_markup:
 912 |                 output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))
 913 | 
 914 |         if self.encoding and not isinstance(output, unicode):
 915 |             output = output.decode(self.encoding, 'ignore')
 916 | 
 917 |         # address common error where people take data that is already
 918 |         # utf-8, presume that it is iso-8859-1, and re-encode it.
 919 |         if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
 920 |             try:
 921 |                 output = output.encode('iso-8859-1').decode('utf-8')
 922 |             except (UnicodeEncodeError, UnicodeDecodeError):
 923 |                 pass
 924 | 
 925 |         # map win-1252 extensions to the proper code points
 926 |         if isinstance(output, unicode):
 927 |             output = output.translate(_cp1252)
 928 | 
 929 |         # categories/tags/keywords/whatever are handled in _end_category
 930 |         if element == 'category':
 931 |             return output
 932 | 
 933 |         if element == 'title' and -1 < self.title_depth <= self.depth:
 934 |             return output
 935 | 
 936 |         # store output in appropriate place(s)
 937 |         if self.inentry and not self.insource:
 938 |             if element == 'content':
 939 |                 self.entries[-1].setdefault(element, [])
 940 |                 contentparams = copy.deepcopy(self.contentparams)
 941 |                 contentparams['value'] = output
 942 |                 self.entries[-1][element].append(contentparams)
 943 |             elif element == 'link':
 944 |                 if not self.inimage:
 945 |                     # query variables in urls in link elements are improperly
 946 |                     # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
 947 |                     # unhandled character references. fix this special case.
 948 |                     output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
 949 |                     self.entries[-1][element] = output
 950 |                     if output:
 951 |                         self.entries[-1]['links'][-1]['href'] = output
 952 |             else:
 953 |                 if element == 'description':
 954 |                     element = 'summary'
 955 |                 old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
 956 |                 if old_value_depth is None or self.depth <= old_value_depth:
 957 |                     self.property_depth_map[self.entries[-1]][element] = self.depth
 958 |                     self.entries[-1][element] = output
 959 |                 if self.incontent:
 960 |                     contentparams = copy.deepcopy(self.contentparams)
 961 |                     contentparams['value'] = output
 962 |                     self.entries[-1][element + '_detail'] = contentparams
 963 |         elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
 964 |             context = self._getContext()
 965 |             if element == 'description':
 966 |                 element = 'subtitle'
 967 |             context[element] = output
 968 |             if element == 'link':
 969 |                 # fix query variables; see above for the explanation
 970 |                 output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
 971 |                 context[element] = output
 972 |                 context['links'][-1]['href'] = output
 973 |             elif self.incontent:
 974 |                 contentparams = copy.deepcopy(self.contentparams)
 975 |                 contentparams['value'] = output
 976 |                 context[element + '_detail'] = contentparams
 977 |         return output
 978 | 
 979 |     def pushContent(self, tag, attrsD, defaultContentType, expectingText):
 980 |         self.incontent += 1
 981 |         if self.lang:
 982 |             self.lang=self.lang.replace('_','-')
 983 |         self.contentparams = FeedParserDict({
 984 |             'type': self.mapContentType(attrsD.get('type', defaultContentType)),
 985 |             'language': self.lang,
 986 |             'base': self.baseuri})
 987 |         self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
 988 |         self.push(tag, expectingText)
 989 | 
 990 |     def popContent(self, tag):
 991 |         value = self.pop(tag)
 992 |         self.incontent -= 1
 993 |         self.contentparams.clear()
 994 |         return value
 995 | 
 996 |     # a number of elements in a number of RSS variants are nominally plain
 997 |     # text, but this is routinely ignored.  This is an attempt to detect
 998 |     # the most common cases.  As false positives often result in silent
 999 |     # data loss, this function errs on the conservative side.
1000 |     @staticmethod
1001 |     def lookslikehtml(s):
1002 |         # must have a close tag or an entity reference to qualify
1003 |         if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
1004 |             return
1005 | 
1006 |         # all tags must be in a restricted subset of valid HTML tags
1007 |         if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
1008 |             re.findall(r'</?(\w+)',s)):
1009 |             return
1010 | 
1011 |         # all entities must have been defined as valid HTML entities
1012 |         if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
1013 |             return
1014 | 
1015 |         return 1
1016 | 
1017 |     def _mapToStandardPrefix(self, name):
1018 |         colonpos = name.find(':')
1019 |         if colonpos <> -1:
1020 |             prefix = name[:colonpos]
1021 |             suffix = name[colonpos+1:]
1022 |             prefix = self.namespacemap.get(prefix, prefix)
1023 |             name = prefix + ':' + suffix
1024 |         return name
1025 | 
1026 |     def _getAttribute(self, attrsD, name):
1027 |         return attrsD.get(self._mapToStandardPrefix(name))
1028 | 
1029 |     def _isBase64(self, attrsD, contentparams):
1030 |         if attrsD.get('mode', '') == 'base64':
1031 |             return 1
1032 |         if self.contentparams['type'].startswith(u'text/'):
1033 |             return 0
1034 |         if self.contentparams['type'].endswith(u'+xml'):
1035 |             return 0
1036 |         if self.contentparams['type'].endswith(u'/xml'):
1037 |             return 0
1038 |         return 1
1039 | 
1040 |     def _itsAnHrefDamnIt(self, attrsD):
1041 |         href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
1042 |         if href:
1043 |             try:
1044 |                 del attrsD['url']
1045 |             except KeyError:
1046 |                 pass
1047 |             try:
1048 |                 del attrsD['uri']
1049 |             except KeyError:
1050 |                 pass
1051 |             attrsD['href'] = href
1052 |         return attrsD
1053 | 
1054 |     def _save(self, key, value, overwrite=False):
1055 |         context = self._getContext()
1056 |         if overwrite:
1057 |             context[key] = value
1058 |         else:
1059 |             context.setdefault(key, value)
1060 | 
1061 |     def _start_rss(self, attrsD):
1062 |         versionmap = {'0.91': u'rss091u',
1063 |                       '0.92': u'rss092',
1064 |                       '0.93': u'rss093',
1065 |                       '0.94': u'rss094'}
1066 |         #If we're here then this is an RSS feed.
1067 |         #If we don't have a version or have a version that starts with something
1068 |         #other than RSS then there's been a mistake. Correct it.
1069 |         if not self.version or not self.version.startswith(u'rss'):
1070 |             attr_version = attrsD.get('version', '')
1071 |             version = versionmap.get(attr_version)
1072 |             if version:
1073 |                 self.version = version
1074 |             elif attr_version.startswith('2.'):
1075 |                 self.version = u'rss20'
1076 |             else:
1077 |                 self.version = u'rss'
1078 | 
1079 |     def _start_channel(self, attrsD):
1080 |         self.infeed = 1
1081 |         self._cdf_common(attrsD)
1082 | 
1083 |     def _cdf_common(self, attrsD):
1084 |         if 'lastmod' in attrsD:
1085 |             self._start_modified({})
1086 |             self.elementstack[-1][-1] = attrsD['lastmod']
1087 |             self._end_modified()
1088 |         if 'href' in attrsD:
1089 |             self._start_link({})
1090 |             self.elementstack[-1][-1] = attrsD['href']
1091 |             self._end_link()
1092 | 
1093 |     def _start_feed(self, attrsD):
1094 |         self.infeed = 1
1095 |         versionmap = {'0.1': u'atom01',
1096 |                       '0.2': u'atom02',
1097 |                       '0.3': u'atom03'}
1098 |         if not self.version:
1099 |             attr_version = attrsD.get('version')
1100 |             version = versionmap.get(attr_version)
1101 |             if version:
1102 |                 self.version = version
1103 |             else:
1104 |                 self.version = u'atom'
1105 | 
1106 |     def _end_channel(self):
1107 |         self.infeed = 0
1108 |     _end_feed = _end_channel
1109 | 
1110 |     def _start_image(self, attrsD):
1111 |         context = self._getContext()
1112 |         if not self.inentry:
1113 |             context.setdefault('image', FeedParserDict())
1114 |         self.inimage = 1
1115 |         self.title_depth = -1
1116 |         self.push('image', 0)
1117 | 
1118 |     def _end_image(self):
1119 |         self.pop('image')
1120 |         self.inimage = 0
1121 | 
1122 |     def _start_textinput(self, attrsD):
1123 |         context = self._getContext()
1124 |         context.setdefault('textinput', FeedParserDict())
1125 |         self.intextinput = 1
1126 |         self.title_depth = -1
1127 |         self.push('textinput', 0)
1128 |     _start_textInput = _start_textinput
1129 | 
1130 |     def _end_textinput(self):
1131 |         self.pop('textinput')
1132 |         self.intextinput = 0
1133 |     _end_textInput = _end_textinput
1134 | 
1135 |     def _start_author(self, attrsD):
1136 |         self.inauthor = 1
1137 |         self.push('author', 1)
1138 |         # Append a new FeedParserDict when expecting an author
1139 |         context = self._getContext()
1140 |         context.setdefault('authors', [])
1141 |         context['authors'].append(FeedParserDict())
1142 |     _start_managingeditor = _start_author
1143 |     _start_dc_author = _start_author
1144 |     _start_dc_creator = _start_author
1145 |     _start_itunes_author = _start_author
1146 | 
1147 |     def _end_author(self):
1148 |         self.pop('author')
1149 |         self.inauthor = 0
1150 |         self._sync_author_detail()
1151 |     _end_managingeditor = _end_author
1152 |     _end_dc_author = _end_author
1153 |     _end_dc_creator = _end_author
1154 |     _end_itunes_author = _end_author
1155 | 
1156 |     def _start_itunes_owner(self, attrsD):
1157 |         self.inpublisher = 1
1158 |         self.push('publisher', 0)
1159 | 
1160 |     def _end_itunes_owner(self):
1161 |         self.pop('publisher')
1162 |         self.inpublisher = 0
1163 |         self._sync_author_detail('publisher')
1164 | 
1165 |     def _start_contributor(self, attrsD):
1166 |         self.incontributor = 1
1167 |         context = self._getContext()
1168 |         context.setdefault('contributors', [])
1169 |         context['contributors'].append(FeedParserDict())
1170 |         self.push('contributor', 0)
1171 | 
1172 |     def _end_contributor(self):
1173 |         self.pop('contributor')
1174 |         self.incontributor = 0
1175 | 
1176 |     def _start_dc_contributor(self, attrsD):
1177 |         self.incontributor = 1
1178 |         context = self._getContext()
1179 |         context.setdefault('contributors', [])
1180 |         context['contributors'].append(FeedParserDict())
1181 |         self.push('name', 0)
1182 | 
1183 |     def _end_dc_contributor(self):
1184 |         self._end_name()
1185 |         self.incontributor = 0
1186 | 
1187 |     def _start_name(self, attrsD):
1188 |         self.push('name', 0)
1189 |     _start_itunes_name = _start_name
1190 | 
1191 |     def _end_name(self):
1192 |         value = self.pop('name')
1193 |         if self.inpublisher:
1194 |             self._save_author('name', value, 'publisher')
1195 |         elif self.inauthor:
1196 |             self._save_author('name', value)
1197 |         elif self.incontributor:
1198 |             self._save_contributor('name', value)
1199 |         elif self.intextinput:
1200 |             context = self._getContext()
1201 |             context['name'] = value
1202 |     _end_itunes_name = _end_name
1203 | 
1204 |     def _start_width(self, attrsD):
1205 |         self.push('width', 0)
1206 | 
1207 |     def _end_width(self):
1208 |         value = self.pop('width')
1209 |         try:
1210 |             value = int(value)
1211 |         except ValueError:
1212 |             value = 0
1213 |         if self.inimage:
1214 |             context = self._getContext()
1215 |             context['width'] = value
1216 | 
1217 |     def _start_height(self, attrsD):
1218 |         self.push('height', 0)
1219 | 
1220 |     def _end_height(self):
1221 |         value = self.pop('height')
1222 |         try:
1223 |             value = int(value)
1224 |         except ValueError:
1225 |             value = 0
1226 |         if self.inimage:
1227 |             context = self._getContext()
1228 |             context['height'] = value
1229 | 
1230 |     def _start_url(self, attrsD):
1231 |         self.push('href', 1)
1232 |     _start_homepage = _start_url
1233 |     _start_uri = _start_url
1234 | 
1235 |     def _end_url(self):
1236 |         value = self.pop('href')
1237 |         if self.inauthor:
1238 |             self._save_author('href', value)
1239 |         elif self.incontributor:
1240 |             self._save_contributor('href', value)
1241 |     _end_homepage = _end_url
1242 |     _end_uri = _end_url
1243 | 
1244 |     def _start_email(self, attrsD):
1245 |         self.push('email', 0)
1246 |     _start_itunes_email = _start_email
1247 | 
1248 |     def _end_email(self):
1249 |         value = self.pop('email')
1250 |         if self.inpublisher:
1251 |             self._save_author('email', value, 'publisher')
1252 |         elif self.inauthor:
1253 |             self._save_author('email', value)
1254 |         elif self.incontributor:
1255 |             self._save_contributor('email', value)
1256 |     _end_itunes_email = _end_email
1257 | 
1258 |     def _getContext(self):
1259 |         if self.insource:
1260 |             context = self.sourcedata
1261 |         elif self.inimage and 'image' in self.feeddata:
1262 |             context = self.feeddata['image']
1263 |         elif self.intextinput:
1264 |             context = self.feeddata['textinput']
1265 |         elif self.inentry:
1266 |             context = self.entries[-1]
1267 |         else:
1268 |             context = self.feeddata
1269 |         return context
1270 | 
1271 |     def _save_author(self, key, value, prefix='author'):
1272 |         context = self._getContext()
1273 |         context.setdefault(prefix + '_detail', FeedParserDict())
1274 |         context[prefix + '_detail'][key] = value
1275 |         self._sync_author_detail()
1276 |         context.setdefault('authors', [FeedParserDict()])
1277 |         context['authors'][-1][key] = value
1278 | 
1279 |     def _save_contributor(self, key, value):
1280 |         context = self._getContext()
1281 |         context.setdefault('contributors', [FeedParserDict()])
1282 |         context['contributors'][-1][key] = value
1283 | 
1284 |     def _sync_author_detail(self, key='author'):
1285 |         context = self._getContext()
1286 |         detail = context.get('%s_detail' % key)
1287 |         if detail:
1288 |             name = detail.get('name')
1289 |             email = detail.get('email')
1290 |             if name and email:
1291 |                 context[key] = u'%s (%s)' % (name, email)
1292 |             elif name:
1293 |                 context[key] = name
1294 |             elif email:
1295 |                 context[key] = email
1296 |         else:
1297 |             author, email = context.get(key), None
1298 |             if not author:
1299 |                 return
1300 |             emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
1301 |             if emailmatch:
1302 |                 email = emailmatch.group(0)
1303 |                 # probably a better way to do the following, but it passes all the tests
1304 |                 author = author.replace(email, u'')
1305 |                 author = author.replace(u'()', u'')
1306 |                 author = author.replace(u'<>', u'')
1307 |                 author = author.replace(u'&lt;&gt;', u'')
1308 |                 author = author.strip()
1309 |                 if author and (author[0] == u'('):
1310 |                     author = author[1:]
1311 |                 if author and (author[-1] == u')'):
1312 |                     author = author[:-1]
1313 |                 author = author.strip()
1314 |             if author or email:
1315 |                 context.setdefault('%s_detail' % key, FeedParserDict())
1316 |             if author:
1317 |                 context['%s_detail' % key]['name'] = author
1318 |             if email:
1319 |                 context['%s_detail' % key]['email'] = email
1320 | 
1321 |     def _start_subtitle(self, attrsD):
1322 |         self.pushContent('subtitle', attrsD, u'text/plain', 1)
1323 |     _start_tagline = _start_subtitle
1324 |     _start_itunes_subtitle = _start_subtitle
1325 | 
1326 |     def _end_subtitle(self):
1327 |         self.popContent('subtitle')
1328 |     _end_tagline = _end_subtitle
1329 |     _end_itunes_subtitle = _end_subtitle
1330 | 
1331 |     def _start_rights(self, attrsD):
1332 |         self.pushContent('rights', attrsD, u'text/plain', 1)
1333 |     _start_dc_rights = _start_rights
1334 |     _start_copyright = _start_rights
1335 | 
1336 |     def _end_rights(self):
1337 |         self.popContent('rights')
1338 |     _end_dc_rights = _end_rights
1339 |     _end_copyright = _end_rights
1340 | 
1341 |     def _start_item(self, attrsD):
1342 |         self.entries.append(FeedParserDict())
1343 |         self.push('item', 0)
1344 |         self.inentry = 1
1345 |         self.guidislink = 0
1346 |         self.title_depth = -1
1347 |         self.psc_chapters_counter = 0
1348 |         id = self._getAttribute(attrsD, 'rdf:about')
1349 |         if id:
1350 |             context = self._getContext()
1351 |             context['id'] = id
1352 |         self._cdf_common(attrsD)
1353 |     _start_entry = _start_item
1354 | 
1355 |     def _end_item(self):
1356 |         self.pop('item')
1357 |         self.inentry = 0
1358 |     _end_entry = _end_item
1359 | 
1360 |     def _start_dc_language(self, attrsD):
1361 |         self.push('language', 1)
1362 |     _start_language = _start_dc_language
1363 | 
1364 |     def _end_dc_language(self):
1365 |         self.lang = self.pop('language')
1366 |     _end_language = _end_dc_language
1367 | 
1368 |     def _start_dc_publisher(self, attrsD):
1369 |         self.push('publisher', 1)
1370 |     _start_webmaster = _start_dc_publisher
1371 | 
1372 |     def _end_dc_publisher(self):
1373 |         self.pop('publisher')
1374 |         self._sync_author_detail('publisher')
1375 |     _end_webmaster = _end_dc_publisher
1376 | 
1377 |     def _start_published(self, attrsD):
1378 |         self.push('published', 1)
1379 |     _start_dcterms_issued = _start_published
1380 |     _start_issued = _start_published
1381 |     _start_pubdate = _start_published
1382 | 
1383 |     def _end_published(self):
1384 |         value = self.pop('published')
1385 |         self._save('published_parsed', _parse_date(value), overwrite=True)
1386 |     _end_dcterms_issued = _end_published
1387 |     _end_issued = _end_published
1388 |     _end_pubdate = _end_published
1389 | 
1390 |     def _start_updated(self, attrsD):
1391 |         self.push('updated', 1)
1392 |     _start_modified = _start_updated
1393 |     _start_dcterms_modified = _start_updated
1394 |     _start_dc_date = _start_updated
1395 |     _start_lastbuilddate = _start_updated
1396 | 
1397 |     def _end_updated(self):
1398 |         value = self.pop('updated')
1399 |         parsed_value = _parse_date(value)
1400 |         self._save('updated_parsed', parsed_value, overwrite=True)
1401 |     _end_modified = _end_updated
1402 |     _end_dcterms_modified = _end_updated
1403 |     _end_dc_date = _end_updated
1404 |     _end_lastbuilddate = _end_updated
1405 | 
1406 |     def _start_created(self, attrsD):
1407 |         self.push('created', 1)
1408 |     _start_dcterms_created = _start_created
1409 | 
1410 |     def _end_created(self):
1411 |         value = self.pop('created')
1412 |         self._save('created_parsed', _parse_date(value), overwrite=True)
1413 |     _end_dcterms_created = _end_created
1414 | 
1415 |     def _start_expirationdate(self, attrsD):
1416 |         self.push('expired', 1)
1417 | 
1418 |     def _end_expirationdate(self):
1419 |         self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
1420 | 
1421 |     # geospatial location, or "where", from georss.org
1422 | 
1423 |     def _start_georssgeom(self, attrsD):
1424 |         self.push('geometry', 0)
1425 |         context = self._getContext()
1426 |         context['where'] = FeedParserDict()
1427 | 
1428 |     _start_georss_point = _start_georssgeom
1429 |     _start_georss_line = _start_georssgeom
1430 |     _start_georss_polygon = _start_georssgeom
1431 |     _start_georss_box = _start_georssgeom
1432 | 
1433 |     def _save_where(self, geometry):
1434 |         context = self._getContext()
1435 |         context['where'].update(geometry)
1436 | 
1437 |     def _end_georss_point(self):
1438 |         geometry = _parse_georss_point(self.pop('geometry'))
1439 |         if geometry:
1440 |             self._save_where(geometry)
1441 | 
1442 |     def _end_georss_line(self):
1443 |         geometry = _parse_georss_line(self.pop('geometry'))
1444 |         if geometry:
1445 |             self._save_where(geometry)
1446 |     
1447 |     def _end_georss_polygon(self):
1448 |         this = self.pop('geometry')
1449 |         geometry = _parse_georss_polygon(this)
1450 |         if geometry:
1451 |             self._save_where(geometry)
1452 | 
1453 |     def _end_georss_box(self):
1454 |         geometry = _parse_georss_box(self.pop('geometry'))
1455 |         if geometry:
1456 |             self._save_where(geometry)
1457 | 
1458 |     def _start_where(self, attrsD):
1459 |         self.push('where', 0)
1460 |         context = self._getContext()
1461 |         context['where'] = FeedParserDict()
1462 |     _start_georss_where = _start_where
1463 | 
1464 |     def _parse_srs_attrs(self, attrsD):
1465 |         srsName = attrsD.get('srsname')
1466 |         try:
1467 |             srsDimension = int(attrsD.get('srsdimension', '2'))
1468 |         except ValueError:
1469 |             srsDimension = 2
1470 |         context = self._getContext()
1471 |         context['where']['srsName'] = srsName
1472 |         context['where']['srsDimension'] = srsDimension
1473 | 
1474 |     def _start_gml_point(self, attrsD):
1475 |         self._parse_srs_attrs(attrsD)
1476 |         self.ingeometry = 1
1477 |         self.push('geometry', 0)
1478 | 
1479 |     def _start_gml_linestring(self, attrsD):
1480 |         self._parse_srs_attrs(attrsD)
1481 |         self.ingeometry = 'linestring'
1482 |         self.push('geometry', 0)
1483 | 
1484 |     def _start_gml_polygon(self, attrsD):
1485 |         self._parse_srs_attrs(attrsD)
1486 |         self.push('geometry', 0)
1487 | 
1488 |     def _start_gml_exterior(self, attrsD):
1489 |         self.push('geometry', 0)
1490 | 
1491 |     def _start_gml_linearring(self, attrsD):
1492 |         self.ingeometry = 'polygon'
1493 |         self.push('geometry', 0)
1494 | 
1495 |     def _start_gml_pos(self, attrsD):
1496 |         self.push('pos', 0)
1497 | 
1498 |     def _end_gml_pos(self):
1499 |         this = self.pop('pos')
1500 |         context = self._getContext()
1501 |         srsName = context['where'].get('srsName')
1502 |         srsDimension = context['where'].get('srsDimension', 2)
1503 |         swap = True
1504 |         if srsName and "EPSG" in srsName:
1505 |             epsg = int(srsName.split(":")[-1])
1506 |             swap = bool(epsg in _geogCS)
1507 |         geometry = _parse_georss_point(this, swap=swap, dims=srsDimension)
1508 |         if geometry:
1509 |             self._save_where(geometry)
1510 | 
1511 |     def _start_gml_poslist(self, attrsD):
1512 |         self.push('pos', 0)
1513 | 
1514 |     def _end_gml_poslist(self):
1515 |         this = self.pop('pos')
1516 |         context = self._getContext()
1517 |         srsName = context['where'].get('srsName')
1518 |         srsDimension = context['where'].get('srsDimension', 2)
1519 |         swap = True
1520 |         if srsName and "EPSG" in srsName:
1521 |             epsg = int(srsName.split(":")[-1])
1522 |             swap = bool(epsg in _geogCS)
1523 |         geometry = _parse_poslist(
1524 |             this, self.ingeometry, swap=swap, dims=srsDimension)
1525 |         if geometry:
1526 |             self._save_where(geometry)
1527 | 
1528 |     def _end_geom(self):
1529 |         self.ingeometry = 0
1530 |         self.pop('geometry')
1531 |     _end_gml_point = _end_geom
1532 |     _end_gml_linestring = _end_geom
1533 |     _end_gml_linearring = _end_geom
1534 |     _end_gml_exterior = _end_geom
1535 |     _end_gml_polygon = _end_geom
1536 | 
1537 |     def _end_where(self):
1538 |         self.pop('where')
1539 |     _end_georss_where = _end_where
1540 | 
1541 |     # end geospatial
1542 | 
1543 |     def _start_cc_license(self, attrsD):
1544 |         context = self._getContext()
1545 |         value = self._getAttribute(attrsD, 'rdf:resource')
1546 |         attrsD = FeedParserDict()
1547 |         attrsD['rel'] = u'license'
1548 |         if value:
1549 |             attrsD['href']=value
1550 |         context.setdefault('links', []).append(attrsD)
1551 | 
1552 |     def _start_creativecommons_license(self, attrsD):
1553 |         self.push('license', 1)
1554 |     _start_creativeCommons_license = _start_creativecommons_license
1555 | 
1556 |     def _end_creativecommons_license(self):
1557 |         value = self.pop('license')
1558 |         context = self._getContext()
1559 |         attrsD = FeedParserDict()
1560 |         attrsD['rel'] = u'license'
1561 |         if value:
1562 |             attrsD['href'] = value
1563 |         context.setdefault('links', []).append(attrsD)
1564 |         del context['license']
1565 |     _end_creativeCommons_license = _end_creativecommons_license
1566 | 
1567 |     def _addTag(self, term, scheme, label):
1568 |         context = self._getContext()
1569 |         tags = context.setdefault('tags', [])
1570 |         if (not term) and (not scheme) and (not label):
1571 |             return
1572 |         value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1573 |         if value not in tags:
1574 |             tags.append(value)
1575 | 
1576 |     def _start_category(self, attrsD):
1577 |         term = attrsD.get('term')
1578 |         scheme = attrsD.get('scheme', attrsD.get('domain'))
1579 |         label = attrsD.get('label')
1580 |         self._addTag(term, scheme, label)
1581 |         self.push('category', 1)
1582 |     _start_dc_subject = _start_category
1583 |     _start_keywords = _start_category
1584 | 
1585 |     def _start_media_category(self, attrsD):
1586 |         attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
1587 |         self._start_category(attrsD)
1588 | 
1589 |     def _end_itunes_keywords(self):
1590 |         for term in self.pop('itunes_keywords').split(','):
1591 |             if term.strip():
1592 |                 self._addTag(term.strip(), u'http://www.itunes.com/', None)
1593 | 
1594 |     def _start_itunes_category(self, attrsD):
1595 |         self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
1596 |         self.push('category', 1)
1597 | 
1598 |     def _end_category(self):
1599 |         value = self.pop('category')
1600 |         if not value:
1601 |             return
1602 |         context = self._getContext()
1603 |         tags = context['tags']
1604 |         if value and len(tags) and not tags[-1]['term']:
1605 |             tags[-1]['term'] = value
1606 |         else:
1607 |             self._addTag(value, None, None)
1608 |     _end_dc_subject = _end_category
1609 |     _end_keywords = _end_category
1610 |     _end_itunes_category = _end_category
1611 |     _end_media_category = _end_category
1612 | 
1613 |     def _start_cloud(self, attrsD):
1614 |         self._getContext()['cloud'] = FeedParserDict(attrsD)
1615 | 
1616 |     def _start_link(self, attrsD):
1617 |         attrsD.setdefault('rel', u'alternate')
1618 |         if attrsD['rel'] == u'self':
1619 |             attrsD.setdefault('type', u'application/atom+xml')
1620 |         else:
1621 |             attrsD.setdefault('type', u'text/html')
1622 |         context = self._getContext()
1623 |         attrsD = self._itsAnHrefDamnIt(attrsD)
1624 |         if 'href' in attrsD:
1625 |             attrsD['href'] = self.resolveURI(attrsD['href'])
1626 |         expectingText = self.infeed or self.inentry or self.insource
1627 |         context.setdefault('links', [])
1628 |         if not (self.inentry and self.inimage):
1629 |             context['links'].append(FeedParserDict(attrsD))
1630 |         if 'href' in attrsD:
1631 |             expectingText = 0
1632 |             if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1633 |                 context['link'] = attrsD['href']
1634 |         else:
1635 |             self.push('link', expectingText)
1636 | 
1637 |     def _end_link(self):
1638 |         value = self.pop('link')
1639 | 
1640 |     def _start_guid(self, attrsD):
1641 |         self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1642 |         self.push('id', 1)
1643 |     _start_id = _start_guid
1644 | 
1645 |     def _end_guid(self):
1646 |         value = self.pop('id')
1647 |         self._save('guidislink', self.guidislink and 'link' not in self._getContext())
1648 |         if self.guidislink:
1649 |             # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1650 |             # and only if the item doesn't already have a link element
1651 |             self._save('link', value)
1652 |     _end_id = _end_guid
1653 | 
1654 |     def _start_title(self, attrsD):
1655 |         if self.svgOK:
1656 |             return self.unknown_starttag('title', attrsD.items())
1657 |         self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1658 |     _start_dc_title = _start_title
1659 |     _start_media_title = _start_title
1660 | 
1661 |     def _end_title(self):
1662 |         if self.svgOK:
1663 |             return
1664 |         value = self.popContent('title')
1665 |         if not value:
1666 |             return
1667 |         self.title_depth = self.depth
1668 |     _end_dc_title = _end_title
1669 | 
1670 |     def _end_media_title(self):
1671 |         title_depth = self.title_depth
1672 |         self._end_title()
1673 |         self.title_depth = title_depth
1674 | 
1675 |     def _start_description(self, attrsD):
1676 |         context = self._getContext()
1677 |         if 'summary' in context:
1678 |             self._summaryKey = 'content'
1679 |             self._start_content(attrsD)
1680 |         else:
1681 |             self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
1682 |     _start_dc_description = _start_description
1683 |     _start_media_description = _start_description
1684 | 
1685 |     def _start_abstract(self, attrsD):
1686 |         self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1687 | 
1688 |     def _end_description(self):
1689 |         if self._summaryKey == 'content':
1690 |             self._end_content()
1691 |         else:
1692 |             value = self.popContent('description')
1693 |         self._summaryKey = None
1694 |     _end_abstract = _end_description
1695 |     _end_dc_description = _end_description
1696 |     _end_media_description = _end_description
1697 | 
1698 |     def _start_info(self, attrsD):
1699 |         self.pushContent('info', attrsD, u'text/plain', 1)
1700 |     _start_feedburner_browserfriendly = _start_info
1701 | 
1702 |     def _end_info(self):
1703 |         self.popContent('info')
1704 |     _end_feedburner_browserfriendly = _end_info
1705 | 
1706 |     def _start_generator(self, attrsD):
1707 |         if attrsD:
1708 |             attrsD = self._itsAnHrefDamnIt(attrsD)
1709 |             if 'href' in attrsD:
1710 |                 attrsD['href'] = self.resolveURI(attrsD['href'])
1711 |         self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1712 |         self.push('generator', 1)
1713 | 
1714 |     def _end_generator(self):
1715 |         value = self.pop('generator')
1716 |         context = self._getContext()
1717 |         if 'generator_detail' in context:
1718 |             context['generator_detail']['name'] = value
1719 | 
1720 |     def _start_admin_generatoragent(self, attrsD):
1721 |         self.push('generator', 1)
1722 |         value = self._getAttribute(attrsD, 'rdf:resource')
1723 |         if value:
1724 |             self.elementstack[-1][2].append(value)
1725 |         self.pop('generator')
1726 |         self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1727 | 
1728 |     def _start_admin_errorreportsto(self, attrsD):
1729 |         self.push('errorreportsto', 1)
1730 |         value = self._getAttribute(attrsD, 'rdf:resource')
1731 |         if value:
1732 |             self.elementstack[-1][2].append(value)
1733 |         self.pop('errorreportsto')
1734 | 
1735 |     def _start_summary(self, attrsD):
1736 |         context = self._getContext()
1737 |         if 'summary' in context:
1738 |             self._summaryKey = 'content'
1739 |             self._start_content(attrsD)
1740 |         else:
1741 |             self._summaryKey = 'summary'
1742 |             self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
1743 |     _start_itunes_summary = _start_summary
1744 | 
1745 |     def _end_summary(self):
1746 |         if self._summaryKey == 'content':
1747 |             self._end_content()
1748 |         else:
1749 |             self.popContent(self._summaryKey or 'summary')
1750 |         self._summaryKey = None
1751 |     _end_itunes_summary = _end_summary
1752 | 
1753 |     def _start_enclosure(self, attrsD):
1754 |         attrsD = self._itsAnHrefDamnIt(attrsD)
1755 |         context = self._getContext()
1756 |         attrsD['rel'] = u'enclosure'
1757 |         context.setdefault('links', []).append(FeedParserDict(attrsD))
1758 | 
1759 |     def _start_source(self, attrsD):
1760 |         if 'url' in attrsD:
1761 |             # This means that we're processing a source element from an RSS 2.0 feed
1762 |             self.sourcedata['href'] = attrsD[u'url']
1763 |         self.push('source', 1)
1764 |         self.insource = 1
1765 |         self.title_depth = -1
1766 | 
1767 |     def _end_source(self):
1768 |         self.insource = 0
1769 |         value = self.pop('source')
1770 |         if value:
1771 |             self.sourcedata['title'] = value
1772 |         self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1773 |         self.sourcedata.clear()
1774 | 
1775 |     def _start_content(self, attrsD):
1776 |         self.pushContent('content', attrsD, u'text/plain', 1)
1777 |         src = attrsD.get('src')
1778 |         if src:
1779 |             self.contentparams['src'] = src
1780 |         self.push('content', 1)
1781 | 
1782 |     def _start_body(self, attrsD):
1783 |         self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
1784 |     _start_xhtml_body = _start_body
1785 | 
1786 |     def _start_content_encoded(self, attrsD):
1787 |         self.pushContent('content', attrsD, u'text/html', 1)
1788 |     _start_fullitem = _start_content_encoded
1789 | 
1790 |     def _end_content(self):
1791 |         copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
1792 |         value = self.popContent('content')
1793 |         if copyToSummary:
1794 |             self._save('summary', value)
1795 | 
1796 |     _end_body = _end_content
1797 |     _end_xhtml_body = _end_content
1798 |     _end_content_encoded = _end_content
1799 |     _end_fullitem = _end_content
1800 | 
1801 |     def _start_itunes_image(self, attrsD):
1802 |         self.push('itunes_image', 0)
1803 |         if attrsD.get('href'):
1804 |             self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1805 |         elif attrsD.get('url'):
1806 |             self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
1807 |     _start_itunes_link = _start_itunes_image
1808 | 
1809 |     def _end_itunes_block(self):
1810 |         value = self.pop('itunes_block', 0)
1811 |         self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1812 | 
1813 |     def _end_itunes_explicit(self):
1814 |         value = self.pop('itunes_explicit', 0)
1815 |         # Convert 'yes' -> True, 'clean' to False, and any other value to None
1816 |         # False and None both evaluate as False, so the difference can be ignored
1817 |         # by applications that only need to know if the content is explicit.
1818 |         self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
1819 | 
1820 |     def _start_media_group(self, attrsD):
1821 |         # don't do anything, but don't break the enclosed tags either
1822 |         pass
1823 | 
1824 |     def _start_media_credit(self, attrsD):
1825 |         context = self._getContext()
1826 |         context.setdefault('media_credit', [])
1827 |         context['media_credit'].append(attrsD)
1828 |         self.push('credit', 1)
1829 | 
1830 |     def _end_media_credit(self):
1831 |         credit = self.pop('credit')
1832 |         if credit != None and len(credit.strip()) != 0:
1833 |             context = self._getContext()
1834 |             context['media_credit'][-1]['content'] = credit
1835 | 
1836 |     def _start_media_restriction(self, attrsD):
1837 |         context = self._getContext()
1838 |         context.setdefault('media_restriction', attrsD)
1839 |         self.push('restriction', 1)
1840 | 
1841 |     def _end_media_restriction(self):
1842 |         restriction = self.pop('restriction')
1843 |         if restriction != None and len(restriction.strip()) != 0:
1844 |             context = self._getContext()
1845 |             context['media_restriction']['content'] = restriction
1846 | 
1847 |     def _start_media_license(self, attrsD):
1848 |         context = self._getContext()
1849 |         context.setdefault('media_license', attrsD)
1850 |         self.push('license', 1)
1851 | 
1852 |     def _end_media_license(self):
1853 |         license = self.pop('license')
1854 |         if license != None and len(license.strip()) != 0:
1855 |             context = self._getContext()
1856 |             context['media_license']['content'] = license
1857 | 
1858 |     def _start_media_content(self, attrsD):
1859 |         context = self._getContext()
1860 |         context.setdefault('media_content', [])
1861 |         context['media_content'].append(attrsD)
1862 | 
1863 |     def _start_media_thumbnail(self, attrsD):
1864 |         context = self._getContext()
1865 |         context.setdefault('media_thumbnail', [])
1866 |         self.push('url', 1) # new
1867 |         context['media_thumbnail'].append(attrsD)
1868 | 
1869 |     def _end_media_thumbnail(self):
1870 |         url = self.pop('url')
1871 |         context = self._getContext()
1872 |         if url != None and len(url.strip()) != 0:
1873 |             if 'url' not in context['media_thumbnail'][-1]:
1874 |                 context['media_thumbnail'][-1]['url'] = url
1875 | 
1876 |     def _start_media_player(self, attrsD):
1877 |         self.push('media_player', 0)
1878 |         self._getContext()['media_player'] = FeedParserDict(attrsD)
1879 | 
1880 |     def _end_media_player(self):
1881 |         value = self.pop('media_player')
1882 |         context = self._getContext()
1883 |         context['media_player']['content'] = value
1884 | 
1885 |     def _start_newlocation(self, attrsD):
1886 |         self.push('newlocation', 1)
1887 | 
1888 |     def _end_newlocation(self):
1889 |         url = self.pop('newlocation')
1890 |         context = self._getContext()
1891 |         # don't set newlocation if the context isn't right
1892 |         if context is not self.feeddata:
1893 |             return
1894 |         context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
1895 | 
1896 |     def _start_psc_chapters(self, attrsD):
1897 |         version = self._getAttribute(attrsD, 'version')
1898 |         if version == '1.1' and self.psc_chapters_counter == 0:
1899 |             self.psc_chapters_counter += 1
1900 |             attrsD['chapters'] = []
1901 |             self._getContext()['psc_chapters'] = FeedParserDict(attrsD)
1902 |             
1903 |     def _end_psc_chapters(self):
1904 |         version = self._getContext()['psc_chapters']['version']
1905 |         if version == '1.1':
1906 |             self.psc_chapters_counter += 1
1907 |         
1908 |     def _start_psc_chapter(self, attrsD):
1909 |         if self.psc_chapters_counter == 1:
1910 |             start = self._getAttribute(attrsD, 'start')
1911 |             attrsD['start_parsed'] = _parse_psc_chapter_start(start)
1912 | 
1913 |             context = self._getContext()['psc_chapters']
1914 |             context['chapters'].append(FeedParserDict(attrsD))
1915 | 
1916 | 
1917 | if _XML_AVAILABLE:
1918 |     class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
1919 |         def __init__(self, baseuri, baselang, encoding):
1920 |             xml.sax.handler.ContentHandler.__init__(self)
1921 |             _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1922 |             self.bozo = 0
1923 |             self.exc = None
1924 |             self.decls = {}
1925 | 
1926 |         def startPrefixMapping(self, prefix, uri):
1927 |             if not uri:
1928 |                 return
1929 |             # Jython uses '' instead of None; standardize on None
1930 |             prefix = prefix or None
1931 |             self.trackNamespace(prefix, uri)
1932 |             if prefix and uri == 'http://www.w3.org/1999/xlink':
1933 |                 self.decls['xmlns:' + prefix] = uri
1934 | 
1935 |         def startElementNS(self, name, qname, attrs):
1936 |             namespace, localname = name
1937 |             lowernamespace = str(namespace or '').lower()
1938 |             if lowernamespace.find(u'backend.userland.com/rss') <> -1:
1939 |                 # match any backend.userland.com namespace
1940 |                 namespace = u'http://backend.userland.com/rss'
1941 |                 lowernamespace = namespace
1942 |             if qname and qname.find(':') > 0:
1943 |                 givenprefix = qname.split(':')[0]
1944 |             else:
1945 |                 givenprefix = None
1946 |             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1947 |             if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
1948 |                 raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
1949 |             localname = str(localname).lower()
1950 | 
1951 |             # qname implementation is horribly broken in Python 2.1 (it
1952 |             # doesn't report any), and slightly broken in Python 2.2 (it
1953 |             # doesn't report the xml: namespace). So we match up namespaces
1954 |             # with a known list first, and then possibly override them with
1955 |             # the qnames the SAX parser gives us (if indeed it gives us any
1956 |             # at all).  Thanks to MatejC for helping me test this and
1957 |             # tirelessly telling me that it didn't work yet.
1958 |             attrsD, self.decls = self.decls, {}
1959 |             if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
1960 |                 attrsD['xmlns']=namespace
1961 |             if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
1962 |                 attrsD['xmlns']=namespace
1963 | 
1964 |             if prefix:
1965 |                 localname = prefix.lower() + ':' + localname
1966 |             elif namespace and not qname: #Expat
1967 |                 for name,value in self.namespacesInUse.items():
1968 |                     if name and value == namespace:
1969 |                         localname = name + ':' + localname
1970 |                         break
1971 | 
1972 |             for (namespace, attrlocalname), attrvalue in attrs.items():
1973 |                 lowernamespace = (namespace or '').lower()
1974 |                 prefix = self._matchnamespaces.get(lowernamespace, '')
1975 |                 if prefix:
1976 |                     attrlocalname = prefix + ':' + attrlocalname
1977 |                 attrsD[str(attrlocalname).lower()] = attrvalue
1978 |             for qname in attrs.getQNames():
1979 |                 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
1980 |             localname = str(localname).lower()
1981 |             self.unknown_starttag(localname, attrsD.items())
1982 | 
1983 |         def characters(self, text):
1984 |             self.handle_data(text)
1985 | 
1986 |         def endElementNS(self, name, qname):
1987 |             namespace, localname = name
1988 |             lowernamespace = str(namespace or '').lower()
1989 |             if qname and qname.find(':') > 0:
1990 |                 givenprefix = qname.split(':')[0]
1991 |             else:
1992 |                 givenprefix = ''
1993 |             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1994 |             if prefix:
1995 |                 localname = prefix + ':' + localname
1996 |             elif namespace and not qname: #Expat
1997 |                 for name,value in self.namespacesInUse.items():
1998 |                     if name and value == namespace:
1999 |                         localname = name + ':' + localname
2000 |                         break
2001 |             localname = str(localname).lower()
2002 |             self.unknown_endtag(localname)
2003 | 
2004 |         def error(self, exc):
2005 |             self.bozo = 1
2006 |             self.exc = exc
2007 | 
2008 |         # drv_libxml2 calls warning() in some cases
2009 |         warning = error
2010 | 
2011 |         def fatalError(self, exc):
2012 |             self.error(exc)
2013 |             raise exc
2014 | 
2015 | class _BaseHTMLProcessor(sgmllib.SGMLParser):
2016 |     special = re.compile('''[<>'"]''')
2017 |     bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
2018 |     elements_no_end_tag = set([
2019 |       'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
2020 |       'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
2021 |       'source', 'track', 'wbr'
2022 |     ])
2023 | 
2024 |     def __init__(self, encoding, _type):
2025 |         self.encoding = encoding
2026 |         self._type = _type
2027 |         sgmllib.SGMLParser.__init__(self)
2028 | 
2029 |     def reset(self):
2030 |         self.pieces = []
2031 |         sgmllib.SGMLParser.reset(self)
2032 | 
2033 |     def _shorttag_replace(self, match):
2034 |         tag = match.group(1)
2035 |         if tag in self.elements_no_end_tag:
2036 |             return '<' + tag + ' />'
2037 |         else:
2038 |             return '<' + tag + '></' + tag + '>'
2039 | 
2040 |     # By declaring these methods and overriding their compiled code
2041 |     # with the code from sgmllib, the original code will execute in
2042 |     # feedparser's scope instead of sgmllib's. This means that the
2043 |     # `tagfind` and `charref` regular expressions will be found as
2044 |     # they're declared above, not as they're declared in sgmllib.
2045 |     def goahead(self, i):
2046 |         pass
2047 |     goahead.func_code = sgmllib.SGMLParser.goahead.func_code
2048 | 
2049 |     def __parse_starttag(self, i):
2050 |         pass
2051 |     __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
2052 | 
2053 |     def parse_starttag(self,i):
2054 |         j = self.__parse_starttag(i)
2055 |         if self._type == 'application/xhtml+xml':
2056 |             if j>2 and self.rawdata[j-2:j]=='/>':
2057 |                 self.unknown_endtag(self.lasttag)
2058 |         return j
2059 | 
2060 |     def feed(self, data):
2061 |         data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
2062 |         data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
2063 |         data = data.replace('&#39;', "'")
2064 |         data = data.replace('&#34;', '"')
2065 |         try:
2066 |             bytes
2067 |             if bytes is str:
2068 |                 raise NameError
2069 |             self.encoding = self.encoding + u'_INVALID_PYTHON_3'
2070 |         except NameError:
2071 |             if self.encoding and isinstance(data, unicode):
2072 |                 data = data.encode(self.encoding)
2073 |         sgmllib.SGMLParser.feed(self, data)
2074 |         sgmllib.SGMLParser.close(self)
2075 | 
2076 |     def normalize_attrs(self, attrs):
2077 |         if not attrs:
2078 |             return attrs
2079 |         # utility method to be called by descendants
2080 |         attrs = dict([(k.lower(), v) for k, v in attrs]).items()
2081 |         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
2082 |         attrs.sort()
2083 |         return attrs
2084 | 
2085 |     def unknown_starttag(self, tag, attrs):
2086 |         # called for each start tag
2087 |         # attrs is a list of (attr, value) tuples
2088 |         # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
2089 |         uattrs = []
2090 |         strattrs=''
2091 |         if attrs:
2092 |             for key, value in attrs:
2093 |                 value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
2094 |                 value = self.bare_ampersand.sub("&amp;", value)
2095 |                 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
2096 |                 if not isinstance(value, unicode):
2097 |                     value = value.decode(self.encoding, 'ignore')
2098 |                 try:
2099 |                     # Currently, in Python 3 the key is already a str, and cannot be decoded again
2100 |                     uattrs.append((unicode(key, self.encoding), value))
2101 |                 except TypeError:
2102 |                     uattrs.append((key, value))
2103 |             strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
2104 |             if self.encoding:
2105 |                 try:
2106 |                     strattrs = strattrs.encode(self.encoding)
2107 |                 except (UnicodeEncodeError, LookupError):
2108 |                     pass
2109 |         if tag in self.elements_no_end_tag:
2110 |             self.pieces.append('<%s%s />' % (tag, strattrs))
2111 |         else:
2112 |             self.pieces.append('<%s%s>' % (tag, strattrs))
2113 | 
2114 |     def unknown_endtag(self, tag):
2115 |         # called for each end tag, e.g. for </pre>, tag will be 'pre'
2116 |         # Reconstruct the original end tag.
2117 |         if tag not in self.elements_no_end_tag:
2118 |             self.pieces.append("</%s>" % tag)
2119 | 
2120 |     def handle_charref(self, ref):
2121 |         # called for each character reference, e.g. for '&#160;', ref will be '160'
2122 |         # Reconstruct the original character reference.
2123 |         ref = ref.lower()
2124 |         if ref.startswith('x'):
2125 |             value = int(ref[1:], 16)
2126 |         else:
2127 |             value = int(ref)
2128 | 
2129 |         if value in _cp1252:
2130 |             self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
2131 |         else:
2132 |             self.pieces.append('&#%s;' % ref)
2133 | 
2134 |     def handle_entityref(self, ref):
2135 |         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
2136 |         # Reconstruct the original entity reference.
2137 |         if ref in name2codepoint or ref == 'apos':
2138 |             self.pieces.append('&%s;' % ref)
2139 |         else:
2140 |             self.pieces.append('&amp;%s' % ref)
2141 | 
2142 |     def handle_data(self, text):
2143 |         # called for each block of plain text, i.e. outside of any tag and
2144 |         # not containing any character or entity references
2145 |         # Store the original text verbatim.
2146 |         self.pieces.append(text)
2147 | 
2148 |     def handle_comment(self, text):
2149 |         # called for each HTML comment, e.g. <!-- insert Javascript code here -->
2150 |         # Reconstruct the original comment.
2151 |         self.pieces.append('<!--%s-->' % text)
2152 | 
2153 |     def handle_pi(self, text):
2154 |         # called for each processing instruction, e.g. <?instruction>
2155 |         # Reconstruct original processing instruction.
2156 |         self.pieces.append('<?%s>' % text)
2157 | 
2158 |     def handle_decl(self, text):
2159 |         # called for the DOCTYPE, if present, e.g.
2160 |         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
2161 |         #     "http://www.w3.org/TR/html4/loose.dtd">
2162 |         # Reconstruct original DOCTYPE
2163 |         self.pieces.append('<!%s>' % text)
2164 | 
2165 |     _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
2166 |     def _scan_name(self, i, declstartpos):
2167 |         rawdata = self.rawdata
2168 |         n = len(rawdata)
2169 |         if i == n:
2170 |             return None, -1
2171 |         m = self._new_declname_match(rawdata, i)
2172 |         if m:
2173 |             s = m.group()
2174 |             name = s.strip()
2175 |             if (i + len(s)) == n:
2176 |                 return None, -1  # end of buffer
2177 |             return name.lower(), m.end()
2178 |         else:
2179 |             self.handle_data(rawdata)
2180 | #            self.updatepos(declstartpos, i)
2181 |             return None, -1
2182 | 
2183 |     def convert_charref(self, name):
2184 |         return '&#%s;' % name
2185 | 
2186 |     def convert_entityref(self, name):
2187 |         return '&%s;' % name
2188 | 
2189 |     def output(self):
2190 |         '''Return processed HTML as a single string'''
2191 |         return ''.join([str(p) for p in self.pieces])
2192 | 
2193 |     def parse_declaration(self, i):
2194 |         try:
2195 |             return sgmllib.SGMLParser.parse_declaration(self, i)
2196 |         except sgmllib.SGMLParseError:
2197 |             # escape the doctype declaration and continue parsing
2198 |             self.handle_data('&lt;')
2199 |             return i+1
2200 | 
2201 | class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
2202 |     def __init__(self, baseuri, baselang, encoding, entities):
2203 |         sgmllib.SGMLParser.__init__(self)
2204 |         _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
2205 |         _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
2206 |         self.entities=entities
2207 | 
2208 |     def decodeEntities(self, element, data):
2209 |         data = data.replace('&#60;', '&lt;')
2210 |         data = data.replace('&#x3c;', '&lt;')
2211 |         data = data.replace('&#x3C;', '&lt;')
2212 |         data = data.replace('&#62;', '&gt;')
2213 |         data = data.replace('&#x3e;', '&gt;')
2214 |         data = data.replace('&#x3E;', '&gt;')
2215 |         data = data.replace('&#38;', '&amp;')
2216 |         data = data.replace('&#x26;', '&amp;')
2217 |         data = data.replace('&#34;', '&quot;')
2218 |         data = data.replace('&#x22;', '&quot;')
2219 |         data = data.replace('&#39;', '&apos;')
2220 |         data = data.replace('&#x27;', '&apos;')
2221 |         if not self.contentparams.get('type', u'xml').endswith(u'xml'):
2222 |             data = data.replace('&lt;', '<')
2223 |             data = data.replace('&gt;', '>')
2224 |             data = data.replace('&amp;', '&')
2225 |             data = data.replace('&quot;', '"')
2226 |             data = data.replace('&apos;', "'")
2227 |         return data
2228 | 
2229 |     def strattrs(self, attrs):
2230 |         return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
2231 | 
2232 | class _RelativeURIResolver(_BaseHTMLProcessor):
2233 |     relative_uris = set([('a', 'href'),
2234 |                      ('applet', 'codebase'),
2235 |                      ('area', 'href'),
2236 |                      ('blockquote', 'cite'),
2237 |                      ('body', 'background'),
2238 |                      ('del', 'cite'),
2239 |                      ('form', 'action'),
2240 |                      ('frame', 'longdesc'),
2241 |                      ('frame', 'src'),
2242 |                      ('iframe', 'longdesc'),
2243 |                      ('iframe', 'src'),
2244 |                      ('head', 'profile'),
2245 |                      ('img', 'longdesc'),
2246 |                      ('img', 'src'),
2247 |                      ('img', 'usemap'),
2248 |                      ('input', 'src'),
2249 |                      ('input', 'usemap'),
2250 |                      ('ins', 'cite'),
2251 |                      ('link', 'href'),
2252 |                      ('object', 'classid'),
2253 |                      ('object', 'codebase'),
2254 |                      ('object', 'data'),
2255 |                      ('object', 'usemap'),
2256 |                      ('q', 'cite'),
2257 |                      ('script', 'src'),
2258 |                      ('video', 'poster')])
2259 | 
2260 |     def __init__(self, baseuri, encoding, _type):
2261 |         _BaseHTMLProcessor.__init__(self, encoding, _type)
2262 |         self.baseuri = baseuri
2263 | 
2264 |     def resolveURI(self, uri):
2265 |         return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
2266 | 
2267 |     def unknown_starttag(self, tag, attrs):
2268 |         attrs = self.normalize_attrs(attrs)
2269 |         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
2270 |         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
2271 | 
2272 | def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
2273 |     if not _SGML_AVAILABLE:
2274 |         return htmlSource
2275 | 
2276 |     p = _RelativeURIResolver(baseURI, encoding, _type)
2277 |     p.feed(htmlSource)
2278 |     return p.output()
2279 | 
2280 | def _makeSafeAbsoluteURI(base, rel=None):
2281 |     # bail if ACCEPTABLE_URI_SCHEMES is empty
2282 |     if not ACCEPTABLE_URI_SCHEMES:
2283 |         try:
2284 |             return _urljoin(base, rel or u'')
2285 |         except ValueError:
2286 |             return u''
2287 |     if not base:
2288 |         return rel or u''
2289 |     if not rel:
2290 |         try:
2291 |             scheme = urlparse.urlparse(base)[0]
2292 |         except ValueError:
2293 |             return u''
2294 |         if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
2295 |             return base
2296 |         return u''
2297 |     try:
2298 |         uri = _urljoin(base, rel)
2299 |     except ValueError:
2300 |         return u''
2301 |     if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
2302 |         return u''
2303 |     return uri
2304 | 
2305 | class _HTMLSanitizer(_BaseHTMLProcessor):
2306 |     acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
2307 |         'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
2308 |         'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
2309 |         'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
2310 |         'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
2311 |         'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
2312 |         'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
2313 |         'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
2314 |         'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
2315 |         'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
2316 |         'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
2317 |         'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
2318 |         'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])
2319 | 
2320 |     acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
2321 |       'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
2322 |       'background', 'balance', 'bgcolor', 'bgproperties', 'border',
2323 |       'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
2324 |       'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
2325 |       'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
2326 |       'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
2327 |       'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
2328 |       'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
2329 |       'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
2330 |       'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
2331 |       'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
2332 |       'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
2333 |       'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
2334 |       'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
2335 |       'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
2336 |       'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
2337 |       'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
2338 |       'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
2339 |       'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
2340 |       'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
2341 |       'width', 'wrap', 'xml:lang'])
2342 | 
2343 |     unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])
2344 | 
2345 |     acceptable_css_properties = set(['azimuth', 'background-color',
2346 |       'border-bottom-color', 'border-collapse', 'border-color',
2347 |       'border-left-color', 'border-right-color', 'border-top-color', 'clear',
2348 |       'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
2349 |       'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
2350 |       'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
2351 |       'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
2352 |       'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
2353 |       'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
2354 |       'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
2355 |       'white-space', 'width'])
2356 | 
2357 |     # survey of common keywords found in feeds
2358 |     acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue',
2359 |       'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
2360 |       'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
2361 |       'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
2362 |       'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
2363 |       'transparent', 'underline', 'white', 'yellow'])
2364 | 
2365 |     valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
2366 |       '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
2367 | 
2368 |     mathml_elements = set(['annotation', 'annotation-xml', 'maction', 'math',
2369 |       'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
2370 |       'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
2371 |       'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
2372 |       'munderover', 'none', 'semantics'])
2373 | 
2374 |     mathml_attributes = set(['actiontype', 'align', 'columnalign', 'columnalign',
2375 |       'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
2376 |       'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
2377 |       'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
2378 |       'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
2379 |       'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
2380 |       'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
2381 |       'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
2382 |       'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'])
2383 | 
2384 |     # svgtiny - foreignObject + linearGradient + radialGradient + stop
2385 |     svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion',
2386 |       'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
2387 |       'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
2388 |       'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
2389 |       'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
2390 |       'svg', 'switch', 'text', 'title', 'tspan', 'use'])
2391 | 
2392 |     # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
2393 |     svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic',
2394 |        'arabic-form', 'ascent', 'attributeName', 'attributeType',
2395 |        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
2396 |        'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
2397 |        'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
2398 |        'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
2399 |        'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
2400 |        'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
2401 |        'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
2402 |        'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
2403 |        'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
2404 |        'min', 'name', 'offset', 'opacity', 'orient', 'origin',
2405 |        'overline-position', 'overline-thickness', 'panose-1', 'path',
2406 |        'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
2407 |        'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
2408 |        'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
2409 |        'stop-color', 'stop-opacity', 'strikethrough-position',
2410 |        'strikethrough-thickness', 'stroke', 'stroke-dasharray',
2411 |        'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
2412 |        'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
2413 |        'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
2414 |        'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
2415 |        'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
2416 |        'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
2417 |        'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
2418 |        'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
2419 |        'y2', 'zoomAndPan'])
2420 | 
2421 |     svg_attr_map = None
2422 |     svg_elem_map = None
2423 | 
2424 |     acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule',
2425 |       'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
2426 |       'stroke-opacity'])
2427 | 
2428 |     def reset(self):
2429 |         _BaseHTMLProcessor.reset(self)
2430 |         self.unacceptablestack = 0
2431 |         self.mathmlOK = 0
2432 |         self.svgOK = 0
2433 | 
2434 |     def unknown_starttag(self, tag, attrs):
2435 |         acceptable_attributes = self.acceptable_attributes
2436 |         keymap = {}
2437 |         if not tag in self.acceptable_elements or self.svgOK:
2438 |             if tag in self.unacceptable_elements_with_end_tag:
2439 |                 self.unacceptablestack += 1
2440 | 
2441 |             # add implicit namespaces to html5 inline svg/mathml
2442 |             if self._type.endswith('html'):
2443 |                 if not dict(attrs).get('xmlns'):
2444 |                     if tag=='svg':
2445 |                         attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
2446 |                     if tag=='math':
2447 |                         attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
2448 | 
2449 |             # not otherwise acceptable, perhaps it is MathML or SVG?
2450 |             if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
2451 |                 self.mathmlOK += 1
2452 |             if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
2453 |                 self.svgOK += 1
2454 | 
2455 |             # chose acceptable attributes based on tag class, else bail
2456 |             if  self.mathmlOK and tag in self.mathml_elements:
2457 |                 acceptable_attributes = self.mathml_attributes
2458 |             elif self.svgOK and tag in self.svg_elements:
2459 |                 # for most vocabularies, lowercasing is a good idea.  Many
2460 |                 # svg elements, however, are camel case
2461 |                 if not self.svg_attr_map:
2462 |                     lower=[attr.lower() for attr in self.svg_attributes]
2463 |                     mix=[a for a in self.svg_attributes if a not in lower]
2464 |                     self.svg_attributes = lower
2465 |                     self.svg_attr_map = dict([(a.lower(),a) for a in mix])
2466 | 
2467 |                     lower=[attr.lower() for attr in self.svg_elements]
2468 |                     mix=[a for a in self.svg_elements if a not in lower]
2469 |                     self.svg_elements = lower
2470 |                     self.svg_elem_map = dict([(a.lower(),a) for a in mix])
2471 |                 acceptable_attributes = self.svg_attributes
2472 |                 tag = self.svg_elem_map.get(tag,tag)
2473 |                 keymap = self.svg_attr_map
2474 |             elif not tag in self.acceptable_elements:
2475 |                 return
2476 | 
2477 |         # declare xlink namespace, if needed
2478 |         if self.mathmlOK or self.svgOK:
2479 |             if filter(lambda (n,v): n.startswith('xlink:'),attrs):
2480 |                 if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
2481 |                     attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
2482 | 
2483 |         clean_attrs = []
2484 |         for key, value in self.normalize_attrs(attrs):
2485 |             if key in acceptable_attributes:
2486 |                 key=keymap.get(key,key)
2487 |                 # make sure the uri uses an acceptable uri scheme
2488 |                 if key == u'href':
2489 |                     value = _makeSafeAbsoluteURI(value)
2490 |                 clean_attrs.append((key,value))
2491 |             elif key=='style':
2492 |                 clean_value = self.sanitize_style(value)
2493 |                 if clean_value:
2494 |                     clean_attrs.append((key,clean_value))
2495 |         _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
2496 | 
2497 |     def unknown_endtag(self, tag):
2498 |         if not tag in self.acceptable_elements:
2499 |             if tag in self.unacceptable_elements_with_end_tag:
2500 |                 self.unacceptablestack -= 1
2501 |             if self.mathmlOK and tag in self.mathml_elements:
2502 |                 if tag == 'math' and self.mathmlOK:
2503 |                     self.mathmlOK -= 1
2504 |             elif self.svgOK and tag in self.svg_elements:
2505 |                 tag = self.svg_elem_map.get(tag,tag)
2506 |                 if tag == 'svg' and self.svgOK:
2507 |                     self.svgOK -= 1
2508 |             else:
2509 |                 return
2510 |         _BaseHTMLProcessor.unknown_endtag(self, tag)
2511 | 
2512 |     def handle_pi(self, text):
2513 |         pass
2514 | 
2515 |     def handle_decl(self, text):
2516 |         pass
2517 | 
2518 |     def handle_data(self, text):
2519 |         if not self.unacceptablestack:
2520 |             _BaseHTMLProcessor.handle_data(self, text)
2521 | 
2522 |     def sanitize_style(self, style):
2523 |         # disallow urls
2524 |         style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
2525 | 
2526 |         # gauntlet
2527 |         if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
2528 |             return ''
2529 |         # This replaced a regexp that used re.match and was prone to pathological back-tracking.
2530 |         if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
2531 |             return ''
2532 | 
2533 |         clean = []
2534 |         for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
2535 |             if not value:
2536 |                 continue
2537 |             if prop.lower() in self.acceptable_css_properties:
2538 |                 clean.append(prop + ': ' + value + ';')
2539 |             elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
2540 |                 for keyword in value.split():
2541 |                     if not keyword in self.acceptable_css_keywords and \
2542 |                         not self.valid_css_values.match(keyword):
2543 |                         break
2544 |                 else:
2545 |                     clean.append(prop + ': ' + value + ';')
2546 |             elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
2547 |                 clean.append(prop + ': ' + value + ';')
2548 | 
2549 |         return ' '.join(clean)
2550 | 
2551 |     def parse_comment(self, i, report=1):
2552 |         ret = _BaseHTMLProcessor.parse_comment(self, i, report)
2553 |         if ret >= 0:
2554 |             return ret
2555 |         # if ret == -1, this may be a malicious attempt to circumvent
2556 |         # sanitization, or a page-destroying unclosed comment
2557 |         match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
2558 |         if match:
2559 |             return match.end()
2560 |         # unclosed comment; deliberately fail to handle_data()
2561 |         return len(self.rawdata)
2562 | 
2563 | 
2564 | def _sanitizeHTML(htmlSource, encoding, _type):
2565 |     if not _SGML_AVAILABLE:
2566 |         return htmlSource
2567 |     p = _HTMLSanitizer(encoding, _type)
2568 |     htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
2569 |     p.feed(htmlSource)
2570 |     data = p.output()
2571 |     data = data.strip().replace('\r\n', '\n')
2572 |     return data
2573 | 
2574 | class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
2575 |     def http_error_default(self, req, fp, code, msg, headers):
2576 |         # The default implementation just raises HTTPError.
2577 |         # Forget that.
2578 |         fp.status = code
2579 |         return fp
2580 | 
2581 |     def http_error_301(self, req, fp, code, msg, hdrs):
2582 |         result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
2583 |                                                             code, msg, hdrs)
2584 |         result.status = code
2585 |         result.newurl = result.geturl()
2586 |         return result
2587 |     # The default implementations in urllib2.HTTPRedirectHandler
2588 |     # are identical, so hardcoding a http_error_301 call above
2589 |     # won't affect anything
2590 |     http_error_300 = http_error_301
2591 |     http_error_302 = http_error_301
2592 |     http_error_303 = http_error_301
2593 |     http_error_307 = http_error_301
2594 | 
2595 |     def http_error_401(self, req, fp, code, msg, headers):
2596 |         # Check if
2597 |         # - server requires digest auth, AND
2598 |         # - we tried (unsuccessfully) with basic auth, AND
2599 |         # If all conditions hold, parse authentication information
2600 |         # out of the Authorization header we sent the first time
2601 |         # (for the username and password) and the WWW-Authenticate
2602 |         # header the server sent back (for the realm) and retry
2603 |         # the request with the appropriate digest auth headers instead.
2604 |         # This evil genius hack has been brought to you by Aaron Swartz.
2605 |         host = urlparse.urlparse(req.get_full_url())[1]
2606 |         if base64 is None or 'Authorization' not in req.headers \
2607 |                           or 'WWW-Authenticate' not in headers:
2608 |             return self.http_error_default(req, fp, code, msg, headers)
2609 |         auth = _base64decode(req.headers['Authorization'].split(' ')[1])
2610 |         user, passw = auth.split(':')
2611 |         realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
2612 |         self.add_password(realm, host, user, passw)
2613 |         retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
2614 |         self.reset_retry_count()
2615 |         return retry
2616 | 
2617 | def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
2618 |     """URL, filename, or string --> stream
2619 | 
2620 |     This function lets you define parsers that take any input source
2621 |     (URL, pathname to local or network file, or actual data as a string)
2622 |     and deal with it in a uniform manner.  Returned object is guaranteed
2623 |     to have all the basic stdio read methods (read, readline, readlines).
2624 |     Just .close() the object when you're done with it.
2625 | 
2626 |     If the etag argument is supplied, it will be used as the value of an
2627 |     If-None-Match request header.
2628 | 
2629 |     If the modified argument is supplied, it can be a tuple of 9 integers
2630 |     (as returned by gmtime() in the standard Python time module) or a date
2631 |     string in any format supported by feedparser. Regardless, it MUST
2632 |     be in GMT (Greenwich Mean Time). It will be reformatted into an
2633 |     RFC 1123-compliant date and used as the value of an If-Modified-Since
2634 |     request header.
2635 | 
2636 |     If the agent argument is supplied, it will be used as the value of a
2637 |     User-Agent request header.
2638 | 
2639 |     If the referrer argument is supplied, it will be used as the value of a
2640 |     Referer[sic] request header.
2641 | 
2642 |     If handlers is supplied, it is a list of handlers used to build a
2643 |     urllib2 opener.
2644 | 
2645 |     if request_headers is supplied it is a dictionary of HTTP request headers
2646 |     that will override the values generated by FeedParser.
2647 |     """
2648 | 
2649 |     if hasattr(url_file_stream_or_string, 'read'):
2650 |         return url_file_stream_or_string
2651 | 
2652 |     if isinstance(url_file_stream_or_string, basestring) \
2653 |        and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
2654 |         # Deal with the feed URI scheme
2655 |         if url_file_stream_or_string.startswith('feed:http'):
2656 |             url_file_stream_or_string = url_file_stream_or_string[5:]
2657 |         elif url_file_stream_or_string.startswith('feed:'):
2658 |             url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
2659 |         if not agent:
2660 |             agent = USER_AGENT
2661 |         # Test for inline user:password credentials for HTTP basic auth
2662 |         auth = None
2663 |         if base64 and not url_file_stream_or_string.startswith('ftp:'):
2664 |             urltype, rest = urllib.splittype(url_file_stream_or_string)
2665 |             realhost, rest = urllib.splithost(rest)
2666 |             if realhost:
2667 |                 user_passwd, realhost = urllib.splituser(realhost)
2668 |                 if user_passwd:
2669 |                     url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
2670 |                     auth = base64.standard_b64encode(user_passwd).strip()
2671 | 
2672 |         # iri support
2673 |         if isinstance(url_file_stream_or_string, unicode):
2674 |             url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
2675 | 
2676 |         # try to open with urllib2 (to use optional headers)
2677 |         request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
2678 |         opener = urllib2.build_opener(*tuple(handlers + [_FeedURLHandler()]))
2679 |         opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
2680 |         try:
2681 |             return opener.open(request)
2682 |         finally:
2683 |             opener.close() # JohnD
2684 | 
2685 |     # try to open with native open function (if url_file_stream_or_string is a filename)
2686 |     try:
2687 |         return open(url_file_stream_or_string, 'rb')
2688 |     except (IOError, UnicodeEncodeError, TypeError):
2689 |         # if url_file_stream_or_string is a unicode object that
2690 |         # cannot be converted to the encoding returned by
2691 |         # sys.getfilesystemencoding(), a UnicodeEncodeError
2692 |         # will be thrown
2693 |         # If url_file_stream_or_string is a string that contains NULL
2694 |         # (such as an XML document encoded in UTF-32), TypeError will
2695 |         # be thrown.
2696 |         pass
2697 | 
2698 |     # treat url_file_stream_or_string as string
2699 |     if isinstance(url_file_stream_or_string, unicode):
2700 |         return _StringIO(url_file_stream_or_string.encode('utf-8'))
2701 |     return _StringIO(url_file_stream_or_string)
2702 | 
2703 | def _convert_to_idn(url):
2704 |     """Convert a URL to IDN notation"""
2705 |     # this function should only be called with a unicode string
2706 |     # strategy: if the host cannot be encoded in ascii, then
2707 |     # it'll be necessary to encode it in idn form
2708 |     parts = list(urlparse.urlsplit(url))
2709 |     try:
2710 |         parts[1].encode('ascii')
2711 |     except UnicodeEncodeError:
2712 |         # the url needs to be converted to idn notation
2713 |         host = parts[1].rsplit(':', 1)
2714 |         newhost = []
2715 |         port = u''
2716 |         if len(host) == 2:
2717 |             port = host.pop()
2718 |         for h in host[0].split('.'):
2719 |             newhost.append(h.encode('idna').decode('utf-8'))
2720 |         parts[1] = '.'.join(newhost)
2721 |         if port:
2722 |             parts[1] += ':' + port
2723 |         return urlparse.urlunsplit(parts)
2724 |     else:
2725 |         return url
2726 | 
2727 | def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
2728 |     request = urllib2.Request(url)
2729 |     request.add_header('User-Agent', agent)
2730 |     if etag:
2731 |         request.add_header('If-None-Match', etag)
2732 |     if isinstance(modified, basestring):
2733 |         modified = _parse_date(modified)
2734 |     elif isinstance(modified, datetime.datetime):
2735 |         modified = modified.utctimetuple()
2736 |     if modified:
2737 |         # format into an RFC 1123-compliant timestamp. We can't use
2738 |         # time.strftime() since the %a and %b directives can be affected
2739 |         # by the current locale, but RFC 2616 states that dates must be
2740 |         # in English.
2741 |         short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
2742 |         months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
2743 |         request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
2744 |     if referrer:
2745 |         request.add_header('Referer', referrer)
2746 |     if gzip and zlib:
2747 |         request.add_header('Accept-encoding', 'gzip, deflate')
2748 |     elif gzip:
2749 |         request.add_header('Accept-encoding', 'gzip')
2750 |     elif zlib:
2751 |         request.add_header('Accept-encoding', 'deflate')
2752 |     else:
2753 |         request.add_header('Accept-encoding', '')
2754 |     if auth:
2755 |         request.add_header('Authorization', 'Basic %s' % auth)
2756 |     if ACCEPT_HEADER:
2757 |         request.add_header('Accept', ACCEPT_HEADER)
2758 |     # use this for whatever -- cookies, special headers, etc
2759 |     # [('Cookie','Something'),('x-special-header','Another Value')]
2760 |     for header_name, header_value in request_headers.items():
2761 |         request.add_header(header_name, header_value)
2762 |     request.add_header('A-IM', 'feed') # RFC 3229 support
2763 |     return request
2764 | 
2765 | def _parse_psc_chapter_start(start):
2766 |     FORMAT = r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$'
2767 | 
2768 |     m = re.compile(FORMAT).match(start)
2769 |     if m is None:
2770 |         return None
2771 | 
2772 |     _, h, m, s, _, ms = m.groups()
2773 |     h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0))
2774 |     return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000)
2775 | 
2776 | _date_handlers = []
2777 | def registerDateHandler(func):
2778 |     '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
2779 |     _date_handlers.insert(0, func)
2780 | 
2781 | # ISO-8601 date parsing routines written by Fazal Majid.
2782 | # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2783 | # parser is beyond the scope of feedparser and would be a worthwhile addition
2784 | # to the Python library.
2785 | # A single regular expression cannot parse ISO 8601 date formats into groups
2786 | # as the standard is highly irregular (for instance is 030104 2003-01-04 or
2787 | # 0301-04-01), so we use templates instead.
2788 | # Please note the order in templates is significant because we need a
2789 | # greedy match.
2790 | _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
2791 |                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
2792 |                 '-YY-?MM', '-OOO', '-YY',
2793 |                 '--MM-?DD', '--MM',
2794 |                 '---DD',
2795 |                 'CC', '']
2796 | _iso8601_re = [
2797 |     tmpl.replace(
2798 |     'YYYY', r'(?P<year>\d{4})').replace(
2799 |     'YY', r'(?P<year>\d\d)').replace(
2800 |     'MM', r'(?P<month>[01]\d)').replace(
2801 |     'DD', r'(?P<day>[0123]\d)').replace(
2802 |     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
2803 |     'CC', r'(?P<century>\d\d$)')
2804 |     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
2805 |     + r'(:(?P<second>\d{2}))?'
2806 |     + r'(\.(?P<fracsecond>\d+))?'
2807 |     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
2808 |     for tmpl in _iso8601_tmpl]
2809 | try:
2810 |     del tmpl
2811 | except NameError:
2812 |     pass
2813 | _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
2814 | try:
2815 |     del regex
2816 | except NameError:
2817 |     pass
2818 |     
2819 | def _parse_date_iso8601(dateString):
2820 |     '''Parse a variety of ISO-8601-compatible formats like 20040105'''
2821 |     m = None
2822 |     for _iso8601_match in _iso8601_matches:
2823 |         m = _iso8601_match(dateString)
2824 |         if m:
2825 |             break
2826 |     if not m:
2827 |         return
2828 |     if m.span() == (0, 0):
2829 |         return
2830 |     params = m.groupdict()
2831 |     ordinal = params.get('ordinal', 0)
2832 |     if ordinal:
2833 |         ordinal = int(ordinal)
2834 |     else:
2835 |         ordinal = 0
2836 |     year = params.get('year', '--')
2837 |     if not year or year == '--':
2838 |         year = time.gmtime()[0]
2839 |     elif len(year) == 2:
2840 |         # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
2841 |         year = 100 * int(time.gmtime()[0] / 100) + int(year)
2842 |     else:
2843 |         year = int(year)
2844 |     month = params.get('month', '-')
2845 |     if not month or month == '-':
2846 |         # ordinals are NOT normalized by mktime, we simulate them
2847 |         # by setting month=1, day=ordinal
2848 |         if ordinal:
2849 |             month = 1
2850 |         else:
2851 |             month = time.gmtime()[1]
2852 |     month = int(month)
2853 |     day = params.get('day', 0)
2854 |     if not day:
2855 |         # see above
2856 |         if ordinal:
2857 |             day = ordinal
2858 |         elif params.get('century', 0) or \
2859 |                  params.get('year', 0) or params.get('month', 0):
2860 |             day = 1
2861 |         else:
2862 |             day = time.gmtime()[2]
2863 |     else:
2864 |         day = int(day)
2865 |     # special case of the century - is the first year of the 21st century
2866 |     # 2000 or 2001 ? The debate goes on...
2867 |     if 'century' in params:
2868 |         year = (int(params['century']) - 1) * 100 + 1
2869 |     # in ISO 8601 most fields are optional
2870 |     for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
2871 |         if not params.get(field, None):
2872 |             params[field] = 0
2873 |     hour = int(params.get('hour', 0))
2874 |     minute = int(params.get('minute', 0))
2875 |     second = int(float(params.get('second', 0)))
2876 |     # weekday is normalized by mktime(), we can ignore it
2877 |     weekday = 0
2878 |     daylight_savings_flag = -1
2879 |     tm = [year, month, day, hour, minute, second, weekday,
2880 |           ordinal, daylight_savings_flag]
2881 |     # ISO 8601 time zone adjustments
2882 |     tz = params.get('tz')
2883 |     if tz and tz != 'Z':
2884 |         if tz[0] == '-':
2885 |             tm[3] += int(params.get('tzhour', 0))
2886 |             tm[4] += int(params.get('tzmin', 0))
2887 |         elif tz[0] == '+':
2888 |             tm[3] -= int(params.get('tzhour', 0))
2889 |             tm[4] -= int(params.get('tzmin', 0))
2890 |         else:
2891 |             return None
2892 |     # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
2893 |     # which is guaranteed to normalize d/m/y/h/m/s.
2894 |     # Many implementations have bugs, but we'll pretend they don't.
2895 |     return time.localtime(time.mktime(tuple(tm)))
2896 | registerDateHandler(_parse_date_iso8601)
2897 | 
2898 | # 8-bit date handling routines written by ytrewq1.
2899 | _korean_year  = u'\ub144' # b3e2 in euc-kr
2900 | _korean_month = u'\uc6d4' # bff9 in euc-kr
2901 | _korean_day   = u'\uc77c' # c0cf in euc-kr
2902 | _korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
2903 | _korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
2904 | 
2905 | _korean_onblog_date_re = \
2906 |     re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
2907 |                (_korean_year, _korean_month, _korean_day))
2908 | _korean_nate_date_re = \
2909 |     re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
2910 |                (_korean_am, _korean_pm))
2911 | def _parse_date_onblog(dateString):
2912 |     '''Parse a string according to the OnBlog 8-bit date format'''
2913 |     m = _korean_onblog_date_re.match(dateString)
2914 |     if not m:
2915 |         return
2916 |     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
2917 |                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
2918 |                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
2919 |                  'zonediff': '+09:00'}
2920 |     return _parse_date_w3dtf(w3dtfdate)
2921 | registerDateHandler(_parse_date_onblog)
2922 | 
2923 | def _parse_date_nate(dateString):
2924 |     '''Parse a string according to the Nate 8-bit date format'''
2925 |     m = _korean_nate_date_re.match(dateString)
2926 |     if not m:
2927 |         return
2928 |     hour = int(m.group(5))
2929 |     ampm = m.group(4)
2930 |     if (ampm == _korean_pm):
2931 |         hour += 12
2932 |     hour = str(hour)
2933 |     if len(hour) == 1:
2934 |         hour = '0' + hour
2935 |     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
2936 |                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
2937 |                  'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
2938 |                  'zonediff': '+09:00'}
2939 |     return _parse_date_w3dtf(w3dtfdate)
2940 | registerDateHandler(_parse_date_nate)
2941 | 
2942 | # Unicode strings for Greek date strings
2943 | _greek_months = \
2944 |   { \
2945 |    u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
2946 |    u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
2947 |    u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
2948 |    u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
2949 |    u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
2950 |    u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
2951 |    u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
2952 |    u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
2953 |    u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
2954 |    u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
2955 |    u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
2956 |    u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
2957 |    u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
2958 |    u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
2959 |    u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
2960 |    u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
2961 |    u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
2962 |    u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
2963 |    u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
2964 |   }
2965 | 
2966 | _greek_wdays = \
2967 |   { \
2968 |    u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
2969 |    u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
2970 |    u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
2971 |    u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
2972 |    u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
2973 |    u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
2974 |    u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
2975 |   }
2976 | 
2977 | _greek_date_format_re = \
2978 |     re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
2979 | 
2980 | def _parse_date_greek(dateString):
2981 |     '''Parse a string according to a Greek 8-bit date format.'''
2982 |     m = _greek_date_format_re.match(dateString)
2983 |     if not m:
2984 |         return
2985 |     wday = _greek_wdays[m.group(1)]
2986 |     month = _greek_months[m.group(3)]
2987 |     rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
2988 |                  {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
2989 |                   'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
2990 |                   'zonediff': m.group(8)}
2991 |     return _parse_date_rfc822(rfc822date)
2992 | registerDateHandler(_parse_date_greek)
2993 | 
2994 | # Unicode strings for Hungarian date strings
2995 | _hungarian_months = \
2996 |   { \
2997 |     u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
2998 |     u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
2999 |     u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
3000 |     u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
3001 |     u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
3002 |     u'j\u00fanius':   u'06',  # fa in iso-8859-2
3003 |     u'j\u00falius':   u'07',  # fa in iso-8859-2
3004 |     u'augusztus':     u'08',
3005 |     u'szeptember':    u'09',
3006 |     u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
3007 |     u'november':      u'11',
3008 |     u'december':      u'12',
3009 |   }
3010 | 
3011 | _hungarian_date_format_re = \
3012 |   re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
3013 | 
3014 | def _parse_date_hungarian(dateString):
3015 |     '''Parse a string according to a Hungarian 8-bit date format.'''
3016 |     m = _hungarian_date_format_re.match(dateString)
3017 |     if not m or m.group(2) not in _hungarian_months:
3018 |         return None
3019 |     month = _hungarian_months[m.group(2)]
3020 |     day = m.group(3)
3021 |     if len(day) == 1:
3022 |         day = '0' + day
3023 |     hour = m.group(4)
3024 |     if len(hour) == 1:
3025 |         hour = '0' + hour
3026 |     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
3027 |                 {'year': m.group(1), 'month': month, 'day': day,\
3028 |                  'hour': hour, 'minute': m.group(5),\
3029 |                  'zonediff': m.group(6)}
3030 |     return _parse_date_w3dtf(w3dtfdate)
3031 | registerDateHandler(_parse_date_hungarian)
3032 | 
3033 | timezonenames = {
3034 |     'ut': 0, 'gmt': 0, 'z': 0,
3035 |     'adt': -3, 'ast': -4, 'at': -4,
3036 |     'edt': -4, 'est': -5, 'et': -5,
3037 |     'cdt': -5, 'cst': -6, 'ct': -6,
3038 |     'mdt': -6, 'mst': -7, 'mt': -7,
3039 |     'pdt': -7, 'pst': -8, 'pt': -8,
3040 |     'a': -1, 'n': 1,
3041 |     'm': -12, 'y': 12,
3042 | }
3043 | # W3 date and time format parser
3044 | # http://www.w3.org/TR/NOTE-datetime
3045 | # Also supports MSSQL-style datetimes as defined at:
3046 | # http://msdn.microsoft.com/en-us/library/ms186724.aspx
3047 | # (basically, allow a space as a date/time/timezone separator)
3048 | def _parse_date_w3dtf(datestr):
3049 |     if not datestr.strip():
3050 |         return None
3051 |     parts = datestr.lower().split('t')
3052 |     if len(parts) == 1:
3053 |         # This may be a date only, or may be an MSSQL-style date
3054 |         parts = parts[0].split()
3055 |         if len(parts) == 1:
3056 |             # Treat this as a date only
3057 |             parts.append('00:00:00z')
3058 |     elif len(parts) > 2:
3059 |         return None
3060 |     date = parts[0].split('-', 2) 
3061 |     if not date or len(date[0]) != 4:
3062 |         return None
3063 |     # Ensure that `date` has 3 elements. Using '1' sets the default
3064 |     # month to January and the default day to the 1st of the month.
3065 |     date.extend(['1'] * (3 - len(date)))
3066 |     try:
3067 |         year, month, day = [int(i) for i in date]
3068 |     except ValueError:
3069 |         # `date` may have more than 3 elements or may contain
3070 |         # non-integer strings.
3071 |         return None
3072 |     if parts[1].endswith('z'):
3073 |         parts[1] = parts[1][:-1]
3074 |         parts.append('z')
3075 |     # Append the numeric timezone offset, if any, to parts.
3076 |     # If this is an MSSQL-style date then parts[2] already contains
3077 |     # the timezone information, so `append()` will not affect it.
3078 |     # Add 1 to each value so that if `find()` returns -1 it will be
3079 |     # treated as False.
3080 |     loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1
3081 |     loc = loc - 1
3082 |     parts.append(parts[1][loc:])
3083 |     parts[1] = parts[1][:loc]
3084 |     time = parts[1].split(':', 2)
3085 |     # Ensure that time has 3 elements. Using '0' means that the
3086 |     # minutes and seconds, if missing, will default to 0.
3087 |     time.extend(['0'] * (3 - len(time)))
3088 |     tzhour = 0
3089 |     tzmin = 0
3090 |     if parts[2][:1] in ('-', '+'):
3091 |         try:
3092 |             tzhour = int(parts[2][1:3])
3093 |             tzmin = int(parts[2][4:])
3094 |         except ValueError:
3095 |             return None
3096 |         if parts[2].startswith('-'):
3097 |             tzhour = tzhour * -1
3098 |             tzmin = tzmin * -1
3099 |     else:
3100 |         tzhour = timezonenames.get(parts[2], 0)
3101 |     try:
3102 |         hour, minute, second = [int(float(i)) for i in time]
3103 |     except ValueError:
3104 |         return None
3105 |     # Create the datetime object and timezone delta objects
3106 |     try:
3107 |         stamp = datetime.datetime(year, month, day, hour, minute, second)
3108 |     except ValueError:
3109 |         return None
3110 |     delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
3111 |     # Return the date and timestamp in a UTC 9-tuple
3112 |     try:
3113 |         return (stamp - delta).utctimetuple()
3114 |     except (OverflowError, ValueError):
3115 |         # IronPython throws ValueErrors instead of OverflowErrors
3116 |         return None
3117 | 
3118 | registerDateHandler(_parse_date_w3dtf)
3119 | 
3120 | def _parse_date_rfc822(date):
3121 |     """Parse RFC 822 dates and times
3122 |     http://tools.ietf.org/html/rfc822#section-5
3123 | 
3124 |     There are some formatting differences that are accounted for:
3125 |     1. Years may be two or four digits.
3126 |     2. The month and day can be swapped.
3127 |     3. Additional timezone names are supported.
3128 |     4. A default time and timezone are assumed if only a date is present.
3129 |     """
3130 |     daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
3131 |     months = {
3132 |         'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
3133 |         'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
3134 |     }
3135 | 
3136 |     parts = date.lower().split()
3137 |     if len(parts) < 5:
3138 |         # Assume that the time and timezone are missing
3139 |         parts.extend(('00:00:00', '0000'))
3140 |     # Remove the day name
3141 |     if parts[0][:3] in daynames:
3142 |         parts = parts[1:]
3143 |     if len(parts) < 5:
3144 |         # If there are still fewer than five parts, there's not enough
3145 |         # information to interpret this
3146 |         return None
3147 |     try:
3148 |         day = int(parts[0])
3149 |     except ValueError:
3150 |         # Check if the day and month are swapped
3151 |         if months.get(parts[0][:3]):
3152 |             try:
3153 |                 day = int(parts[1])
3154 |             except ValueError:
3155 |                 return None
3156 |             else:
3157 |                 parts[1] = parts[0]
3158 |         else:
3159 |             return None
3160 |     month = months.get(parts[1][:3])
3161 |     if not month:
3162 |         return None
3163 |     try:
3164 |         year = int(parts[2])
3165 |     except ValueError:
3166 |         return None
3167 |     # Normalize two-digit years:
3168 |     # Anything in the 90's is interpreted as 1990 and on
3169 |     # Anything 89 or less is interpreted as 2089 or before
3170 |     if len(parts[2]) <= 2:
3171 |         year += (1900, 2000)[year < 90]
3172 |     timeparts = parts[3].split(':')
3173 |     timeparts = timeparts + ([0] * (3 - len(timeparts)))
3174 |     try:
3175 |         (hour, minute, second) = map(int, timeparts)
3176 |     except ValueError:
3177 |         return None
3178 |     tzhour = 0
3179 |     tzmin = 0
3180 |     # Strip 'Etc/' from the timezone
3181 |     if parts[4].startswith('etc/'):
3182 |         parts[4] = parts[4][4:]
3183 |     # Normalize timezones that start with 'gmt':
3184 |     # GMT-05:00 => -0500
3185 |     # GMT => GMT
3186 |     if parts[4].startswith('gmt'):
3187 |         parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
3188 |     # Handle timezones like '-0500', '+0500', and 'EST'
3189 |     if parts[4] and parts[4][0] in ('-', '+'):
3190 |         try:
3191 |             tzhour = int(parts[4][1:3])
3192 |             tzmin = int(parts[4][3:])
3193 |         except ValueError:
3194 |             return None
3195 |         if parts[4].startswith('-'):
3196 |             tzhour = tzhour * -1
3197 |             tzmin = tzmin * -1
3198 |     else:
3199 |         tzhour = timezonenames.get(parts[4], 0)
3200 |     # Create the datetime object and timezone delta objects
3201 |     try:
3202 |         stamp = datetime.datetime(year, month, day, hour, minute, second)
3203 |     except ValueError:
3204 |         return None
3205 |     delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
3206 |     # Return the date and timestamp in a UTC 9-tuple
3207 |     try:
3208 |         return (stamp - delta).utctimetuple()
3209 |     except (OverflowError, ValueError):
3210 |         # IronPython throws ValueErrors instead of OverflowErrors
3211 |         return None
3212 | registerDateHandler(_parse_date_rfc822)
3213 | 
3214 | _months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
3215 |            'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
3216 | def _parse_date_asctime(dt):
3217 |     """Parse asctime-style dates"""
3218 |     dayname, month, day, remainder = dt.split(None, 3)
3219 |     # Convert month and day into zero-padded integers
3220 |     month = '%02i ' % (_months.index(month.lower()) + 1)
3221 |     day = '%02i ' % (int(day),)
3222 |     dt = month + day + remainder
3223 |     return time.strptime(dt, '%m %d %H:%M:%S %Y')[:-1] + (0, )
3224 | registerDateHandler(_parse_date_asctime)
3225 | 
3226 | def _parse_date_perforce(aDateString):
3227 |     """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
3228 |     # Fri, 2006/09/15 08:19:53 EDT
3229 |     _my_date_pattern = re.compile( \
3230 |         r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
3231 | 
3232 |     m = _my_date_pattern.search(aDateString)
3233 |     if m is None:
3234 |         return None
3235 |     dow, year, month, day, hour, minute, second, tz = m.groups()
3236 |     months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
3237 |     dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
3238 |     tm = rfc822.parsedate_tz(dateString)
3239 |     if tm:
3240 |         return time.gmtime(rfc822.mktime_tz(tm))
3241 | registerDateHandler(_parse_date_perforce)
3242 | 
3243 | def _parse_date(dateString):
3244 |     '''Parses a variety of date formats into a 9-tuple in GMT'''
3245 |     if not dateString:
3246 |         return None
3247 |     for handler in _date_handlers:
3248 |         try:
3249 |             date9tuple = handler(dateString)
3250 |         except (KeyError, OverflowError, ValueError):
3251 |             continue
3252 |         if not date9tuple:
3253 |             continue
3254 |         if len(date9tuple) != 9:
3255 |             continue
3256 |         return date9tuple
3257 |     return None
3258 | 
3259 | # Each marker represents some of the characters of the opening XML
3260 | # processing instruction ('<?xm') in the specified encoding.
3261 | EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
3262 | UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
3263 | UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
3264 | UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
3265 | UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
3266 | 
3267 | ZERO_BYTES = _l2bytes([0x00, 0x00])
3268 | 
3269 | # Match the opening XML declaration.
3270 | # Example: <?xml version="1.0" encoding="utf-8"?>
3271 | RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
3272 | 
3273 | # Capture the value of the XML processing instruction's encoding attribute.
3274 | # Example: <?xml version="1.0" encoding="utf-8"?>
3275 | RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
3276 | 
3277 | def convert_to_utf8(http_headers, data):
3278 |     '''Detect and convert the character encoding to UTF-8.
3279 | 
3280 |     http_headers is a dictionary
3281 |     data is a raw string (not Unicode)'''
3282 | 
3283 |     # This is so much trickier than it sounds, it's not even funny.
3284 |     # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
3285 |     # is application/xml, application/*+xml,
3286 |     # application/xml-external-parsed-entity, or application/xml-dtd,
3287 |     # the encoding given in the charset parameter of the HTTP Content-Type
3288 |     # takes precedence over the encoding given in the XML prefix within the
3289 |     # document, and defaults to 'utf-8' if neither are specified.  But, if
3290 |     # the HTTP Content-Type is text/xml, text/*+xml, or
3291 |     # text/xml-external-parsed-entity, the encoding given in the XML prefix
3292 |     # within the document is ALWAYS IGNORED and only the encoding given in
3293 |     # the charset parameter of the HTTP Content-Type header should be
3294 |     # respected, and it defaults to 'us-ascii' if not specified.
3295 | 
3296 |     # Furthermore, discussion on the atom-syntax mailing list with the
3297 |     # author of RFC 3023 leads me to the conclusion that any document
3298 |     # served with a Content-Type of text/* and no charset parameter
3299 |     # must be treated as us-ascii.  (We now do this.)  And also that it
3300 |     # must always be flagged as non-well-formed.  (We now do this too.)
3301 | 
3302 |     # If Content-Type is unspecified (input was local file or non-HTTP source)
3303 |     # or unrecognized (server just got it totally wrong), then go by the
3304 |     # encoding given in the XML prefix of the document and default to
3305 |     # 'iso-8859-1' as per the HTTP specification (RFC 2616).
3306 | 
3307 |     # Then, assuming we didn't find a character encoding in the HTTP headers
3308 |     # (and the HTTP Content-type allowed us to look in the body), we need
3309 |     # to sniff the first few bytes of the XML data and try to determine
3310 |     # whether the encoding is ASCII-compatible.  Section F of the XML
3311 |     # specification shows the way here:
3312 |     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3313 | 
3314 |     # If the sniffed encoding is not ASCII-compatible, we need to make it
3315 |     # ASCII compatible so that we can sniff further into the XML declaration
3316 |     # to find the encoding attribute, which will tell us the true encoding.
3317 | 
3318 |     # Of course, none of this guarantees that we will be able to parse the
3319 |     # feed in the declared character encoding (assuming it was declared
3320 |     # correctly, which many are not).  iconv_codec can help a lot;
3321 |     # you should definitely install it if you can.
3322 |     # http://cjkpython.i18n.org/
3323 | 
3324 |     bom_encoding = u''
3325 |     xml_encoding = u''
3326 |     rfc3023_encoding = u''
3327 | 
3328 |     # Look at the first few bytes of the document to guess what
3329 |     # its encoding may be. We only need to decode enough of the
3330 |     # document that we can use an ASCII-compatible regular
3331 |     # expression to search for an XML encoding declaration.
3332 |     # The heuristic follows the XML specification, section F:
3333 |     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3334 |     # Check for BOMs first.
3335 |     if data[:4] == codecs.BOM_UTF32_BE:
3336 |         bom_encoding = u'utf-32be'
3337 |         data = data[4:]
3338 |     elif data[:4] == codecs.BOM_UTF32_LE:
3339 |         bom_encoding = u'utf-32le'
3340 |         data = data[4:]
3341 |     elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
3342 |         bom_encoding = u'utf-16be'
3343 |         data = data[2:]
3344 |     elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
3345 |         bom_encoding = u'utf-16le'
3346 |         data = data[2:]
3347 |     elif data[:3] == codecs.BOM_UTF8:
3348 |         bom_encoding = u'utf-8'
3349 |         data = data[3:]
3350 |     # Check for the characters '<?xm' in several encodings.
3351 |     elif data[:4] == EBCDIC_MARKER:
3352 |         bom_encoding = u'cp037'
3353 |     elif data[:4] == UTF16BE_MARKER:
3354 |         bom_encoding = u'utf-16be'
3355 |     elif data[:4] == UTF16LE_MARKER:
3356 |         bom_encoding = u'utf-16le'
3357 |     elif data[:4] == UTF32BE_MARKER:
3358 |         bom_encoding = u'utf-32be'
3359 |     elif data[:4] == UTF32LE_MARKER:
3360 |         bom_encoding = u'utf-32le'
3361 | 
3362 |     tempdata = data
3363 |     try:
3364 |         if bom_encoding:
3365 |             tempdata = data.decode(bom_encoding).encode('utf-8')
3366 |     except (UnicodeDecodeError, LookupError):
3367 |         # feedparser recognizes UTF-32 encodings that aren't
3368 |         # available in Python 2.4 and 2.5, so it's possible to
3369 |         # encounter a LookupError during decoding.
3370 |         xml_encoding_match = None
3371 |     else:
3372 |         xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
3373 | 
3374 |     if xml_encoding_match:
3375 |         xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
3376 |         # Normalize the xml_encoding if necessary.
3377 |         if bom_encoding and (xml_encoding in (
3378 |             u'u16', u'utf-16', u'utf16', u'utf_16',
3379 |             u'u32', u'utf-32', u'utf32', u'utf_32',
3380 |             u'iso-10646-ucs-2', u'iso-10646-ucs-4',
3381 |             u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
3382 |         )):
3383 |             xml_encoding = bom_encoding
3384 | 
3385 |     # Find the HTTP Content-Type and, hopefully, a character
3386 |     # encoding provided by the server. The Content-Type is used
3387 |     # to choose the "correct" encoding among the BOM encoding,
3388 |     # XML declaration encoding, and HTTP encoding, following the
3389 |     # heuristic defined in RFC 3023.
3390 |     http_content_type = http_headers.get('content-type') or ''
3391 |     http_content_type, params = cgi.parse_header(http_content_type)
3392 |     http_encoding = params.get('charset', '').replace("'", "")
3393 |     if not isinstance(http_encoding, unicode):
3394 |         http_encoding = http_encoding.decode('utf-8', 'ignore')
3395 | 
3396 |     acceptable_content_type = 0
3397 |     application_content_types = (u'application/xml', u'application/xml-dtd',
3398 |                                  u'application/xml-external-parsed-entity')
3399 |     text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
3400 |     if (http_content_type in application_content_types) or \
3401 |        (http_content_type.startswith(u'application/') and 
3402 |         http_content_type.endswith(u'+xml')):
3403 |         acceptable_content_type = 1
3404 |         rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
3405 |     elif (http_content_type in text_content_types) or \
3406 |          (http_content_type.startswith(u'text/') and
3407 |           http_content_type.endswith(u'+xml')):
3408 |         acceptable_content_type = 1
3409 |         rfc3023_encoding = http_encoding or u'us-ascii'
3410 |     elif http_content_type.startswith(u'text/'):
3411 |         rfc3023_encoding = http_encoding or u'us-ascii'
3412 |     elif http_headers and 'content-type' not in http_headers:
3413 |         rfc3023_encoding = xml_encoding or u'iso-8859-1'
3414 |     else:
3415 |         rfc3023_encoding = xml_encoding or u'utf-8'
3416 |     # gb18030 is a superset of gb2312, so always replace gb2312
3417 |     # with gb18030 for greater compatibility.
3418 |     if rfc3023_encoding.lower() == u'gb2312':
3419 |         rfc3023_encoding = u'gb18030'
3420 |     if xml_encoding.lower() == u'gb2312':
3421 |         xml_encoding = u'gb18030'
3422 | 
3423 |     # there are four encodings to keep track of:
3424 |     # - http_encoding is the encoding declared in the Content-Type HTTP header
3425 |     # - xml_encoding is the encoding declared in the <?xml declaration
3426 |     # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
3427 |     # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
3428 |     error = None
3429 | 
3430 |     if http_headers and (not acceptable_content_type):
3431 |         if 'content-type' in http_headers:
3432 |             msg = '%s is not an XML media type' % http_headers['content-type']
3433 |         else:
3434 |             msg = 'no Content-type specified'
3435 |         error = NonXMLContentType(msg)
3436 | 
3437 |     # determine character encoding
3438 |     known_encoding = 0
3439 |     chardet_encoding = None
3440 |     tried_encodings = []
3441 |     if chardet:
3442 |         chardet_encoding = chardet.detect(data)['encoding']
3443 |         if not chardet_encoding:
3444 |             chardet_encoding = ''
3445 |         if not isinstance(chardet_encoding, unicode):
3446 |             chardet_encoding = unicode(chardet_encoding, 'ascii', 'ignore')
3447 |     # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
3448 |     for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
3449 |                               chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
3450 |         if not proposed_encoding:
3451 |             continue
3452 |         if proposed_encoding in tried_encodings:
3453 |             continue
3454 |         tried_encodings.append(proposed_encoding)
3455 |         try:
3456 |             data = data.decode(proposed_encoding)
3457 |         except (UnicodeDecodeError, LookupError):
3458 |             pass
3459 |         else:
3460 |             known_encoding = 1
3461 |             # Update the encoding in the opening XML processing instruction.
3462 |             new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
3463 |             if RE_XML_DECLARATION.search(data):
3464 |                 data = RE_XML_DECLARATION.sub(new_declaration, data)
3465 |             else:
3466 |                 data = new_declaration + u'\n' + data
3467 |             data = data.encode('utf-8')
3468 |             break
3469 |     # if still no luck, give up
3470 |     if not known_encoding:
3471 |         error = CharacterEncodingUnknown(
3472 |             'document encoding unknown, I tried ' +
3473 |             '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
3474 |             (rfc3023_encoding, xml_encoding))
3475 |         rfc3023_encoding = u''
3476 |     elif proposed_encoding != rfc3023_encoding:
3477 |         error = CharacterEncodingOverride(
3478 |             'document declared as %s, but parsed as %s' %
3479 |             (rfc3023_encoding, proposed_encoding))
3480 |         rfc3023_encoding = proposed_encoding
3481 | 
3482 |     return data, rfc3023_encoding, error
3483 | 
3484 | # Match XML entity declarations.
3485 | # Example: <!ENTITY copyright "(C)">
3486 | RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
3487 | 
3488 | # Match XML DOCTYPE declarations.
3489 | # Example: <!DOCTYPE feed [ ]>
3490 | RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
3491 | 
3492 | # Match safe entity declarations.
3493 | # This will allow hexadecimal character references through,
3494 | # as well as text, but not arbitrary nested entities.
3495 | # Example: cubed "&#179;"
3496 | # Example: copyright "(C)"
3497 | # Forbidden: explode1 "&explode2;&explode2;"
3498 | RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
3499 | 
3500 | def replace_doctype(data):
3501 |     '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
3502 | 
3503 |     rss_version may be 'rss091n' or None
3504 |     stripped_data is the same XML document with a replaced DOCTYPE
3505 |     '''
3506 | 
3507 |     # Divide the document into two groups by finding the location
3508 |     # of the first element that doesn't begin with '<?' or '<!'.
3509 |     start = re.search(_s2bytes('<\w'), data)
3510 |     start = start and start.start() or -1
3511 |     head, data = data[:start+1], data[start+1:]
3512 | 
3513 |     # Save and then remove all of the ENTITY declarations.
3514 |     entity_results = RE_ENTITY_PATTERN.findall(head)
3515 |     head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
3516 | 
3517 |     # Find the DOCTYPE declaration and check the feed type.
3518 |     doctype_results = RE_DOCTYPE_PATTERN.findall(head)
3519 |     doctype = doctype_results and doctype_results[0] or _s2bytes('')
3520 |     if _s2bytes('netscape') in doctype.lower():
3521 |         version = u'rss091n'
3522 |     else:
3523 |         version = None
3524 | 
3525 |     # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
3526 |     replacement = _s2bytes('')
3527 |     if len(doctype_results) == 1 and entity_results:
3528 |         match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
3529 |         safe_entities = filter(match_safe_entities, entity_results)
3530 |         if safe_entities:
3531 |             replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
3532 |                         + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
3533 |                         + _s2bytes('>\n]>')
3534 |     data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
3535 | 
3536 |     # Precompute the safe entities for the loose parser.
3537 |     safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
3538 |                       for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
3539 |     return version, data, safe_entities
3540 | 
3541 | 
3542 | # GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates'
3543 | # items, or None in the case of a parsing error.
3544 | 
3545 | def _parse_poslist(value, geom_type, swap=True, dims=2):
3546 |     if geom_type == 'linestring':
3547 |         return _parse_georss_line(value, swap, dims)
3548 |     elif geom_type == 'polygon':
3549 |         ring = _parse_georss_line(value, swap, dims)
3550 |         return {'type': u'Polygon', 'coordinates': (ring['coordinates'],)}
3551 |     else:
3552 |         return None
3553 | 
3554 | def _gen_georss_coords(value, swap=True, dims=2):
3555 |     # A generator of (lon, lat) pairs from a string of encoded GeoRSS
3556 |     # coordinates. Converts to floats and swaps order.
3557 |     latlons = itertools.imap(float, value.strip().replace(',', ' ').split())
3558 |     nxt = latlons.next
3559 |     while True:
3560 |         t = [nxt(), nxt()][::swap and -1 or 1]
3561 |         if dims == 3:
3562 |             t.append(nxt())
3563 |         yield tuple(t)
3564 | 
3565 | def _parse_georss_point(value, swap=True, dims=2):
3566 |     # A point contains a single latitude-longitude pair, separated by
3567 |     # whitespace. We'll also handle comma separators.
3568 |     try:
3569 |         coords = list(_gen_georss_coords(value, swap, dims))
3570 |         return {u'type': u'Point', u'coordinates': coords[0]}
3571 |     except (IndexError, ValueError):
3572 |         return None
3573 | 
3574 | def _parse_georss_line(value, swap=True, dims=2):
3575 |     # A line contains a space separated list of latitude-longitude pairs in
3576 |     # WGS84 coordinate reference system, with each pair separated by
3577 |     # whitespace. There must be at least two pairs.
3578 |     try:
3579 |         coords = list(_gen_georss_coords(value, swap, dims))
3580 |         return {u'type': u'LineString', u'coordinates': coords}
3581 |     except (IndexError, ValueError):
3582 |         return None
3583 | 
3584 | def _parse_georss_polygon(value, swap=True, dims=2):
3585 |     # A polygon contains a space separated list of latitude-longitude pairs,
3586 |     # with each pair separated by whitespace. There must be at least four
3587 |     # pairs, with the last being identical to the first (so a polygon has a
3588 |     # minimum of three actual points). 
3589 |     try:
3590 |         ring = list(_gen_georss_coords(value, swap, dims))
3591 |     except (IndexError, ValueError):
3592 |         return None
3593 |     if len(ring) < 4:
3594 |         return None
3595 |     return {u'type': u'Polygon', u'coordinates': (ring,)}
3596 | 
3597 | def _parse_georss_box(value, swap=True, dims=2):
3598 |     # A bounding box is a rectangular region, often used to define the extents
3599 |     # of a map or a rough area of interest. A box contains two space seperate
3600 |     # latitude-longitude pairs, with each pair separated by whitespace. The
3601 |     # first pair is the lower corner, the second is the upper corner.
3602 |     try:
3603 |         coords = list(_gen_georss_coords(value, swap, dims))
3604 |         return {u'type': u'Box', u'coordinates': tuple(coords)}
3605 |     except (IndexError, ValueError):
3606 |         return None
3607 | 
3608 | # end geospatial parsers
3609 | 
3610 | 
3611 | def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
3612 |     '''Parse a feed from a URL, file, stream, or string.
3613 | 
3614 |     request_headers, if given, is a dict from http header name to value to add
3615 |     to the request; this overrides internally generated values.
3616 |     '''
3617 | 
3618 |     if handlers is None:
3619 |         handlers = []
3620 |     if request_headers is None:
3621 |         request_headers = {}
3622 |     if response_headers is None:
3623 |         response_headers = {}
3624 | 
3625 |     result = FeedParserDict()
3626 |     result['feed'] = FeedParserDict()
3627 |     result['entries'] = []
3628 |     result['bozo'] = 0
3629 |     if not isinstance(handlers, list):
3630 |         handlers = [handlers]
3631 |     try:
3632 |         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
3633 |         data = f.read()
3634 |     except Exception, e:
3635 |         result['bozo'] = 1
3636 |         result['bozo_exception'] = e
3637 |         data = None
3638 |         f = None
3639 | 
3640 |     if hasattr(f, 'headers'):
3641 |         result['headers'] = dict(f.headers)
3642 |     # overwrite existing headers using response_headers
3643 |     if 'headers' in result:
3644 |         result['headers'].update(response_headers)
3645 |     elif response_headers:
3646 |         result['headers'] = copy.deepcopy(response_headers)
3647 | 
3648 |     # lowercase all of the HTTP headers for comparisons per RFC 2616
3649 |     if 'headers' in result:
3650 |         http_headers = dict((k.lower(), v) for k, v in result['headers'].items())
3651 |     else:
3652 |         http_headers = {}
3653 | 
3654 |     # if feed is gzip-compressed, decompress it
3655 |     if f and data and http_headers:
3656 |         if gzip and 'gzip' in http_headers.get('content-encoding', ''):
3657 |             try:
3658 |                 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
3659 |             except (IOError, struct.error), e:
3660 |                 # IOError can occur if the gzip header is bad.
3661 |                 # struct.error can occur if the data is damaged.
3662 |                 result['bozo'] = 1
3663 |                 result['bozo_exception'] = e
3664 |                 if isinstance(e, struct.error):
3665 |                     # A gzip header was found but the data is corrupt.
3666 |                     # Ideally, we should re-request the feed without the
3667 |                     # 'Accept-encoding: gzip' header, but we don't.
3668 |                     data = None
3669 |         elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
3670 |             try:
3671 |                 data = zlib.decompress(data)
3672 |             except zlib.error, e:
3673 |                 try:
3674 |                     # The data may have no headers and no checksum.
3675 |                     data = zlib.decompress(data, -15)
3676 |                 except zlib.error, e:
3677 |                     result['bozo'] = 1
3678 |                     result['bozo_exception'] = e
3679 | 
3680 |     # save HTTP headers
3681 |     if http_headers:
3682 |         if 'etag' in http_headers:
3683 |             etag = http_headers.get('etag', u'')
3684 |             if not isinstance(etag, unicode):
3685 |                 etag = etag.decode('utf-8', 'ignore')
3686 |             if etag:
3687 |                 result['etag'] = etag
3688 |         if 'last-modified' in http_headers:
3689 |             modified = http_headers.get('last-modified', u'')
3690 |             if modified:
3691 |                 result['modified'] = modified
3692 |                 result['modified_parsed'] = _parse_date(modified)
3693 |     if hasattr(f, 'url'):
3694 |         if not isinstance(f.url, unicode):
3695 |             result['href'] = f.url.decode('utf-8', 'ignore')
3696 |         else:
3697 |             result['href'] = f.url
3698 |         result['status'] = 200
3699 |     if hasattr(f, 'status'):
3700 |         result['status'] = f.status
3701 |     if hasattr(f, 'close'):
3702 |         f.close()
3703 | 
3704 |     if data is None:
3705 |         return result
3706 | 
3707 |     # Stop processing if the server sent HTTP 304 Not Modified.
3708 |     if getattr(f, 'code', 0) == 304:
3709 |         result['version'] = u''
3710 |         result['debug_message'] = 'The feed has not changed since you last checked, ' + \
3711 |             'so the server sent no data.  This is a feature, not a bug!'
3712 |         return result
3713 | 
3714 |     data, result['encoding'], error = convert_to_utf8(http_headers, data)
3715 |     use_strict_parser = result['encoding'] and True or False
3716 |     if error is not None:
3717 |         result['bozo'] = 1
3718 |         result['bozo_exception'] = error
3719 | 
3720 |     result['version'], data, entities = replace_doctype(data)
3721 | 
3722 |     # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
3723 |     contentloc = http_headers.get('content-location', u'')
3724 |     href = result.get('href', u'')
3725 |     baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
3726 | 
3727 |     baselang = http_headers.get('content-language', None)
3728 |     if not isinstance(baselang, unicode) and baselang is not None:
3729 |         baselang = baselang.decode('utf-8', 'ignore')
3730 | 
3731 |     if not _XML_AVAILABLE:
3732 |         use_strict_parser = 0
3733 |     if use_strict_parser:
3734 |         # initialize the SAX parser
3735 |         feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
3736 |         saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
3737 |         saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
3738 |         try:
3739 |             # disable downloading external doctype references, if possible
3740 |             saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
3741 |         except xml.sax.SAXNotSupportedException:
3742 |             pass
3743 |         saxparser.setContentHandler(feedparser)
3744 |         saxparser.setErrorHandler(feedparser)
3745 |         source = xml.sax.xmlreader.InputSource()
3746 |         source.setByteStream(_StringIO(data))
3747 |         try:
3748 |             saxparser.parse(source)
3749 |         except xml.sax.SAXException, e:
3750 |             result['bozo'] = 1
3751 |             result['bozo_exception'] = feedparser.exc or e
3752 |             use_strict_parser = 0
3753 |     if not use_strict_parser and _SGML_AVAILABLE:
3754 |         feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
3755 |         feedparser.feed(data.decode('utf-8', 'replace'))
3756 |     result['feed'] = feedparser.feeddata
3757 |     result['entries'] = feedparser.entries
3758 |     result['version'] = result['version'] or feedparser.version
3759 |     result['namespaces'] = feedparser.namespacesInUse
3760 |     return result
3761 | 
3762 | # The list of EPSG codes for geographic (latitude/longitude) coordinate
3763 | # systems to support decoding of GeoRSS GML profiles.
3764 | _geogCS = [
3765 | 3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008,
3766 | 4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022,
3767 | 4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036,
3768 | 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081,
3769 | 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132,
3770 | 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145,
3771 | 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158,
3772 | 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171,
3773 | 4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185,
3774 | 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200,
3775 | 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213,
3776 | 4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227,
3777 | 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240,
3778 | 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253,
3779 | 4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266,
3780 | 4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279,
3781 | 4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293,
3782 | 4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307,
3783 | 4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322,
3784 | 4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603,
3785 | 4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616,
3786 | 4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629,
3787 | 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642,
3788 | 4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665,
3789 | 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678,
3790 | 4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691,
3791 | 4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704,
3792 | 4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717,
3793 | 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730,
3794 | 4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743,
3795 | 4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756,
3796 | 4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804,
3797 | 4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818,
3798 | 4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979 ]


--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
 1 | indexes:
 2 | 
 3 | # AUTOGENERATED
 4 | 
 5 | # This index.yaml is automatically updated whenever the dev_appserver
 6 | # detects that a new type of query is run.  If you want to manage the
 7 | # index.yaml file manually, remove the above marker line (the line
 8 | # saying "# AUTOGENERATED").  If you want to manage some indexes
 9 | # manually, move them above the marker line.  The index.yaml file is
10 | # automatically uploaded to the admin console when you next deploy
11 | # your application using appcfg.py.
12 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # Copyright 2007 Google Inc.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | #
 17 | 
 18 | import os
 19 | import logging
 20 | import webapp2
 21 | import json
 22 | 
 23 | from google.appengine.api import memcache
 24 | from google.appengine.api import urlfetch
 25 | from google.appengine.ext.webapp import template
 26 | 
 27 | import extractlinks
 28 | from extractlinks import LinkExtractor
 29 | import feedparser
 30 | import re
 31 | import urlparse
 32 | 
 33 | class MainHandler(webapp2.RequestHandler):
 34 | 
 35 |   def render_json(self, obj):
 36 |     self.response.headers["Content-Type"] = 'text/javascript'
 37 |     if self.request.get("callback"):
 38 |       self.response.write(self.request.get("callback") + "(" + json.dumps(obj) + ")")
 39 |     else:
 40 |       self.response.write(json.dumps(obj))
 41 | 
 42 |   # Correct feed urls with rel="self" and add hubs
 43 |   def extend_feed(self, feed, links):
 44 |     feed_self = next((l for l in links if l['rel'] == 'self'), None)
 45 |     if feed_self is not None:
 46 |       feed['href'] = feed_self['href']
 47 |       feed['type'] = feed_self['type']
 48 |     feed['hubs'] = [l for l in links if l['rel'] == 'hub']
 49 | 
 50 |   def get(self):
 51 |     # We need to clean up the url first and remove any fragment
 52 |     site_url = urlparse.urldefrag(self.request.get("url"))[0]
 53 |     force = (self.request.get("force").lower()) in ['true', '1']
 54 |     extend = (self.request.get("extend").lower()) in ['true', '1']
 55 |     feeds = [] # default value
 56 | 
 57 |     if site_url:
 58 |       feeds = memcache.get(site_url + "." + str(extend))
 59 |       if feeds is not None and not force:
 60 |         # good
 61 |         logging.debug("Memcache hit.")
 62 |         self.render_json(feeds)
 63 |       else:
 64 |         logging.debug("Memcache miss.")
 65 |         try:
 66 |           result = urlfetch.fetch(url=site_url, deadline=10)
 67 |           parser = LinkExtractor()
 68 |           parser.set_base_url(site_url)
 69 |           parser.feed(result.content)
 70 |           if parser.links:
 71 |             feeds = parser.links
 72 |           else:
 73 |             feeds = []
 74 | 
 75 |           if not feeds:
 76 |             # Let's check if by any chance this is actually not a feed?
 77 |             data = feedparser.parse(result.content)
 78 |             if data.bozo == 0:
 79 |               feed = {'title': data.feed.get('title', ''), 'rel': 'self', 'type': 'application/atom+xml', 'href': site_url}
 80 |               links = data.feed.get('links', [])
 81 |               if extend:
 82 |                 self.extend_feed(feed, links)
 83 |               feeds = [feed]
 84 |           else:
 85 |             if extend:
 86 |               for f in feeds:
 87 |                 data = feedparser.parse(f['href'])
 88 |                 links = data.feed.get('links', [])
 89 |                 self.extend_feed(f, links)
 90 | 
 91 |         except:
 92 |           feeds = []
 93 | 
 94 |         if not memcache.set(site_url + "." + str(extend), feeds, 86400):
 95 |           logging.error("Memcache set failed.")
 96 |         else:
 97 |           logging.debug("Memcache set.")
 98 |         self.render_json(feeds)
 99 | 
100 |     else:
101 |       self.response.write(template.render(os.path.join(os.path.dirname(__file__), 'templates', "index.html"), {}))
102 | 
103 | app = webapp2.WSGIApplication([('/', MainHandler)], debug=True)
104 | 


--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
 2 |    "http://www.w3.org/TR/html4/loose.dtd">
 3 | <html lang="en">
 4 | <head>
 5 |     <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 6 |     <title>Feediscovery</title>
 7 |     <meta name="author" content="Julien Genestoux">
 8 | </head>
 9 | <body style="background-color:#333; color: white; font-family:helvetica; padding: 0px 10% 0px 10%; font-size:1.4em">
10 |     <h1 style="color:#FF0080; font-size: 3.4em;">Fee<span style="color: #FF8000;">disco</span>very</h1>
11 |     <p>
12 |         Let the music play!
13 |     </p>
14 |     <h2>Query:</h2>
15 |     <p><code>GET http://feediscovery.appspot.com/?url=http://blog.superfeedr.com</code></p>
16 |     <small>You can as well add a <em>callback</em> parameter.</small>
17 |     <h2>Response:</h2>
18 |     
19 |     <p><code>[{"href":"http://blog.superfeedr.com/atom.xml","title":"Superfeedr' thoughts","rel":"alternate","type":"application/atom+xml"}]</code></p>
20 |     
21 |     <span style="font-size:0.6em;position:absolute; bottom:0; margin:0px 0px 20px 0px; padding:0px;">
22 |         Brought to you by <a href="http://superfeedr.com" style="color:#CCC" >Superfeedr</a>&nbsp;<img src="http://www.gravatar.com/avatar/a2f7d4dd6df7dd59e4adab811c00a3a1?s=24" style="vertical-align:middle;"> |
23 |     <a href="http://blog.superfeedr.com/api/feeds/discovery/feediscovery/feediscovery/" style="color:#CCC">Learn more</a> | <a href="http://github.com/superfeedr/feediscovery"  style="color:#CCC;">Make it better!</a>
24 |     </span>
25 | </body>
26 | </html>
27 | 


--------------------------------------------------------------------------------