├── .gitignore
├── README.textile
├── app.yaml
├── extractlinks.py
├── feedparser.py
├── index.yaml
├── main.py
└── templates
    ├── index.html
    └── subscribe.html

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 
--------------------------------------------------------------------------------
/README.textile:
--------------------------------------------------------------------------------
1 | h2. notifixlite
2 | 
3 | *WARNING* Google App Engine will soon stop supporting XMPP. As a consequence, we are no longer maintaining this code. However, we have released a much better (and more complete!) bot called "Notifix":https://github.com/julien51/notifix. Check it out!
4 | 
5 | h3. What is it?
6 | 
7 | "notifixlite":http://notifixlite.appspot.com/ is a simple RSS to IM (Jabber, XMPP, Gtalk) bot.
8 | 
9 | 
--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
1 | application: notifixlite
2 | version: 1
3 | runtime: python
4 | api_version: 1
5 | 
6 | handlers:
7 | - url: /remote_api
8 |   script: $PYTHON_LIB/google/appengine/ext/remote_api/handler.py
9 |   login: admin
10 | 
11 | - url: .*
12 |   script: main.py
13 | 
14 | inbound_services:
15 | - xmpp_message
--------------------------------------------------------------------------------
/extractlinks.py:
--------------------------------------------------------------------------------
1 | from sgmllib import SGMLParser
2 | from urlparse import urlparse
3 | from urlparse import urljoin
4 | 
5 | import logging
6 | 
7 | 
8 | class LinkExtractor(SGMLParser):
9 |     """A simple LinkExtractor class"""
10 | 
11 |     def set_base_url(self, base_url=None):
12 |         self.base_url = base_url
13 | 
14 |     def make_absolute_and_add(self, dict_feed=None):
15 |         if 'href' in dict_feed:
16 |             p = urlparse(dict_feed['href'])
17 |             if p.scheme != "":
18 |                 self.links.append(dict_feed)
19 |             else:
20 |                 dict_feed['href'] = urljoin(self.base_url, dict_feed['href'])
21 |                 self.links.append(dict_feed)
22 | 
23 |     def reset(self):
24 |         SGMLParser.reset(self)
25 |         self.links = []
26 | 
27 |     def start_link(self, attrs):
28 |         if not ('rel', 'alternate') in attrs: return
29 |         if ('type', 'application/rss+xml') in attrs:
30 |             self.make_absolute_and_add(dict(attrs))
31 |         if ('type', 'application/atom+xml') in attrs:
32 |             self.make_absolute_and_add(dict(attrs))
33 | 
--------------------------------------------------------------------------------
/feedparser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Universal feed parser
3 | 
4 | Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5 | 
6 | Visit http://feedparser.org/ for the latest version
7 | Visit http://feedparser.org/docs/ for the latest documentation
8 | 
9 | Required: Python 2.1 or later
10 | Recommended: Python 2.3 or later
11 | Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
12 | """
13 | 
14 | __version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
15 | __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
16 | 
17 | Redistribution and use in source and binary forms, with or without modification,
18 | are permitted provided that the following conditions are met:
19 | 
20 | * Redistributions of source code must retain the above copyright notice,
21 |   this list of conditions and the following disclaimer.
22 | * Redistributions in binary form must reproduce the above copyright notice,
23 |   this list of conditions and the following disclaimer in the documentation
24 |   and/or other materials provided with the distribution.
25 | 
26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 | POSSIBILITY OF SUCH DAMAGE."""
37 | __author__ = "Mark Pilgrim <http://diveintomark.org/>"
38 | __contributors__ = ["Jason Diamond <http://injektilo.org/>",
39 |                     "John Beimler <http://john.beimler.org/>",
40 |                     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
41 |                     "Aaron Swartz <http://aaronsw.com/>",
42 |                     "Kevin Marks <http://epeus.blogspot.com/>"]
43 | _debug = 0
44 | 
45 | # HTTP "User-Agent" header to send to servers when downloading feeds.
46 | # If you are embedding feedparser in a larger application, you should
47 | # change this to your application name and URL.
48 | USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
49 | 
50 | # HTTP "Accept" header to send to servers when downloading feeds. If you don't
51 | # want to send an Accept header, set this to None.
52 | ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
53 | 
54 | # List of preferred XML parsers, by SAX driver name. These will be tried first,
55 | # but if they're not installed, Python will keep searching through its own list
56 | # of pre-installed parsers until it finds one that supports everything we need.
57 | PREFERRED_XML_PARSERS = ["drv_libxml2"]
58 | 
59 | # If you want feedparser to automatically run HTML markup through HTML Tidy, set
60 | # this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
61 | # or utidylib <http://utidylib.berlios.de/>.
62 | TIDY_MARKUP = 0
63 | 
64 | # List of Python interfaces for HTML Tidy, in order of preference. Only useful
65 | # if TIDY_MARKUP = 1
66 | PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
67 | 
68 | # ---------- required modules (should come with any Python distribution) ----------
69 | import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
70 | try:
71 |     from cStringIO import StringIO as _StringIO
72 | except:
73 |     from StringIO import StringIO as _StringIO
74 | 
75 | # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
76 | 
77 | # gzip is included with most Python distributions, but may not be available if you compiled your own
78 | try:
79 |     import gzip
80 | except:
81 |     gzip = None
82 | try:
83 |     import zlib
84 | except:
85 |     zlib = None
86 | 
87 | # If a real XML parser is available, feedparser will attempt to use it. feedparser has
88 | # been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
89 | # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
90 | # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
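# ---------- usage sketch (editorial addition, not part of the original source) ----------
# An embedding application such as notifixlite would normally override the
# module constants above before fetching anything; the app name and feed URL
# below are illustrative assumptions only:
#
#     import feedparser
#     feedparser.USER_AGENT = 'notifixlite/1.0 +http://notifixlite.appspot.com/'
#     d = feedparser.parse('http://example.org/feed.xml')
#     print d.feed.get('title'), len(d.entries)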
91 | try:
92 |     import xml.sax
93 |     xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
94 |     from xml.sax.saxutils import escape as _xmlescape
95 |     _XML_AVAILABLE = 1
96 | except:
97 |     _XML_AVAILABLE = 0
98 |     def _xmlescape(data):
99 |         data = data.replace('&', '&amp;')
100 |         data = data.replace('>', '&gt;')
101 |         data = data.replace('<', '&lt;')
102 |         return data
103 | 
104 | # base64 support for Atom feeds that contain embedded binary data
105 | try:
106 |     import base64, binascii
107 | except:
108 |     base64 = binascii = None
109 | 
110 | # cjkcodecs and iconv_codec provide support for more character encodings.
111 | # Both are available from http://cjkpython.i18n.org/
112 | try:
113 |     import cjkcodecs.aliases
114 | except:
115 |     pass
116 | try:
117 |     import iconv_codec
118 | except:
119 |     pass
120 | 
121 | # chardet library auto-detects character encodings
122 | # Download from http://chardet.feedparser.org/
123 | try:
124 |     import chardet
125 |     if _debug:
126 |         import chardet.constants
127 |         chardet.constants._debug = 1
128 | except:
129 |     chardet = None
130 | 
131 | # ---------- don't touch these ----------
132 | class ThingsNobodyCaresAboutButMe(Exception): pass
133 | class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
134 | class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
135 | class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
136 | class UndeclaredNamespace(Exception): pass
137 | 
138 | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
139 | sgmllib.special = re.compile('<!')
436 |             self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
437 | 
438 |         # match namespaces
439 |         if tag.find(':') <> -1:
440 |             prefix, suffix = tag.split(':', 1)
441 |         else:
442 |             prefix, suffix = '', tag
443 |         prefix = self.namespacemap.get(prefix, prefix)
444 |         if prefix:
445 |             prefix = prefix + '_'
446 | 
447 |         # special hack for better tracking of empty textinput/image elements in illformed feeds
448 |         if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
449 |             self.intextinput = 0
450 |         if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
451 |             self.inimage = 0
452 | 
453 |         # call special handler (if defined) or default handler
454 |         methodname = '_start_' + prefix + suffix
455 |         try:
456 |             method = getattr(self, methodname)
457 |             return method(attrsD)
458 |         except AttributeError:
459 |             return self.push(prefix + suffix, 1)
460 | 
461 |     def unknown_endtag(self, tag):
462 |         if _debug: sys.stderr.write('end %s\n' % tag)
463 |         # match namespaces
464 |         if tag.find(':') <> -1:
465 |             prefix, suffix = tag.split(':', 1)
466 |         else:
467 |             prefix, suffix = '', tag
468 |         prefix = self.namespacemap.get(prefix, prefix)
469 |         if prefix:
470 |             prefix = prefix + '_'
471 | 
472 |         # call special handler (if defined) or default handler
473 |         methodname = '_end_' + prefix + suffix
474 |         try:
475 |             method = getattr(self, methodname)
476 |             method()
477 |         except AttributeError:
478 |             self.pop(prefix + suffix)
479 | 
480 |         # track inline content
481 |         if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
482 |             # element declared itself as escaped markup, but it isn't really
483 |             self.contentparams['type'] = 'application/xhtml+xml'
484 |         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
485 |             tag = tag.split(':')[-1]
486 |             self.handle_data('</%s>' % tag, escape=0)
487 | 
488 |         # track xml:base and xml:lang going out of scope
489 |         if self.basestack:
490 |             self.basestack.pop()
491 |             if self.basestack and self.basestack[-1]:
492 |                 self.baseuri = self.basestack[-1]
493 |         if self.langstack:
494 |             self.langstack.pop()
495 |             if self.langstack: # and (self.langstack[-1] is not None):
496 |                 self.lang = self.langstack[-1]
497 | 
498 |     def handle_charref(self, ref):
499 |         # called for each character reference, e.g. for '&#160;', ref will be '160'
500 |         if not self.elementstack: return
501 |         ref = ref.lower()
502 |         if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
503 |             text = '&#%s;' % ref
504 |         else:
505 |             if ref[0] == 'x':
506 |                 c = int(ref[1:], 16)
507 |             else:
508 |                 c = int(ref)
509 |             text = unichr(c).encode('utf-8')
510 |         self.elementstack[-1][2].append(text)
511 | 
512 |     def handle_entityref(self, ref):
513 |         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
514 |         if not self.elementstack: return
515 |         if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
516 |         if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
517 |             text = '&%s;' % ref
518 |         else:
519 |             # entity resolution graciously donated by Aaron Swartz
520 |             def name2cp(k):
521 |                 import htmlentitydefs
522 |                 if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
523 |                     return htmlentitydefs.name2codepoint[k]
524 |                 k = htmlentitydefs.entitydefs[k]
525 |                 if k.startswith('&#') and k.endswith(';'):
526 |                     return int(k[2:-1]) # not in latin-1
527 |                 return ord(k)
528 |             try: name2cp(ref)
529 |             except KeyError: text = '&%s;' % ref
530 |             else: text = unichr(name2cp(ref)).encode('utf-8')
531 |         self.elementstack[-1][2].append(text)
532 | 
533 |     def handle_data(self, text, escape=1):
534 |         # called for each block of plain text, i.e. outside of any tag and
535 |         # not containing any character or entity references
536 |         if not self.elementstack: return
537 |         if escape and self.contentparams.get('type') == 'application/xhtml+xml':
538 |             text = _xmlescape(text)
539 |         self.elementstack[-1][2].append(text)
540 | 
541 |     def handle_comment(self, text):
542 |         # called for each comment, e.g. <!-- insert message here -->
543 |         pass
544 | 
545 |     def handle_pi(self, text):
546 |         # called for each processing instruction, e.g. <?instruction>
547 |         pass
548 | 
549 |     def handle_decl(self, text):
550 |         pass
551 | 
552 |     def parse_declaration(self, i):
553 |         # override internal declaration handler to handle CDATA blocks
554 |         if _debug: sys.stderr.write('entering parse_declaration\n')
555 |         if self.rawdata[i:i+9] == '<![CDATA[':
556 |             k = self.rawdata.find(']]>', i)
557 |             if k == -1: k = len(self.rawdata)
558 |             self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
559 |             return k+3
560 |         else:
561 |             k = self.rawdata.find('>', i)
562 |             return k+1
563 | 
564 |     def mapContentType(self, contentType):
565 |         contentType = contentType.lower()
566 |         if contentType == 'text':
567 |             contentType = 'text/plain'
568 |         elif contentType == 'html':
569 |             contentType = 'text/html'
570 |         elif contentType == 'xhtml':
571 |             contentType = 'application/xhtml+xml'
572 |         return contentType
573 | 
574 |     def trackNamespace(self, prefix, uri):
575 |         loweruri = uri.lower()
576 |         if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
577 |             self.version = 'rss090'
578 |         if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
579 |             self.version = 'rss10'
580 |         if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
581 |             self.version = 'atom10'
582 |         if loweruri.find('backend.userland.com/rss') <> -1:
583 |             # match any backend.userland.com namespace
584 |             uri = 'http://backend.userland.com/rss'
585 |             loweruri = uri
586 |         if self._matchnamespaces.has_key(loweruri):
587 |             self.namespacemap[prefix] = self._matchnamespaces[loweruri]
588 |             self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
589 |         else:
590 |             self.namespacesInUse[prefix or ''] = uri
591 | 
592 |     def resolveURI(self, uri):
593 |         return _urljoin(self.baseuri or '', uri)
594 | 
595 |     def decodeEntities(self, element, data):
596 |         return data
597 | 
598 |     def push(self, element, expectingText):
599 |         self.elementstack.append([element, expectingText, []])
600 | 
601 |     def pop(self, element, stripWhitespace=1):
602 |         if not self.elementstack: return
603 |         if self.elementstack[-1][0] != element: return
604 | 
605 |         element, expectingText, pieces = self.elementstack.pop()
606 |         output = ''.join(pieces)
607 |         if stripWhitespace:
608 |             output = output.strip()
609 |         if not expectingText: return output
610 | 
611 |         # decode base64 content
612 |         if base64 and self.contentparams.get('base64', 0):
613 |             try:
614 |                 output = base64.decodestring(output)
615 |             except binascii.Error:
616 |                 pass
617 |             except binascii.Incomplete:
618 |                 pass
619 | 
620 |         # resolve relative URIs
621 |         if (element in self.can_be_relative_uri) and output:
622 |             output = self.resolveURI(output)
623 | 
624 |         # decode entities within embedded markup
625 |         if not self.contentparams.get('base64', 0):
626 |             output = self.decodeEntities(element, output)
627 | 
628 |         # remove temporary cruft from contentparams
629 |         try:
630 |             del self.contentparams['mode']
631 |         except KeyError:
632 |             pass
633 |         try:
634 |             del self.contentparams['base64']
635 |         except KeyError:
636 |             pass
637 | 
638 |         # resolve relative URIs within embedded markup
639 |         if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
640 |             if element in self.can_contain_relative_uris:
641 |                 output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
642 | 
643 |         # sanitize embedded markup
644 |         if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
645 |             if element in self.can_contain_dangerous_markup:
646 |                 output = _sanitizeHTML(output, self.encoding)
647 | 
648 |         if self.encoding and type(output) != type(u''):
649 | try: 650 | output = unicode(output, self.encoding) 651 | except: 652 | pass 653 | 654 | # categories/tags/keywords/whatever are handled in _end_category 655 | if element == 'category': 656 | return output 657 | 658 | # store output in appropriate place(s) 659 | if self.inentry and not self.insource: 660 | if element == 'content': 661 | self.entries[-1].setdefault(element, []) 662 | contentparams = copy.deepcopy(self.contentparams) 663 | contentparams['value'] = output 664 | self.entries[-1][element].append(contentparams) 665 | elif element == 'link': 666 | self.entries[-1][element] = output 667 | if output: 668 | self.entries[-1]['links'][-1]['href'] = output 669 | else: 670 | if element == 'description': 671 | element = 'summary' 672 | self.entries[-1][element] = output 673 | if self.incontent: 674 | contentparams = copy.deepcopy(self.contentparams) 675 | contentparams['value'] = output 676 | self.entries[-1][element + '_detail'] = contentparams 677 | elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage): 678 | context = self._getContext() 679 | if element == 'description': 680 | element = 'subtitle' 681 | context[element] = output 682 | if element == 'link': 683 | context['links'][-1]['href'] = output 684 | elif self.incontent: 685 | contentparams = copy.deepcopy(self.contentparams) 686 | contentparams['value'] = output 687 | context[element + '_detail'] = contentparams 688 | return output 689 | 690 | def pushContent(self, tag, attrsD, defaultContentType, expectingText): 691 | self.incontent += 1 692 | self.contentparams = FeedParserDict({ 693 | 'type': self.mapContentType(attrsD.get('type', defaultContentType)), 694 | 'language': self.lang, 695 | 'base': self.baseuri}) 696 | self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams) 697 | self.push(tag, expectingText) 698 | 699 | def popContent(self, tag): 700 | value = self.pop(tag) 701 | self.incontent -= 1 702 | self.contentparams.clear() 703 | return value 704 | 705 | def _mapToStandardPrefix(self, name): 706 | colonpos = name.find(':') 707 | if colonpos <> -1: 708 | prefix = name[:colonpos] 709 | suffix = name[colonpos+1:] 710 | prefix = self.namespacemap.get(prefix, prefix) 711 | name = prefix + ':' + suffix 712 | return name 713 | 714 | def _getAttribute(self, attrsD, name): 715 | return attrsD.get(self._mapToStandardPrefix(name)) 716 | 717 | def _isBase64(self, attrsD, contentparams): 718 | if attrsD.get('mode', '') == 'base64': 719 | return 1 720 | if self.contentparams['type'].startswith('text/'): 721 | return 0 722 | if self.contentparams['type'].endswith('+xml'): 723 | return 0 724 | if self.contentparams['type'].endswith('/xml'): 725 | return 0 726 | return 1 727 | 728 | def _itsAnHrefDamnIt(self, attrsD): 729 | href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None))) 730 | if href: 731 | try: 732 | del attrsD['url'] 733 | except KeyError: 734 | pass 735 | try: 736 | del attrsD['uri'] 737 | except KeyError: 738 | pass 739 | attrsD['href'] = href 740 | return attrsD 741 | 742 | def _save(self, key, value): 743 | context = self._getContext() 744 | context.setdefault(key, value) 745 | 746 | def _start_rss(self, attrsD): 747 | versionmap = {'0.91': 'rss091u', 748 | '0.92': 'rss092', 749 | '0.93': 'rss093', 750 | '0.94': 'rss094'} 751 | if not self.version: 752 | attr_version = attrsD.get('version', '') 753 | version = versionmap.get(attr_version) 754 | if version: 755 | self.version = version 756 | elif attr_version.startswith('2.'): 757 | self.version = 'rss20' 
758 | else: 759 | self.version = 'rss' 760 | 761 | def _start_dlhottitles(self, attrsD): 762 | self.version = 'hotrss' 763 | 764 | def _start_channel(self, attrsD): 765 | self.infeed = 1 766 | self._cdf_common(attrsD) 767 | _start_feedinfo = _start_channel 768 | 769 | def _cdf_common(self, attrsD): 770 | if attrsD.has_key('lastmod'): 771 | self._start_modified({}) 772 | self.elementstack[-1][-1] = attrsD['lastmod'] 773 | self._end_modified() 774 | if attrsD.has_key('href'): 775 | self._start_link({}) 776 | self.elementstack[-1][-1] = attrsD['href'] 777 | self._end_link() 778 | 779 | def _start_feed(self, attrsD): 780 | self.infeed = 1 781 | versionmap = {'0.1': 'atom01', 782 | '0.2': 'atom02', 783 | '0.3': 'atom03'} 784 | if not self.version: 785 | attr_version = attrsD.get('version') 786 | version = versionmap.get(attr_version) 787 | if version: 788 | self.version = version 789 | else: 790 | self.version = 'atom' 791 | 792 | def _end_channel(self): 793 | self.infeed = 0 794 | _end_feed = _end_channel 795 | 796 | def _start_image(self, attrsD): 797 | self.inimage = 1 798 | self.push('image', 0) 799 | context = self._getContext() 800 | context.setdefault('image', FeedParserDict()) 801 | 802 | def _end_image(self): 803 | self.pop('image') 804 | self.inimage = 0 805 | 806 | def _start_textinput(self, attrsD): 807 | self.intextinput = 1 808 | self.push('textinput', 0) 809 | context = self._getContext() 810 | context.setdefault('textinput', FeedParserDict()) 811 | _start_textInput = _start_textinput 812 | 813 | def _end_textinput(self): 814 | self.pop('textinput') 815 | self.intextinput = 0 816 | _end_textInput = _end_textinput 817 | 818 | def _start_author(self, attrsD): 819 | self.inauthor = 1 820 | self.push('author', 1) 821 | _start_managingeditor = _start_author 822 | _start_dc_author = _start_author 823 | _start_dc_creator = _start_author 824 | _start_itunes_author = _start_author 825 | 826 | def _end_author(self): 827 | self.pop('author') 828 | self.inauthor = 0 829 | self._sync_author_detail() 830 | _end_managingeditor = _end_author 831 | _end_dc_author = _end_author 832 | _end_dc_creator = _end_author 833 | _end_itunes_author = _end_author 834 | 835 | def _start_itunes_owner(self, attrsD): 836 | self.inpublisher = 1 837 | self.push('publisher', 0) 838 | 839 | def _end_itunes_owner(self): 840 | self.pop('publisher') 841 | self.inpublisher = 0 842 | self._sync_author_detail('publisher') 843 | 844 | def _start_contributor(self, attrsD): 845 | self.incontributor = 1 846 | context = self._getContext() 847 | context.setdefault('contributors', []) 848 | context['contributors'].append(FeedParserDict()) 849 | self.push('contributor', 0) 850 | 851 | def _end_contributor(self): 852 | self.pop('contributor') 853 | self.incontributor = 0 854 | 855 | def _start_dc_contributor(self, attrsD): 856 | self.incontributor = 1 857 | context = self._getContext() 858 | context.setdefault('contributors', []) 859 | context['contributors'].append(FeedParserDict()) 860 | self.push('name', 0) 861 | 862 | def _end_dc_contributor(self): 863 | self._end_name() 864 | self.incontributor = 0 865 | 866 | def _start_name(self, attrsD): 867 | self.push('name', 0) 868 | _start_itunes_name = _start_name 869 | 870 | def _end_name(self): 871 | value = self.pop('name') 872 | if self.inpublisher: 873 | self._save_author('name', value, 'publisher') 874 | elif self.inauthor: 875 | self._save_author('name', value) 876 | elif self.incontributor: 877 | self._save_contributor('name', value) 878 | elif self.intextinput: 879 | context = 
self._getContext() 880 | context['textinput']['name'] = value 881 | _end_itunes_name = _end_name 882 | 883 | def _start_width(self, attrsD): 884 | self.push('width', 0) 885 | 886 | def _end_width(self): 887 | value = self.pop('width') 888 | try: 889 | value = int(value) 890 | except: 891 | value = 0 892 | if self.inimage: 893 | context = self._getContext() 894 | context['image']['width'] = value 895 | 896 | def _start_height(self, attrsD): 897 | self.push('height', 0) 898 | 899 | def _end_height(self): 900 | value = self.pop('height') 901 | try: 902 | value = int(value) 903 | except: 904 | value = 0 905 | if self.inimage: 906 | context = self._getContext() 907 | context['image']['height'] = value 908 | 909 | def _start_url(self, attrsD): 910 | self.push('href', 1) 911 | _start_homepage = _start_url 912 | _start_uri = _start_url 913 | 914 | def _end_url(self): 915 | value = self.pop('href') 916 | if self.inauthor: 917 | self._save_author('href', value) 918 | elif self.incontributor: 919 | self._save_contributor('href', value) 920 | elif self.inimage: 921 | context = self._getContext() 922 | context['image']['href'] = value 923 | elif self.intextinput: 924 | context = self._getContext() 925 | context['textinput']['link'] = value 926 | _end_homepage = _end_url 927 | _end_uri = _end_url 928 | 929 | def _start_email(self, attrsD): 930 | self.push('email', 0) 931 | _start_itunes_email = _start_email 932 | 933 | def _end_email(self): 934 | value = self.pop('email') 935 | if self.inpublisher: 936 | self._save_author('email', value, 'publisher') 937 | elif self.inauthor: 938 | self._save_author('email', value) 939 | elif self.incontributor: 940 | self._save_contributor('email', value) 941 | _end_itunes_email = _end_email 942 | 943 | def _getContext(self): 944 | if self.insource: 945 | context = self.sourcedata 946 | elif self.inentry: 947 | context = self.entries[-1] 948 | else: 949 | context = self.feeddata 950 | return context 951 | 952 | def _save_author(self, key, value, prefix='author'): 953 | context = self._getContext() 954 | context.setdefault(prefix + '_detail', FeedParserDict()) 955 | context[prefix + '_detail'][key] = value 956 | self._sync_author_detail() 957 | 958 | def _save_contributor(self, key, value): 959 | context = self._getContext() 960 | context.setdefault('contributors', [FeedParserDict()]) 961 | context['contributors'][-1][key] = value 962 | 963 | def _sync_author_detail(self, key='author'): 964 | context = self._getContext() 965 | detail = context.get('%s_detail' % key) 966 | if detail: 967 | name = detail.get('name') 968 | email = detail.get('email') 969 | if name and email: 970 | context[key] = '%s (%s)' % (name, email) 971 | elif name: 972 | context[key] = name 973 | elif email: 974 | context[key] = email 975 | else: 976 | author = context.get(key) 977 | if not author: return 978 | emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author) 979 | if not emailmatch: return 980 | email = emailmatch.group(0) 981 | # probably a better way to do the following, but it passes all the tests 982 | author = author.replace(email, '') 983 | author = author.replace('()', '') 984 | author = author.strip() 985 | if author and (author[0] == '('): 986 | author = author[1:] 987 | if author and (author[-1] == ')'): 988 | author = author[:-1] 989 | author = author.strip() 990 | context.setdefault('%s_detail' % key, FeedParserDict()) 991 | context['%s_detail' % key]['name'] = author 992 | 
context['%s_detail' % key]['email'] = email 993 | 994 | def _start_subtitle(self, attrsD): 995 | self.pushContent('subtitle', attrsD, 'text/plain', 1) 996 | _start_tagline = _start_subtitle 997 | _start_itunes_subtitle = _start_subtitle 998 | 999 | def _end_subtitle(self): 1000 | self.popContent('subtitle') 1001 | _end_tagline = _end_subtitle 1002 | _end_itunes_subtitle = _end_subtitle 1003 | 1004 | def _start_rights(self, attrsD): 1005 | self.pushContent('rights', attrsD, 'text/plain', 1) 1006 | _start_dc_rights = _start_rights 1007 | _start_copyright = _start_rights 1008 | 1009 | def _end_rights(self): 1010 | self.popContent('rights') 1011 | _end_dc_rights = _end_rights 1012 | _end_copyright = _end_rights 1013 | 1014 | def _start_item(self, attrsD): 1015 | self.entries.append(FeedParserDict()) 1016 | self.push('item', 0) 1017 | self.inentry = 1 1018 | self.guidislink = 0 1019 | id = self._getAttribute(attrsD, 'rdf:about') 1020 | if id: 1021 | context = self._getContext() 1022 | context['id'] = id 1023 | self._cdf_common(attrsD) 1024 | _start_entry = _start_item 1025 | _start_product = _start_item 1026 | 1027 | def _end_item(self): 1028 | self.pop('item') 1029 | self.inentry = 0 1030 | _end_entry = _end_item 1031 | 1032 | def _start_dc_language(self, attrsD): 1033 | self.push('language', 1) 1034 | _start_language = _start_dc_language 1035 | 1036 | def _end_dc_language(self): 1037 | self.lang = self.pop('language') 1038 | _end_language = _end_dc_language 1039 | 1040 | def _start_dc_publisher(self, attrsD): 1041 | self.push('publisher', 1) 1042 | _start_webmaster = _start_dc_publisher 1043 | 1044 | def _end_dc_publisher(self): 1045 | self.pop('publisher') 1046 | self._sync_author_detail('publisher') 1047 | _end_webmaster = _end_dc_publisher 1048 | 1049 | def _start_published(self, attrsD): 1050 | self.push('published', 1) 1051 | _start_dcterms_issued = _start_published 1052 | _start_issued = _start_published 1053 | 1054 | def _end_published(self): 1055 | value = self.pop('published') 1056 | self._save('published_parsed', _parse_date(value)) 1057 | _end_dcterms_issued = _end_published 1058 | _end_issued = _end_published 1059 | 1060 | def _start_updated(self, attrsD): 1061 | self.push('updated', 1) 1062 | _start_modified = _start_updated 1063 | _start_dcterms_modified = _start_updated 1064 | _start_pubdate = _start_updated 1065 | _start_dc_date = _start_updated 1066 | 1067 | def _end_updated(self): 1068 | value = self.pop('updated') 1069 | parsed_value = _parse_date(value) 1070 | self._save('updated_parsed', parsed_value) 1071 | _end_modified = _end_updated 1072 | _end_dcterms_modified = _end_updated 1073 | _end_pubdate = _end_updated 1074 | _end_dc_date = _end_updated 1075 | 1076 | def _start_created(self, attrsD): 1077 | self.push('created', 1) 1078 | _start_dcterms_created = _start_created 1079 | 1080 | def _end_created(self): 1081 | value = self.pop('created') 1082 | self._save('created_parsed', _parse_date(value)) 1083 | _end_dcterms_created = _end_created 1084 | 1085 | def _start_expirationdate(self, attrsD): 1086 | self.push('expired', 1) 1087 | 1088 | def _end_expirationdate(self): 1089 | self._save('expired_parsed', _parse_date(self.pop('expired'))) 1090 | 1091 | def _start_cc_license(self, attrsD): 1092 | self.push('license', 1) 1093 | value = self._getAttribute(attrsD, 'rdf:resource') 1094 | if value: 1095 | self.elementstack[-1][2].append(value) 1096 | self.pop('license') 1097 | 1098 | def _start_creativecommons_license(self, attrsD): 1099 | self.push('license', 1) 1100 | 1101 | 
def _end_creativecommons_license(self): 1102 | self.pop('license') 1103 | 1104 | def _addTag(self, term, scheme, label): 1105 | context = self._getContext() 1106 | tags = context.setdefault('tags', []) 1107 | if (not term) and (not scheme) and (not label): return 1108 | value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) 1109 | if value not in tags: 1110 | tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label})) 1111 | 1112 | def _start_category(self, attrsD): 1113 | if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) 1114 | term = attrsD.get('term') 1115 | scheme = attrsD.get('scheme', attrsD.get('domain')) 1116 | label = attrsD.get('label') 1117 | self._addTag(term, scheme, label) 1118 | self.push('category', 1) 1119 | _start_dc_subject = _start_category 1120 | _start_keywords = _start_category 1121 | 1122 | def _end_itunes_keywords(self): 1123 | for term in self.pop('itunes_keywords').split(): 1124 | self._addTag(term, 'http://www.itunes.com/', None) 1125 | 1126 | def _start_itunes_category(self, attrsD): 1127 | self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) 1128 | self.push('category', 1) 1129 | 1130 | def _end_category(self): 1131 | value = self.pop('category') 1132 | if not value: return 1133 | context = self._getContext() 1134 | tags = context['tags'] 1135 | if value and len(tags) and not tags[-1]['term']: 1136 | tags[-1]['term'] = value 1137 | else: 1138 | self._addTag(value, None, None) 1139 | _end_dc_subject = _end_category 1140 | _end_keywords = _end_category 1141 | _end_itunes_category = _end_category 1142 | 1143 | def _start_cloud(self, attrsD): 1144 | self._getContext()['cloud'] = FeedParserDict(attrsD) 1145 | 1146 | def _start_link(self, attrsD): 1147 | attrsD.setdefault('rel', 'alternate') 1148 | attrsD.setdefault('type', 'text/html') 1149 | attrsD = self._itsAnHrefDamnIt(attrsD) 1150 | if attrsD.has_key('href'): 1151 | attrsD['href'] = self.resolveURI(attrsD['href']) 1152 | expectingText = self.infeed or self.inentry or self.insource 1153 | context = self._getContext() 1154 | context.setdefault('links', []) 1155 | context['links'].append(FeedParserDict(attrsD)) 1156 | if attrsD['rel'] == 'enclosure': 1157 | self._start_enclosure(attrsD) 1158 | if attrsD.has_key('href'): 1159 | expectingText = 0 1160 | if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): 1161 | context['link'] = attrsD['href'] 1162 | else: 1163 | self.push('link', expectingText) 1164 | _start_producturl = _start_link 1165 | 1166 | def _end_link(self): 1167 | value = self.pop('link') 1168 | context = self._getContext() 1169 | if self.intextinput: 1170 | context['textinput']['link'] = value 1171 | if self.inimage: 1172 | context['image']['link'] = value 1173 | _end_producturl = _end_link 1174 | 1175 | def _start_guid(self, attrsD): 1176 | self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') 1177 | self.push('id', 1) 1178 | 1179 | def _end_guid(self): 1180 | value = self.pop('id') 1181 | self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) 1182 | if self.guidislink: 1183 | # guid acts as link, but only if 'ispermalink' is not present or is 'true', 1184 | # and only if the item doesn't already have a link element 1185 | self._save('link', value) 1186 | 1187 | def _start_title(self, attrsD): 1188 | self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) 1189 | _start_dc_title = _start_title 1190 
| _start_media_title = _start_title 1191 | 1192 | def _end_title(self): 1193 | value = self.popContent('title') 1194 | context = self._getContext() 1195 | if self.intextinput: 1196 | context['textinput']['title'] = value 1197 | elif self.inimage: 1198 | context['image']['title'] = value 1199 | _end_dc_title = _end_title 1200 | _end_media_title = _end_title 1201 | 1202 | def _start_description(self, attrsD): 1203 | context = self._getContext() 1204 | if context.has_key('summary'): 1205 | self._summaryKey = 'content' 1206 | self._start_content(attrsD) 1207 | else: 1208 | self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) 1209 | 1210 | def _start_abstract(self, attrsD): 1211 | self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) 1212 | 1213 | def _end_description(self): 1214 | if self._summaryKey == 'content': 1215 | self._end_content() 1216 | else: 1217 | value = self.popContent('description') 1218 | context = self._getContext() 1219 | if self.intextinput: 1220 | context['textinput']['description'] = value 1221 | elif self.inimage: 1222 | context['image']['description'] = value 1223 | self._summaryKey = None 1224 | _end_abstract = _end_description 1225 | 1226 | def _start_info(self, attrsD): 1227 | self.pushContent('info', attrsD, 'text/plain', 1) 1228 | _start_feedburner_browserfriendly = _start_info 1229 | 1230 | def _end_info(self): 1231 | self.popContent('info') 1232 | _end_feedburner_browserfriendly = _end_info 1233 | 1234 | def _start_generator(self, attrsD): 1235 | if attrsD: 1236 | attrsD = self._itsAnHrefDamnIt(attrsD) 1237 | if attrsD.has_key('href'): 1238 | attrsD['href'] = self.resolveURI(attrsD['href']) 1239 | self._getContext()['generator_detail'] = FeedParserDict(attrsD) 1240 | self.push('generator', 1) 1241 | 1242 | def _end_generator(self): 1243 | value = self.pop('generator') 1244 | context = self._getContext() 1245 | if context.has_key('generator_detail'): 1246 | context['generator_detail']['name'] = value 1247 | 1248 | def _start_admin_generatoragent(self, attrsD): 1249 | self.push('generator', 1) 1250 | value = self._getAttribute(attrsD, 'rdf:resource') 1251 | if value: 1252 | self.elementstack[-1][2].append(value) 1253 | self.pop('generator') 1254 | self._getContext()['generator_detail'] = FeedParserDict({'href': value}) 1255 | 1256 | def _start_admin_errorreportsto(self, attrsD): 1257 | self.push('errorreportsto', 1) 1258 | value = self._getAttribute(attrsD, 'rdf:resource') 1259 | if value: 1260 | self.elementstack[-1][2].append(value) 1261 | self.pop('errorreportsto') 1262 | 1263 | def _start_summary(self, attrsD): 1264 | context = self._getContext() 1265 | if context.has_key('summary'): 1266 | self._summaryKey = 'content' 1267 | self._start_content(attrsD) 1268 | else: 1269 | self._summaryKey = 'summary' 1270 | self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) 1271 | _start_itunes_summary = _start_summary 1272 | 1273 | def _end_summary(self): 1274 | if self._summaryKey == 'content': 1275 | self._end_content() 1276 | else: 1277 | self.popContent(self._summaryKey or 'summary') 1278 | self._summaryKey = None 1279 | _end_itunes_summary = _end_summary 1280 | 1281 | def _start_enclosure(self, attrsD): 1282 | attrsD = self._itsAnHrefDamnIt(attrsD) 1283 | self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD)) 1284 | href = attrsD.get('href') 1285 | if href: 1286 | context = self._getContext() 1287 | if not context.get('id'): 1288 | context['id'] = 
href 1289 | 1290 | def _start_source(self, attrsD): 1291 | self.insource = 1 1292 | 1293 | def _end_source(self): 1294 | self.insource = 0 1295 | self._getContext()['source'] = copy.deepcopy(self.sourcedata) 1296 | self.sourcedata.clear() 1297 | 1298 | def _start_content(self, attrsD): 1299 | self.pushContent('content', attrsD, 'text/plain', 1) 1300 | src = attrsD.get('src') 1301 | if src: 1302 | self.contentparams['src'] = src 1303 | self.push('content', 1) 1304 | 1305 | def _start_prodlink(self, attrsD): 1306 | self.pushContent('content', attrsD, 'text/html', 1) 1307 | 1308 | def _start_body(self, attrsD): 1309 | self.pushContent('content', attrsD, 'application/xhtml+xml', 1) 1310 | _start_xhtml_body = _start_body 1311 | 1312 | def _start_content_encoded(self, attrsD): 1313 | self.pushContent('content', attrsD, 'text/html', 1) 1314 | _start_fullitem = _start_content_encoded 1315 | 1316 | def _end_content(self): 1317 | copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) 1318 | value = self.popContent('content') 1319 | if copyToDescription: 1320 | self._save('description', value) 1321 | _end_body = _end_content 1322 | _end_xhtml_body = _end_content 1323 | _end_content_encoded = _end_content 1324 | _end_fullitem = _end_content 1325 | _end_prodlink = _end_content 1326 | 1327 | def _start_itunes_image(self, attrsD): 1328 | self.push('itunes_image', 0) 1329 | self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) 1330 | _start_itunes_link = _start_itunes_image 1331 | 1332 | def _end_itunes_block(self): 1333 | value = self.pop('itunes_block', 0) 1334 | self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 1335 | 1336 | def _end_itunes_explicit(self): 1337 | value = self.pop('itunes_explicit', 0) 1338 | self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 1339 | 1340 | if _XML_AVAILABLE: 1341 | class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): 1342 | def __init__(self, baseuri, baselang, encoding): 1343 | if _debug: sys.stderr.write('trying StrictFeedParser\n') 1344 | xml.sax.handler.ContentHandler.__init__(self) 1345 | _FeedParserMixin.__init__(self, baseuri, baselang, encoding) 1346 | self.bozo = 0 1347 | self.exc = None 1348 | 1349 | def startPrefixMapping(self, prefix, uri): 1350 | self.trackNamespace(prefix, uri) 1351 | 1352 | def startElementNS(self, name, qname, attrs): 1353 | namespace, localname = name 1354 | lowernamespace = str(namespace or '').lower() 1355 | if lowernamespace.find('backend.userland.com/rss') <> -1: 1356 | # match any backend.userland.com namespace 1357 | namespace = 'http://backend.userland.com/rss' 1358 | lowernamespace = namespace 1359 | if qname and qname.find(':') > 0: 1360 | givenprefix = qname.split(':')[0] 1361 | else: 1362 | givenprefix = None 1363 | prefix = self._matchnamespaces.get(lowernamespace, givenprefix) 1364 | if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): 1365 | raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix 1366 | if prefix: 1367 | localname = prefix + ':' + localname 1368 | localname = str(localname).lower() 1369 | if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) 1370 | 1371 | # qname implementation is horribly broken in Python 2.1 (it 1372 | # doesn't 
report any), and slightly broken in Python 2.2 (it
1373 |             # doesn't report the xml: namespace). So we match up namespaces
1374 |             # with a known list first, and then possibly override them with
1375 |             # the qnames the SAX parser gives us (if indeed it gives us any
1376 |             # at all). Thanks to MatejC for helping me test this and
1377 |             # tirelessly telling me that it didn't work yet.
1378 |             attrsD = {}
1379 |             for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
1380 |                 lowernamespace = (namespace or '').lower()
1381 |                 prefix = self._matchnamespaces.get(lowernamespace, '')
1382 |                 if prefix:
1383 |                     attrlocalname = prefix + ':' + attrlocalname
1384 |                 attrsD[str(attrlocalname).lower()] = attrvalue
1385 |             for qname in attrs.getQNames():
1386 |                 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
1387 |             self.unknown_starttag(localname, attrsD.items())
1388 | 
1389 |         def characters(self, text):
1390 |             self.handle_data(text)
1391 | 
1392 |         def endElementNS(self, name, qname):
1393 |             namespace, localname = name
1394 |             lowernamespace = str(namespace or '').lower()
1395 |             if qname and qname.find(':') > 0:
1396 |                 givenprefix = qname.split(':')[0]
1397 |             else:
1398 |                 givenprefix = ''
1399 |             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1400 |             if prefix:
1401 |                 localname = prefix + ':' + localname
1402 |             localname = str(localname).lower()
1403 |             self.unknown_endtag(localname)
1404 | 
1405 |         def error(self, exc):
1406 |             self.bozo = 1
1407 |             self.exc = exc
1408 | 
1409 |         def fatalError(self, exc):
1410 |             self.error(exc)
1411 |             raise exc
1412 | 
1413 | class _BaseHTMLProcessor(sgmllib.SGMLParser):
1414 |     elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
1415 |       'img', 'input', 'isindex', 'link', 'meta', 'param']
1416 | 
1417 |     def __init__(self, encoding):
1418 |         self.encoding = encoding
1419 |         if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
1420 |         sgmllib.SGMLParser.__init__(self)
1421 | 
1422 |     def reset(self):
1423 |         self.pieces = []
1424 |         sgmllib.SGMLParser.reset(self)
1425 | 
1426 |     def _shorttag_replace(self, match):
1427 |         tag = match.group(1)
1428 |         if tag in self.elements_no_end_tag:
1429 |             return '<' + tag + ' />'
1430 |         else:
1431 |             return '<' + tag + '></' + tag + '>'
1432 | 
1433 |     def feed(self, data):
1434 |         data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
1435 |         #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
1436 |         data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
1437 |         data = data.replace('&#39;', "'")
1438 |         data = data.replace('&#34;', '"')
1439 |         if self.encoding and type(data) == type(u''):
1440 |             data = data.encode(self.encoding)
1441 |         sgmllib.SGMLParser.feed(self, data)
1442 | 
1443 |     def normalize_attrs(self, attrs):
1444 |         # utility method to be called by descendants
1445 |         attrs = [(k.lower(), v) for k, v in attrs]
1446 |         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1447 |         return attrs
1448 | 
1449 |     def unknown_starttag(self, tag, attrs):
1450 |         # called for each start tag
1451 |         # attrs is a list of (attr, value) tuples
1452 |         # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
1453 |         if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
1454 |         uattrs = []
1455 |         # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
1456 |         for key, value in attrs:
1457 |             if type(value) != type(u''):
1458 |                 value = unicode(value, self.encoding)
1459 |             uattrs.append((unicode(key, self.encoding), value))
1460 |         strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
1461 |         if tag in self.elements_no_end_tag:
1462 |             self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
1463 |         else:
1464 |             self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
1465 | 
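    # Worked example (editor's sketch, not in the original source): for the
    # input tag "<img src='a.gif' ALT='x'>", sgmllib lowercases tag and
    # attribute names, so unknown_starttag above receives tag='img',
    # attrs=[('src', 'a.gif'), ('alt', 'x')]; because 'img' appears in
    # elements_no_end_tag, it appends the re-serialized string
    # '<img src="a.gif" alt="x" />' to self.pieces.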
1466 |     def unknown_endtag(self, tag):
1467 |         # called for each end tag, e.g. for </pre>, tag will be 'pre'
1468 |         # Reconstruct the original end tag.
1469 |         if tag not in self.elements_no_end_tag:
1470 |             self.pieces.append("</%(tag)s>" % locals())
1471 | 
1472 |     def handle_charref(self, ref):
1473 |         # called for each character reference, e.g. for '&#160;', ref will be '160'
1474 |         # Reconstruct the original character reference.
1475 |         self.pieces.append('&#%(ref)s;' % locals())
1476 | 
1477 |     def handle_entityref(self, ref):
1478 |         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
1479 |         # Reconstruct the original entity reference.
1480 |         self.pieces.append('&%(ref)s;' % locals())
1481 | 
1482 |     def handle_data(self, text):
1483 |         # called for each block of plain text, i.e. outside of any tag and
1484 |         # not containing any character or entity references
1485 |         # Store the original text verbatim.
1486 |         if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
1487 |         self.pieces.append(text)
1488 | 
1489 |     def handle_comment(self, text):
1490 |         # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1491 |         # Reconstruct the original comment.
1492 |         self.pieces.append('<!--%(text)s-->' % locals())
1493 | 
1494 |     def handle_pi(self, text):
1495 |         # called for each processing instruction, e.g. <?instruction>
1496 |         # Reconstruct original processing instruction.
1497 |         self.pieces.append('<?%(text)s>' % locals())
1498 | 
1499 |     def handle_decl(self, text):
1500 |         # called for the DOCTYPE, if present, e.g.
1501 |         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1502 |         #     "http://www.w3.org/TR/html4/loose.dtd">
1503 |         # Reconstruct original DOCTYPE
1504 |         self.pieces.append('<!%(text)s>' % locals())
1505 | 
1506 |     _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
1507 |     def _scan_name(self, i, declstartpos):
1508 |         rawdata = self.rawdata
1509 |         n = len(rawdata)
1510 |         if i == n:
1511 |             return None, -1
1512 |         m = self._new_declname_match(rawdata, i)
1513 |         if m:
1514 |             s = m.group()
1515 |             name = s.strip()
1516 |             if (i + len(s)) == n:
1517 |                 return None, -1 # end of buffer
1518 |             return name.lower(), m.end()
1519 |         else:
1520 |             self.handle_data(rawdata)
1521 |             # self.updatepos(declstartpos, i)
1522 |             return None, -1
1523 | 
1524 |     def output(self):
1525 |         '''Return processed HTML as a single string'''
1526 |         return ''.join([str(p) for p in self.pieces])
1527 | 
1528 | class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
1529 |     def __init__(self, baseuri, baselang, encoding):
1530 |         sgmllib.SGMLParser.__init__(self)
1531 |         _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1532 | 
1533 |     def decodeEntities(self, element, data):
1534 |         data = data.replace('&#60;', '&lt;')
1535 |         data = data.replace('&#x3c;', '&lt;')
1536 |         data = data.replace('&#62;', '&gt;')
1537 |         data = data.replace('&#x3e;', '&gt;')
1538 |         data = data.replace('&#38;', '&amp;')
1539 |         data = data.replace('&#x26;', '&amp;')
1540 |         data = data.replace('&#34;', '&quot;')
1541 |         data = data.replace('&#x22;', '&quot;')
1542 |         data = data.replace('&#39;', '&apos;')
1543 |         data = data.replace('&#x27;', '&apos;')
1544 |         if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
1545 |             data = data.replace('&lt;', '<')
1546 |             data = data.replace('&gt;', '>')
1547 |             data = data.replace('&amp;', '&')
1548 |             data = data.replace('&quot;', '"')
1549 |             data = data.replace('&apos;', "'")
1550 |         return data
1551 | 
1552 | class _RelativeURIResolver(_BaseHTMLProcessor):
1553 |     relative_uris = [('a', 'href'),
1554 |                      ('applet', 'codebase'),
1555 |                      ('area', 'href'),
1556 |                      ('blockquote', 'cite'),
1557 |                      ('body', 'background'),
1558 |                      ('del', 'cite'),
1559 |                      ('form', 'action'),
1560 |                      ('frame', 'longdesc'),
1561 |                      ('frame', 'src'),
1562 |                      ('iframe', 'longdesc'),
1563 |                      ('iframe', 'src'),
1564 |                      ('head',
'profile'), 1565 | ('img', 'longdesc'), 1566 | ('img', 'src'), 1567 | ('img', 'usemap'), 1568 | ('input', 'src'), 1569 | ('input', 'usemap'), 1570 | ('ins', 'cite'), 1571 | ('link', 'href'), 1572 | ('object', 'classid'), 1573 | ('object', 'codebase'), 1574 | ('object', 'data'), 1575 | ('object', 'usemap'), 1576 | ('q', 'cite'), 1577 | ('script', 'src')] 1578 | 1579 | def __init__(self, baseuri, encoding): 1580 | _BaseHTMLProcessor.__init__(self, encoding) 1581 | self.baseuri = baseuri 1582 | 1583 | def resolveURI(self, uri): 1584 | return _urljoin(self.baseuri, uri) 1585 | 1586 | def unknown_starttag(self, tag, attrs): 1587 | attrs = self.normalize_attrs(attrs) 1588 | attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] 1589 | _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) 1590 | 1591 | def _resolveRelativeURIs(htmlSource, baseURI, encoding): 1592 | if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') 1593 | p = _RelativeURIResolver(baseURI, encoding) 1594 | p.feed(htmlSource) 1595 | return p.output() 1596 | 1597 | class _HTMLSanitizer(_BaseHTMLProcessor): 1598 | acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', 1599 | 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', 1600 | 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', 1601 | 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 1602 | 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', 1603 | 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', 1604 | 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 1605 | 'thead', 'tr', 'tt', 'u', 'ul', 'var'] 1606 | 1607 | acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', 1608 | 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 1609 | 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 1610 | 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 1611 | 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 1612 | 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', 1613 | 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 1614 | 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 1615 | 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', 1616 | 'usemap', 'valign', 'value', 'vspace', 'width'] 1617 | 1618 | unacceptable_elements_with_end_tag = ['script', 'applet'] 1619 | 1620 | def reset(self): 1621 | _BaseHTMLProcessor.reset(self) 1622 | self.unacceptablestack = 0 1623 | 1624 | def unknown_starttag(self, tag, attrs): 1625 | if not tag in self.acceptable_elements: 1626 | if tag in self.unacceptable_elements_with_end_tag: 1627 | self.unacceptablestack += 1 1628 | return 1629 | attrs = self.normalize_attrs(attrs) 1630 | attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] 1631 | _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) 1632 | 1633 | def unknown_endtag(self, tag): 1634 | if not tag in self.acceptable_elements: 1635 | if tag in self.unacceptable_elements_with_end_tag: 1636 | self.unacceptablestack -= 1 1637 | return 1638 | _BaseHTMLProcessor.unknown_endtag(self, tag) 1639 | 1640 | def handle_pi(self, text): 1641 | pass 1642 | 1643 | def handle_decl(self, text): 1644 | pass 1645 | 1646 | def handle_data(self, 
text):
1647 |         if not self.unacceptablestack:
1648 |             _BaseHTMLProcessor.handle_data(self, text)
1649 | 
1650 | def _sanitizeHTML(htmlSource, encoding):
1651 |     p = _HTMLSanitizer(encoding)
1652 |     p.feed(htmlSource)
1653 |     data = p.output()
1654 |     if TIDY_MARKUP:
1655 |         # loop through list of preferred Tidy interfaces looking for one that's installed,
1656 |         # then set up a common _tidy function to wrap the interface-specific API.
1657 |         _tidy = None
1658 |         for tidy_interface in PREFERRED_TIDY_INTERFACES:
1659 |             try:
1660 |                 if tidy_interface == "uTidy":
1661 |                     from tidy import parseString as _utidy
1662 |                     def _tidy(data, **kwargs):
1663 |                         return str(_utidy(data, **kwargs))
1664 |                     break
1665 |                 elif tidy_interface == "mxTidy":
1666 |                     from mx.Tidy import Tidy as _mxtidy
1667 |                     def _tidy(data, **kwargs):
1668 |                         nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
1669 |                         return data
1670 |                     break
1671 |             except:
1672 |                 pass
1673 |         if _tidy:
1674 |             utf8 = type(data) == type(u'')
1675 |             if utf8:
1676 |                 data = data.encode('utf-8')
1677 |             data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
1678 |             if utf8:
1679 |                 data = unicode(data, 'utf-8')
1680 |         if data.count('<body'):
1681 |             data = data.split('<body', 1)[1]
1682 |             if data.count('>'):
1683 |                 data = data.split('>', 1)[1]
1684 |         if data.count('</body'):
1685 |             data = data.split('</body', 1)[0]
1732 |             assert sys.version.split()[0] >= '2.3.3'
1733 |             assert base64 != None
1734 |             user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
1735 |             realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
1736 |             self.add_password(realm, host, user, passw)
1737 |             retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
1738 |             self.reset_retry_count()
1739 |             return retry
1740 |         except:
1741 |             return self.http_error_default(req, fp, code, msg, headers)
1742 | 
1743 | def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
1744 |     """URL, filename, or string --> stream
1745 | 
1746 |     This function lets you define parsers that take any input source
1747 |     (URL, pathname to local or network file, or actual data as a string)
1748 |     and deal with it in a uniform manner. Returned object is guaranteed
1749 |     to have all the basic stdio read methods (read, readline, readlines).
1750 |     Just .close() the object when you're done with it.
1751 | 
1752 |     If the etag argument is supplied, it will be used as the value of an
1753 |     If-None-Match request header.
1754 | 
1755 |     If the modified argument is supplied, it must be a tuple of 9 integers
1756 |     as returned by gmtime() in the standard Python time module. This MUST
1757 |     be in GMT (Greenwich Mean Time). The formatted date/time will be used
1758 |     as the value of an If-Modified-Since request header.
1759 | 
1760 |     If the agent argument is supplied, it will be used as the value of a
1761 |     User-Agent request header.
1762 | 
1763 |     If the referrer argument is supplied, it will be used as the value of a
1764 |     Referer[sic] request header.
1765 | 
1766 |     If handlers is supplied, it is a list of handlers used to build a
1767 |     urllib2 opener.
1768 | """ 1769 | 1770 | if hasattr(url_file_stream_or_string, 'read'): 1771 | return url_file_stream_or_string 1772 | 1773 | if url_file_stream_or_string == '-': 1774 | return sys.stdin 1775 | 1776 | if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): 1777 | if not agent: 1778 | agent = USER_AGENT 1779 | # test for inline user:password for basic auth 1780 | auth = None 1781 | if base64: 1782 | urltype, rest = urllib.splittype(url_file_stream_or_string) 1783 | realhost, rest = urllib.splithost(rest) 1784 | if realhost: 1785 | user_passwd, realhost = urllib.splituser(realhost) 1786 | if user_passwd: 1787 | url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) 1788 | auth = base64.encodestring(user_passwd).strip() 1789 | # try to open with urllib2 (to use optional headers) 1790 | request = urllib2.Request(url_file_stream_or_string) 1791 | request.add_header('User-Agent', agent) 1792 | if etag: 1793 | request.add_header('If-None-Match', etag) 1794 | if modified: 1795 | # format into an RFC 1123-compliant timestamp. We can't use 1796 | # time.strftime() since the %a and %b directives can be affected 1797 | # by the current locale, but RFC 2616 states that dates must be 1798 | # in English. 1799 | short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] 1800 | months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] 1801 | request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) 1802 | if referrer: 1803 | request.add_header('Referer', referrer) 1804 | if gzip and zlib: 1805 | request.add_header('Accept-encoding', 'gzip, deflate') 1806 | elif gzip: 1807 | request.add_header('Accept-encoding', 'gzip') 1808 | elif zlib: 1809 | request.add_header('Accept-encoding', 'deflate') 1810 | else: 1811 | request.add_header('Accept-encoding', '') 1812 | if auth: 1813 | request.add_header('Authorization', 'Basic %s' % auth) 1814 | if ACCEPT_HEADER: 1815 | request.add_header('Accept', ACCEPT_HEADER) 1816 | request.add_header('A-IM', 'feed') # RFC 3229 support 1817 | opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) 1818 | opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent 1819 | try: 1820 | return opener.open(request) 1821 | finally: 1822 | opener.close() # JohnD 1823 | 1824 | # try to open with native open function (if url_file_stream_or_string is a filename) 1825 | try: 1826 | return open(url_file_stream_or_string) 1827 | except: 1828 | pass 1829 | 1830 | # treat url_file_stream_or_string as string 1831 | return _StringIO(str(url_file_stream_or_string)) 1832 | 1833 | _date_handlers = [] 1834 | def registerDateHandler(func): 1835 | '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' 1836 | _date_handlers.insert(0, func) 1837 | 1838 | # ISO-8601 date parsing routines written by Fazal Majid. 1839 | # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 1840 | # parser is beyond the scope of feedparser and would be a worthwhile addition 1841 | # to the Python library. 1842 | # A single regular expression cannot parse ISO 8601 date formats into groups 1843 | # as the standard is highly irregular (for instance is 030104 2003-01-04 or 1844 | # 0301-04-01), so we use templates instead. 1845 | # Please note the order in templates is significant because we need a 1846 | # greedy match. 
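# Example (editor's sketch, not part of the original source): additional date
# formats plug in through registerDateHandler() above; a handler takes the raw
# date string and returns a 9-tuple in GMT, or None to decline. The
# epoch-seconds format below is a hypothetical illustration:
#
#     def _parse_date_epoch(dateString):
#         '''Parse a seconds-since-epoch timestamp such as "1136073600"'''
#         if not dateString.isdigit(): return None
#         return time.gmtime(int(dateString))
#     registerDateHandler(_parse_date_epoch)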
1847 | _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
1848 |                  'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
1849 |                  '-YY-?MM', '-OOO', '-YY',
1850 |                  '--MM-?DD', '--MM',
1851 |                  '---DD',
1852 |                  'CC', '']
1853 | _iso8601_re = [
1854 |     tmpl.replace(
1855 |     'YYYY', r'(?P<year>\d{4})').replace(
1856 |     'YY', r'(?P<year>\d\d)').replace(
1857 |     'MM', r'(?P<month>[01]\d)').replace(
1858 |     'DD', r'(?P<day>[0123]\d)').replace(
1859 |     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
1860 |     'CC', r'(?P<century>\d\d$)')
1861 |     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
1862 |     + r'(:(?P<second>\d{2}))?'
1863 |     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
1864 |     for tmpl in _iso8601_tmpl]
1865 | del tmpl
1866 | _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
1867 | del regex
1868 | def _parse_date_iso8601(dateString):
1869 |     '''Parse a variety of ISO-8601-compatible formats like 20040105'''
1870 |     m = None
1871 |     for _iso8601_match in _iso8601_matches:
1872 |         m = _iso8601_match(dateString)
1873 |         if m: break
1874 |     if not m: return
1875 |     if m.span() == (0, 0): return
1876 |     params = m.groupdict()
1877 |     ordinal = params.get('ordinal', 0)
1878 |     if ordinal:
1879 |         ordinal = int(ordinal)
1880 |     else:
1881 |         ordinal = 0
1882 |     year = params.get('year', '--')
1883 |     if not year or year == '--':
1884 |         year = time.gmtime()[0]
1885 |     elif len(year) == 2:
1886 |         # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
1887 |         year = 100 * int(time.gmtime()[0] / 100) + int(year)
1888 |     else:
1889 |         year = int(year)
1890 |     month = params.get('month', '-')
1891 |     if not month or month == '-':
1892 |         # ordinals are NOT normalized by mktime, we simulate them
1893 |         # by setting month=1, day=ordinal
1894 |         if ordinal:
1895 |             month = 1
1896 |         else:
1897 |             month = time.gmtime()[1]
1898 |     month = int(month)
1899 |     day = params.get('day', 0)
1900 |     if not day:
1901 |         # see above
1902 |         if ordinal:
1903 |             day = ordinal
1904 |         elif params.get('century', 0) or \
1905 |             params.get('year', 0) or params.get('month', 0):
1906 |             day = 1
1907 |         else:
1908 |             day = time.gmtime()[2]
1909 |     else:
1910 |         day = int(day)
1911 |     # special case of the century - is the first year of the 21st century
1912 |     # 2000 or 2001 ? The debate goes on...
1913 |     if 'century' in params.keys():
1914 |         year = (int(params['century']) - 1) * 100 + 1
1915 |     # in ISO 8601 most fields are optional
1916 |     for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
1917 |         if not params.get(field, None):
1918 |             params[field] = 0
1919 |     hour = int(params.get('hour', 0))
1920 |     minute = int(params.get('minute', 0))
1921 |     second = int(params.get('second', 0))
1922 |     # weekday is normalized by mktime(), we can ignore it
1923 |     weekday = 0
1924 |     # daylight savings is complex, but not needed for feedparser's purposes
1925 |     # as time zones, if specified, include mention of whether it is active
1926 |     # (e.g. PST vs. PDT, CET).
Using -1 is implementation-dependent and 1927 | # and most implementations have DST bugs 1928 | daylight_savings_flag = 0 1929 | tm = [year, month, day, hour, minute, second, weekday, 1930 | ordinal, daylight_savings_flag] 1931 | # ISO 8601 time zone adjustments 1932 | tz = params.get('tz') 1933 | if tz and tz != 'Z': 1934 | if tz[0] == '-': 1935 | tm[3] += int(params.get('tzhour', 0)) 1936 | tm[4] += int(params.get('tzmin', 0)) 1937 | elif tz[0] == '+': 1938 | tm[3] -= int(params.get('tzhour', 0)) 1939 | tm[4] -= int(params.get('tzmin', 0)) 1940 | else: 1941 | return None 1942 | # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) 1943 | # which is guaranteed to normalize d/m/y/h/m/s. 1944 | # Many implementations have bugs, but we'll pretend they don't. 1945 | return time.localtime(time.mktime(tm)) 1946 | registerDateHandler(_parse_date_iso8601) 1947 | 1948 | # 8-bit date handling routines written by ytrewq1. 1949 | _korean_year = u'\ub144' # b3e2 in euc-kr 1950 | _korean_month = u'\uc6d4' # bff9 in euc-kr 1951 | _korean_day = u'\uc77c' # c0cf in euc-kr 1952 | _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr 1953 | _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr 1954 | 1955 | _korean_onblog_date_re = \ 1956 | re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ 1957 | (_korean_year, _korean_month, _korean_day)) 1958 | _korean_nate_date_re = \ 1959 | re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ 1960 | (_korean_am, _korean_pm)) 1961 | def _parse_date_onblog(dateString): 1962 | '''Parse a string according to the OnBlog 8-bit date format''' 1963 | m = _korean_onblog_date_re.match(dateString) 1964 | if not m: return 1965 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ 1966 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ 1967 | 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ 1968 | 'zonediff': '+09:00'} 1969 | if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) 1970 | return _parse_date_w3dtf(w3dtfdate) 1971 | registerDateHandler(_parse_date_onblog) 1972 | 1973 | def _parse_date_nate(dateString): 1974 | '''Parse a string according to the Nate 8-bit date format''' 1975 | m = _korean_nate_date_re.match(dateString) 1976 | if not m: return 1977 | hour = int(m.group(5)) 1978 | ampm = m.group(4) 1979 | if (ampm == _korean_pm): 1980 | hour += 12 1981 | hour = str(hour) 1982 | if len(hour) == 1: 1983 | hour = '0' + hour 1984 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ 1985 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ 1986 | 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ 1987 | 'zonediff': '+09:00'} 1988 | if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) 1989 | return _parse_date_w3dtf(w3dtfdate) 1990 | registerDateHandler(_parse_date_nate) 1991 | 1992 | _mssql_date_re = \ 1993 | re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') 1994 | def _parse_date_mssql(dateString): 1995 | '''Parse a string according to the MS SQL date format''' 1996 | m = _mssql_date_re.match(dateString) 1997 | if not m: return 1998 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ 1999 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ 2000 | 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ 2001 | 'zonediff': '+09:00'} 2002 | if _debug: sys.stderr.write('MS SQL date parsed 
as: %s\n' % w3dtfdate) 2003 | return _parse_date_w3dtf(w3dtfdate) 2004 | registerDateHandler(_parse_date_mssql) 2005 | 2006 | # Unicode strings for Greek date strings 2007 | _greek_months = \ 2008 | { \ 2009 | u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 2010 | u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 2011 | u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 2012 | u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 2013 | u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 2014 | u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 2015 | u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 2016 | u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 2017 | u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 2018 | u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 2019 | u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 2020 | u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 2021 | u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 2022 | u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 2023 | u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 2024 | u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 2025 | u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 2026 | u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 2027 | u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 2028 | } 2029 | 2030 | _greek_wdays = \ 2031 | { \ 2032 | u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 2033 | u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 2034 | u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 2035 | u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 2036 | u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 2037 | u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 2038 | u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 2039 | } 2040 | 2041 | _greek_date_format_re = \ 2042 | re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') 2043 | 2044 | def _parse_date_greek(dateString): 2045 | '''Parse a string according to a Greek 8-bit date format.''' 2046 | m = _greek_date_format_re.match(dateString) 2047 | if not m: return 2048 | try: 2049 | wday = _greek_wdays[m.group(1)] 2050 | month = _greek_months[m.group(3)] 2051 | except: 2052 | return 2053 | rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ 2054 | {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ 2055 | 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ 2056 | 'zonediff': m.group(8)} 2057 | if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) 2058 | return _parse_date_rfc822(rfc822date) 2059 | registerDateHandler(_parse_date_greek) 2060 | 2061 | # Unicode strings for Hungarian date strings 2062 | _hungarian_months = \ 2063 | { \ 2064 | u'janu\u00e1r': u'01', # e1 in iso-8859-2 2065 | u'febru\u00e1ri': u'02', # e1 in iso-8859-2 2066 | u'm\u00e1rcius': u'03', # e1 in iso-8859-2 2067 | u'\u00e1prilis': u'04', # e1 in iso-8859-2 2068 | u'm\u00e1ujus': u'05', # e1 in iso-8859-2 2069 | u'j\u00fanius': u'06', # fa in iso-8859-2 2070 | u'j\u00falius': u'07', # fa in iso-8859-2 2071 | u'augusztus': u'08', 2072 | u'szeptember': u'09', 2073 | u'okt\u00f3ber': u'10', # f3 in iso-8859-2 2074 | u'november': u'11', 2075 | u'december': u'12', 2076 | } 2077 | 2078 | _hungarian_date_format_re = \ 2079 | re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') 2080 | 2081 | def 
_parse_date_hungarian(dateString):
2082 |     '''Parse a string according to a Hungarian 8-bit date format.'''
2083 |     m = _hungarian_date_format_re.match(dateString)
2084 |     if not m: return
2085 |     try:
2086 |         month = _hungarian_months[m.group(2)]
2087 |         day = m.group(3)
2088 |         if len(day) == 1:
2089 |             day = '0' + day
2090 |         hour = m.group(4)
2091 |         if len(hour) == 1:
2092 |             hour = '0' + hour
2093 |     except:
2094 |         return
2095 |     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
2096 |                 {'year': m.group(1), 'month': month, 'day': day,\
2097 |                  'hour': hour, 'minute': m.group(5),\
2098 |                  'zonediff': m.group(6)}
2099 |     if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
2100 |     return _parse_date_w3dtf(w3dtfdate)
2101 | registerDateHandler(_parse_date_hungarian)
2102 | 
2103 | # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
2104 | # Drake and licensed under the Python license.  Removed all range checking
2105 | # for month, day, hour, minute, and second, since mktime will normalize
2106 | # these later
2107 | def _parse_date_w3dtf(dateString):
2108 |     def __extract_date(m):
2109 |         year = int(m.group('year'))
2110 |         if year < 100:
2111 |             year = 100 * int(time.gmtime()[0] / 100) + int(year)
2112 |         if year < 1000:
2113 |             return 0, 0, 0
2114 |         julian = m.group('julian')
2115 |         if julian:
2116 |             julian = int(julian)
2117 |             month = julian / 30 + 1
2118 |             day = julian % 30 + 1
2119 |             jday = None
2120 |             while jday != julian:
2121 |                 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
2122 |                 jday = time.gmtime(t)[-2]
2123 |                 diff = abs(jday - julian)
2124 |                 if jday > julian:
2125 |                     if diff < day:
2126 |                         day = day - diff
2127 |                     else:
2128 |                         month = month - 1
2129 |                         day = 31
2130 |                 elif jday < julian:
2131 |                     if day + diff < 28:
2132 |                         day = day + diff
2133 |                     else:
2134 |                         month = month + 1
2135 |             return year, month, day
2136 |         month = m.group('month')
2137 |         day = 1
2138 |         if month is None:
2139 |             month = 1
2140 |         else:
2141 |             month = int(month)
2142 |             day = m.group('day')
2143 |             if day:
2144 |                 day = int(day)
2145 |             else:
2146 |                 day = 1
2147 |         return year, month, day
2148 | 
2149 |     def __extract_time(m):
2150 |         if not m:
2151 |             return 0, 0, 0
2152 |         hours = m.group('hours')
2153 |         if not hours:
2154 |             return 0, 0, 0
2155 |         hours = int(hours)
2156 |         minutes = int(m.group('minutes'))
2157 |         seconds = m.group('seconds')
2158 |         if seconds:
2159 |             seconds = int(seconds)
2160 |         else:
2161 |             seconds = 0
2162 |         return hours, minutes, seconds
2163 | 
2164 |     def __extract_tzd(m):
2165 |         '''Return the Time Zone Designator as an offset in seconds from UTC.'''
2166 |         if not m:
2167 |             return 0
2168 |         tzd = m.group('tzd')
2169 |         if not tzd:
2170 |             return 0
2171 |         if tzd == 'Z':
2172 |             return 0
2173 |         hours = int(m.group('tzdhours'))
2174 |         minutes = m.group('tzdminutes')
2175 |         if minutes:
2176 |             minutes = int(minutes)
2177 |         else:
2178 |             minutes = 0
2179 |         offset = (hours*60 + minutes) * 60
2180 |         if tzd[0] == '+':
2181 |             return -offset
2182 |         return offset
2183 | 
2184 |     __date_re = ('(?P<year>\d\d\d\d)'
2185 |                  '(?:(?P<dsep>-|)'
2186 |                  '(?:(?P<julian>\d\d\d)'
2187 |                  '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
2188 |     __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
2189 |     __tzd_rx = re.compile(__tzd_re)
2190 |     __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
2191 |                  '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
2192 |                  + __tzd_re)
2193 |     __datetime_re = '%s(?:T%s)?'
% (__date_re, __time_re) 2194 | __datetime_rx = re.compile(__datetime_re) 2195 | m = __datetime_rx.match(dateString) 2196 | if (m is None) or (m.group() != dateString): return 2197 | gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) 2198 | if gmt[0] == 0: return 2199 | return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) 2200 | registerDateHandler(_parse_date_w3dtf) 2201 | 2202 | def _parse_date_rfc822(dateString): 2203 | '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' 2204 | data = dateString.split() 2205 | if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: 2206 | del data[0] 2207 | if len(data) == 4: 2208 | s = data[3] 2209 | i = s.find('+') 2210 | if i > 0: 2211 | data[3:] = [s[:i], s[i+1:]] 2212 | else: 2213 | data.append('') 2214 | dateString = " ".join(data) 2215 | if len(data) < 5: 2216 | dateString += ' 00:00:00 GMT' 2217 | tm = rfc822.parsedate_tz(dateString) 2218 | if tm: 2219 | return time.gmtime(rfc822.mktime_tz(tm)) 2220 | # rfc822.py defines several time zones, but we define some extra ones. 2221 | # 'ET' is equivalent to 'EST', etc. 2222 | _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} 2223 | rfc822._timezones.update(_additional_timezones) 2224 | registerDateHandler(_parse_date_rfc822) 2225 | 2226 | def _parse_date(dateString): 2227 | '''Parses a variety of date formats into a 9-tuple in GMT''' 2228 | for handler in _date_handlers: 2229 | try: 2230 | date9tuple = handler(dateString) 2231 | if not date9tuple: continue 2232 | if len(date9tuple) != 9: 2233 | if _debug: sys.stderr.write('date handler function must return 9-tuple\n') 2234 | raise ValueError 2235 | map(int, date9tuple) 2236 | return date9tuple 2237 | except Exception, e: 2238 | if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) 2239 | pass 2240 | return None 2241 | 2242 | def _getCharacterEncoding(http_headers, xml_data): 2243 | '''Get the character encoding of the XML document 2244 | 2245 | http_headers is a dictionary 2246 | xml_data is a raw string (not Unicode) 2247 | 2248 | This is so much trickier than it sounds, it's not even funny. 2249 | According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type 2250 | is application/xml, application/*+xml, 2251 | application/xml-external-parsed-entity, or application/xml-dtd, 2252 | the encoding given in the charset parameter of the HTTP Content-Type 2253 | takes precedence over the encoding given in the XML prefix within the 2254 | document, and defaults to 'utf-8' if neither are specified. But, if 2255 | the HTTP Content-Type is text/xml, text/*+xml, or 2256 | text/xml-external-parsed-entity, the encoding given in the XML prefix 2257 | within the document is ALWAYS IGNORED and only the encoding given in 2258 | the charset parameter of the HTTP Content-Type header should be 2259 | respected, and it defaults to 'us-ascii' if not specified. 2260 | 2261 | Furthermore, discussion on the atom-syntax mailing list with the 2262 | author of RFC 3023 leads me to the conclusion that any document 2263 | served with a Content-Type of text/* and no charset parameter 2264 | must be treated as us-ascii. (We now do this.) And also that it 2265 | must always be flagged as non-well-formed. (We now do this too.) 
2266 | 2267 | If Content-Type is unspecified (input was local file or non-HTTP source) 2268 | or unrecognized (server just got it totally wrong), then go by the 2269 | encoding given in the XML prefix of the document and default to 2270 | 'iso-8859-1' as per the HTTP specification (RFC 2616). 2271 | 2272 | Then, assuming we didn't find a character encoding in the HTTP headers 2273 | (and the HTTP Content-type allowed us to look in the body), we need 2274 | to sniff the first few bytes of the XML data and try to determine 2275 | whether the encoding is ASCII-compatible. Section F of the XML 2276 | specification shows the way here: 2277 | http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info 2278 | 2279 | If the sniffed encoding is not ASCII-compatible, we need to make it 2280 | ASCII compatible so that we can sniff further into the XML declaration 2281 | to find the encoding attribute, which will tell us the true encoding. 2282 | 2283 | Of course, none of this guarantees that we will be able to parse the 2284 | feed in the declared character encoding (assuming it was declared 2285 | correctly, which many are not). CJKCodecs and iconv_codec help a lot; 2286 | you should definitely install them if you can. 2287 | http://cjkpython.i18n.org/ 2288 | ''' 2289 | 2290 | def _parseHTTPContentType(content_type): 2291 | '''takes HTTP Content-Type header and returns (content type, charset) 2292 | 2293 | If no charset is specified, returns (content type, '') 2294 | If no content type is specified, returns ('', '') 2295 | Both return parameters are guaranteed to be lowercase strings 2296 | ''' 2297 | content_type = content_type or '' 2298 | content_type, params = cgi.parse_header(content_type) 2299 | return content_type, params.get('charset', '').replace("'", '') 2300 | 2301 | sniffed_xml_encoding = '' 2302 | xml_encoding = '' 2303 | true_encoding = '' 2304 | http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) 2305 | # Must sniff for non-ASCII-compatible character encodings before 2306 | # searching for XML declaration. 
This heuristic is defined in 2307 | # section F of the XML specification: 2308 | # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info 2309 | try: 2310 | if xml_data[:4] == '\x4c\x6f\xa7\x94': 2311 | # EBCDIC 2312 | xml_data = _ebcdic_to_ascii(xml_data) 2313 | elif xml_data[:4] == '\x00\x3c\x00\x3f': 2314 | # UTF-16BE 2315 | sniffed_xml_encoding = 'utf-16be' 2316 | xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') 2317 | elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): 2318 | # UTF-16BE with BOM 2319 | sniffed_xml_encoding = 'utf-16be' 2320 | xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') 2321 | elif xml_data[:4] == '\x3c\x00\x3f\x00': 2322 | # UTF-16LE 2323 | sniffed_xml_encoding = 'utf-16le' 2324 | xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') 2325 | elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): 2326 | # UTF-16LE with BOM 2327 | sniffed_xml_encoding = 'utf-16le' 2328 | xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') 2329 | elif xml_data[:4] == '\x00\x00\x00\x3c': 2330 | # UTF-32BE 2331 | sniffed_xml_encoding = 'utf-32be' 2332 | xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') 2333 | elif xml_data[:4] == '\x3c\x00\x00\x00': 2334 | # UTF-32LE 2335 | sniffed_xml_encoding = 'utf-32le' 2336 | xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') 2337 | elif xml_data[:4] == '\x00\x00\xfe\xff': 2338 | # UTF-32BE with BOM 2339 | sniffed_xml_encoding = 'utf-32be' 2340 | xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') 2341 | elif xml_data[:4] == '\xff\xfe\x00\x00': 2342 | # UTF-32LE with BOM 2343 | sniffed_xml_encoding = 'utf-32le' 2344 | xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') 2345 | elif xml_data[:3] == '\xef\xbb\xbf': 2346 | # UTF-8 with BOM 2347 | sniffed_xml_encoding = 'utf-8' 2348 | xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') 2349 | else: 2350 | # ASCII-compatible 2351 | pass 2352 | xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) 2353 | except: 2354 | xml_encoding_match = None 2355 | if xml_encoding_match: 2356 | xml_encoding = xml_encoding_match.groups()[0].lower() 2357 | if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): 2358 | xml_encoding = sniffed_xml_encoding 2359 | acceptable_content_type = 0 2360 | application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') 2361 | text_content_types = ('text/xml', 'text/xml-external-parsed-entity') 2362 | if (http_content_type in application_content_types) or \ 2363 | (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): 2364 | acceptable_content_type = 1 2365 | true_encoding = http_encoding or xml_encoding or 'utf-8' 2366 | elif (http_content_type in text_content_types) or \ 2367 | (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): 2368 | acceptable_content_type = 1 2369 | true_encoding = http_encoding or 'us-ascii' 2370 | elif http_content_type.startswith('text/'): 2371 | true_encoding = http_encoding or 'us-ascii' 2372 | elif http_headers and (not http_headers.has_key('content-type')): 2373 | true_encoding = xml_encoding or 'iso-8859-1' 2374 | else: 2375 | true_encoding = xml_encoding or 'utf-8' 2376 | return true_encoding, http_encoding, xml_encoding, 
sniffed_xml_encoding, acceptable_content_type
2377 | 
2378 | def _toUTF8(data, encoding):
2379 |     '''Changes an XML data stream on the fly to specify a new encoding
2380 | 
2381 |     data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
2382 |     encoding is a string recognized by encodings.aliases
2383 |     '''
2384 |     if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
2385 |     # strip Byte Order Mark (if present)
2386 |     if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
2387 |         if _debug:
2388 |             sys.stderr.write('stripping BOM\n')
2389 |             if encoding != 'utf-16be':
2390 |                 sys.stderr.write('trying utf-16be instead\n')
2391 |         encoding = 'utf-16be'
2392 |         data = data[2:]
2393 |     elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
2394 |         if _debug:
2395 |             sys.stderr.write('stripping BOM\n')
2396 |             if encoding != 'utf-16le':
2397 |                 sys.stderr.write('trying utf-16le instead\n')
2398 |         encoding = 'utf-16le'
2399 |         data = data[2:]
2400 |     elif data[:3] == '\xef\xbb\xbf':
2401 |         if _debug:
2402 |             sys.stderr.write('stripping BOM\n')
2403 |             if encoding != 'utf-8':
2404 |                 sys.stderr.write('trying utf-8 instead\n')
2405 |         encoding = 'utf-8'
2406 |         data = data[3:]
2407 |     elif data[:4] == '\x00\x00\xfe\xff':
2408 |         if _debug:
2409 |             sys.stderr.write('stripping BOM\n')
2410 |             if encoding != 'utf-32be':
2411 |                 sys.stderr.write('trying utf-32be instead\n')
2412 |         encoding = 'utf-32be'
2413 |         data = data[4:]
2414 |     elif data[:4] == '\xff\xfe\x00\x00':
2415 |         if _debug:
2416 |             sys.stderr.write('stripping BOM\n')
2417 |             if encoding != 'utf-32le':
2418 |                 sys.stderr.write('trying utf-32le instead\n')
2419 |         encoding = 'utf-32le'
2420 |         data = data[4:]
2421 |     newdata = unicode(data, encoding)
2422 |     if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
2423 |     declmatch = re.compile('^<\?xml[^>]*?>')
2424 |     newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
2425 |     if declmatch.search(newdata):
2426 |         newdata = declmatch.sub(newdecl, newdata)
2427 |     else:
2428 |         newdata = newdecl + u'\n' + newdata
2429 |     return newdata.encode('utf-8')
2430 | 
2431 | def _stripDoctype(data):
2432 |     '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2433 | 
2434 |     rss_version may be 'rss091n' or None
2435 |     stripped_data is the same XML document, minus the DOCTYPE
2436 |     '''
2437 |     entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2438 |     data = entity_pattern.sub('', data)
2439 |     doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2440 |     doctype_results = doctype_pattern.findall(data)
2441 |     doctype = doctype_results and doctype_results[0] or ''
2442 |     if doctype.lower().count('netscape'):
2443 |         version = 'rss091n'
2444 |     else:
2445 |         version = None
2446 |     data = doctype_pattern.sub('', data)
2447 |     return version, data
2448 | 
2449 | def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2450 |     '''Parse a feed from a URL, file, stream, or string'''
2451 |     result = FeedParserDict()
2452 |     result['feed'] = FeedParserDict()
2453 |     result['entries'] = []
2454 |     if _XML_AVAILABLE:
2455 |         result['bozo'] = 0
2456 |     if type(handlers) == types.InstanceType:
2457 |         handlers = [handlers]
2458 |     try:
2459 |         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2460 |         data = f.read()
2461 |     except Exception, e:
2462 |         result['bozo'] = 1
2463 |         result['bozo_exception'] = e
2464 |         data = ''
2465 |         f = None
2466 | 
2467 |     # if feed is
gzip-compressed, decompress it
2468 |     if f and data and hasattr(f, 'headers'):
2469 |         if gzip and f.headers.get('content-encoding', '') == 'gzip':
2470 |             try:
2471 |                 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2472 |             except Exception, e:
2473 |                 # Some feeds claim to be gzipped but they're not, so
2474 |                 # we get garbage.  Ideally, we should re-request the
2475 |                 # feed without the 'Accept-encoding: gzip' header,
2476 |                 # but we don't.
2477 |                 result['bozo'] = 1
2478 |                 result['bozo_exception'] = e
2479 |                 data = ''
2480 |         elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2481 |             try:
2482 |                 data = zlib.decompress(data, -zlib.MAX_WBITS)
2483 |             except Exception, e:
2484 |                 result['bozo'] = 1
2485 |                 result['bozo_exception'] = e
2486 |                 data = ''
2487 | 
2488 |     # save HTTP headers
2489 |     if hasattr(f, 'info'):
2490 |         info = f.info()
2491 |         result['etag'] = info.getheader('ETag')
2492 |         last_modified = info.getheader('Last-Modified')
2493 |         if last_modified:
2494 |             result['modified'] = _parse_date(last_modified)
2495 |     if hasattr(f, 'url'):
2496 |         result['href'] = f.url
2497 |         result['status'] = 200
2498 |     if hasattr(f, 'status'):
2499 |         result['status'] = f.status
2500 |     if hasattr(f, 'headers'):
2501 |         result['headers'] = f.headers.dict
2502 |     if hasattr(f, 'close'):
2503 |         f.close()
2504 | 
2505 |     # there are four encodings to keep track of:
2506 |     # - http_encoding is the encoding declared in the Content-Type HTTP header
2507 |     # - xml_encoding is the encoding declared in the <?xml declaration
2674 | # ; changed project name
2675 | #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
2676 | #  removed unnecessary urllib code -- urllib2 should always be available anyway;
2677 | #  return actual url, status, and full HTTP headers (as result['url'],
2678 | #  result['status'], and result['headers']) if parsing a remote feed over HTTP --
2679 | #  this should pass all the HTTP tests at ;
2680 | #  added the latest namespace-of-the-week for RSS 2.0
2681 | #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
2682 | #  User-Agent (otherwise urllib2 sends two, which confuses some servers)
2683 | #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
2684 | #  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
2685 | #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
2686 | #  textInput, and also to return the character encoding (if specified)
2687 | #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
2688 | #  nested divs within content (JohnD); fixed missing sys import (JohanS);
2689 | #  fixed regular expression to capture XML character encoding (Andrei);
2690 | #  added support for Atom 0.3-style links; fixed bug with textInput tracking;
2691 | #  added support for cloud (MartijnP); added support for multiple
2692 | #  category/dc:subject (MartijnP); normalize content model: 'description' gets
2693 | #  description (which can come from description, summary, or full content if no
2694 | #  description), 'content' gets dict of base/language/type/value (which can come
2695 | #  from content:encoded, xhtml:body, content, or fullitem);
2696 | #  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
2697 | #  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
2698 | #  element is not in default namespace (like Pocketsoap feed);
2699 | #  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
2700 | #  wfw:commentRSS; resolve relative URLs within embedded
HTML markup in
2701 | #  description, xhtml:body, content, content:encoded, title, subtitle,
2702 | #  summary, info, tagline, and copyright; added support for pingback and
2703 | #  trackback namespaces
2704 | #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
2705 | #  namespaces, as opposed to 2.6 when I said I did but didn't really;
2706 | #  sanitize HTML markup within some elements; added mxTidy support (if
2707 | #  installed) to tidy HTML markup within some elements; fixed indentation
2708 | #  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
2709 | #  (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
2710 | #  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
2711 | #  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
2712 | #  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
2713 | #2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '.  fixed memory
2714 | #  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
2715 | #  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
2716 | #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/>
tags in 2717 | # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL); 2718 | # fixed relative URI processing for guid (skadz); added ICBM support; added 2719 | # base64 support 2720 | #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many 2721 | # blogspot.com sites); added _debug variable 2722 | #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing 2723 | #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available); 2724 | # added several new supported namespaces; fixed bug tracking naked markup in 2725 | # description; added support for enclosure; added support for source; re-added 2726 | # support for cloud which got dropped somehow; added support for expirationDate 2727 | #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking 2728 | # xml:base URI, one for documents that don't define one explicitly and one for 2729 | # documents that define an outer and an inner xml:base that goes out of scope 2730 | # before the end of the document 2731 | #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level 2732 | #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version'] 2733 | # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized; 2734 | # added support for creativeCommons:license and cc:license; added support for 2735 | # full Atom content model in title, tagline, info, copyright, summary; fixed bug 2736 | # with gzip encoding (not always telling server we support it when we do) 2737 | #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail 2738 | # (dictionary of 'name', 'url', 'email'); map author to author_detail if author 2739 | # contains name + email address 2740 | #3.0b8 - 1/28/2004 - MAP - added support for contributor 2741 | #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added 2742 | # support for summary 2743 | #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from 2744 | # xml.util.iso8601 2745 | #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain 2746 | # dangerous markup; fiddled with decodeEntities (not right); liberalized 2747 | # date parsing even further 2748 | #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right); 2749 | # added support to Atom 0.2 subtitle; added support for Atom content model 2750 | # in copyright; better sanitizing of dangerous HTML elements with end tags 2751 | # (script, frameset) 2752 | #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img, 2753 | # etc.) in embedded markup, in either HTML or XHTML form (
<br>,
<br/>, <br />
) 2754 | #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under 2755 | # Python 2.1 2756 | #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS; 2757 | # fixed bug capturing author and contributor URL; fixed bug resolving relative 2758 | # links in author and contributor URL; fixed bug resolvin relative links in 2759 | # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's 2760 | # namespace tests, and included them permanently in the test suite with his 2761 | # permission; fixed namespace handling under Python 2.1 2762 | #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15) 2763 | #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023 2764 | #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei); 2765 | # use libxml2 (if available) 2766 | #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author 2767 | # name was in parentheses; removed ultra-problematic mxTidy support; patch to 2768 | # workaround crash in PyXML/expat when encountering invalid entities 2769 | # (MarkMoraes); support for textinput/textInput 2770 | #3.0b20 - 4/7/2004 - MAP - added CDF support 2771 | #3.0b21 - 4/14/2004 - MAP - added Hot RSS support 2772 | #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in 2773 | # results dict; changed results dict to allow getting values with results.key 2774 | # as well as results[key]; work around embedded illformed HTML with half 2775 | # a DOCTYPE; work around malformed Content-Type header; if character encoding 2776 | # is wrong, try several common ones before falling back to regexes (if this 2777 | # works, bozo_exception is set to CharacterEncodingOverride); fixed character 2778 | # encoding issues in BaseHTMLProcessor by tracking encoding and converting 2779 | # from Unicode to raw strings before feeding data to sgmllib.SGMLParser; 2780 | # convert each value in results to Unicode (if possible), even if using 2781 | # regex-based parsing 2782 | #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain 2783 | # high-bit characters in attributes in embedded HTML in description (thanks 2784 | # Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in 2785 | # FeedParserDict; tweaked FeedParserDict.has_key to return True if asking 2786 | # about a mapped key 2787 | #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and 2788 | # results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could 2789 | # cause the same encoding to be tried twice (even if it failed the first time); 2790 | # fixed DOCTYPE stripping when DOCTYPE contained entity declarations; 2791 | # better textinput and image tracking in illformed RSS 1.0 feeds 2792 | #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed 2793 | # my blink tag tests 2794 | #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that 2795 | # failed to parse utf-16 encoded feeds; made source into a FeedParserDict; 2796 | # duplicate admin:generatorAgent/@rdf:resource in generator_detail.url; 2797 | # added support for image; refactored parse() fallback logic to try other 2798 | # encodings if SAX parsing fails (previously it would only try other encodings 2799 | # if re-encoding failed); remove unichr madness in normalize_attrs now that 2800 | # we're properly tracking encoding in and out of BaseHTMLProcessor; set 2801 | # feed.language from root-level xml:lang; set 
entry.id from rdf:about; 2802 | # send Accept header 2803 | #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between 2804 | # iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are 2805 | # windows-1252); fixed regression that could cause the same encoding to be 2806 | # tried twice (even if it failed the first time) 2807 | #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; 2808 | # recover from malformed content-type header parameter with no equals sign 2809 | # ('text/xml; charset:iso-8859-1') 2810 | #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities 2811 | # to Unicode equivalents in illformed feeds (aaronsw); added and 2812 | # passed tests for converting character entities to Unicode equivalents 2813 | # in illformed feeds (aaronsw); test for valid parsers when setting 2814 | # XML_AVAILABLE; make version and encoding available when server returns 2815 | # a 304; add handlers parameter to pass arbitrary urllib2 handlers (like 2816 | # digest auth or proxy support); add code to parse username/password 2817 | # out of url and send as basic authentication; expose downloading-related 2818 | # exceptions in bozo_exception (aaronsw); added __contains__ method to 2819 | # FeedParserDict (aaronsw); added publisher_detail (aaronsw) 2820 | #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always 2821 | # convert feed to UTF-8 before passing to XML parser; completely revamped 2822 | # logic for determining character encoding and attempting XML parsing 2823 | # (much faster); increased default timeout to 20 seconds; test for presence 2824 | # of Location header on redirects; added tests for many alternate character 2825 | # encodings; support various EBCDIC encodings; support UTF-16BE and 2826 | # UTF16-LE with or without a BOM; support UTF-8 with a BOM; support 2827 | # UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no 2828 | # XML parsers are available; added support for 'Content-encoding: deflate'; 2829 | # send blank 'Accept-encoding: ' header if neither gzip nor zlib modules 2830 | # are available 2831 | #3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure 2832 | # problem tracking xml:base and xml:lang if element declares it, child 2833 | # doesn't, first grandchild redeclares it, and second grandchild doesn't; 2834 | # refactored date parsing; defined public registerDateHandler so callers 2835 | # can add support for additional date formats at runtime; added support 2836 | # for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added 2837 | # zopeCompatibilityHack() which turns FeedParserDict into a regular 2838 | # dictionary, required for Zope compatibility, and also makes command- 2839 | # line debugging easier because pprint module formats real dictionaries 2840 | # better than dictionary-like objects; added NonXMLContentType exception, 2841 | # which is stored in bozo_exception when a feed is served with a non-XML 2842 | # media type such as 'text/plain'; respect Content-Language as default 2843 | # language if not xml:lang is present; cloud dict is now FeedParserDict; 2844 | # generator dict is now FeedParserDict; better tracking of xml:lang, 2845 | # including support for xml:lang='' to unset the current language; 2846 | # recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default 2847 | # namespace; don't overwrite final status on redirects (scenarios: 2848 | # redirecting to a URL that returns 304, redirecting to a 
URL that 2849 | # redirects to another URL with a different type of redirect); add 2850 | # support for HTTP 303 redirects 2851 | #4.0 - MAP - support for relative URIs in xml:base attribute; fixed 2852 | # encoding issue with mxTidy (phopkins); preliminary support for RFC 3229; 2853 | # support for Atom 1.0; support for iTunes extensions; new 'tags' for 2854 | # categories/keywords/etc. as array of dict 2855 | # {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0 2856 | # terminology; parse RFC 822-style dates with no time; lots of other 2857 | # bug fixes 2858 | #4.1 - MAP - removed socket timeout; added support for chardet library -------------------------------------------------------------------------------- /index.yaml: -------------------------------------------------------------------------------- 1 | indexes: 2 | 3 | - kind: Subscription 4 | properties: 5 | - name: jid 6 | - name: created_at 7 | direction: desc 8 | 9 | - kind: Subscription 10 | properties: 11 | - name: jid 12 | - name: feed 13 | direction: asc 14 | 15 | # AUTOGENERATED 16 | 17 | # This index.yaml is automatically updated whenever the dev_appserver 18 | # detects that a new type of query is run. If you want to manage the 19 | # index.yaml file manually, remove the above marker line (the line 20 | # saying "# AUTOGENERATED"). If you want to manage some indexes 21 | # manually, move them above the marker line. The index.yaml file is 22 | # automatically uploaded to the admin console when you next deploy 23 | # your application using appcfg.py. 24 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | import base64 4 | import urllib 5 | import logging 6 | import feedparser 7 | from google.appengine.api import xmpp 8 | from google.appengine.ext import webapp 9 | from google.appengine.ext.webapp.util import run_wsgi_app 10 | from google.appengine.ext.webapp import xmpp_handlers 11 | from google.appengine.ext.webapp import template 12 | from google.appengine.ext import db 13 | from google.appengine.api import urlfetch 14 | from google.appengine.runtime import apiproxy_errors 15 | from google.appengine.api.app_identity import get_application_id 16 | from google.appengine.api import memcache 17 | 18 | import extractlinks 19 | from extractlinks import LinkExtractor 20 | 21 | SUPERFEEDR_LOGIN = "" 22 | SUPERFEEDR_PASSWORD = "" 23 | appname = get_application_id() 24 | 25 | ## 26 | # the function that sends subscriptions/unsubscriptions to Superfeedr 27 | def superfeedr(mode, subscription): 28 | post_data = { 29 | 'hub.mode' : mode, 30 | 'hub.callback' : "http://" + appname + ".appspot.com/hubbub/" + subscription.key().name(), 31 | 'hub.topic' : subscription.feed, 32 | 'hub.verify' : 'async', 33 | 'hub.verify_token' : '', 34 | } 35 | base64string = base64.encodestring('%s:%s' % (SUPERFEEDR_LOGIN, SUPERFEEDR_PASSWORD))[:-1] 36 | form_data = urllib.urlencode(post_data) 37 | result = urlfetch.fetch(url="http://superfeedr.com/hubbub", 38 | payload=form_data, 39 | method=urlfetch.POST, 40 | headers={"Authorization": "Basic "+ base64string, 'Content-Type': 'application/x-www-form-urlencoded'}, 41 | deadline=10) 42 | # logging.info('Result of %s to %s => %s (%d)',mode, subscription.feed, result.content, result.status_code ) 43 | 44 | return result 45 | 46 | 47 | ## 48 | # The subscription model that matches a feed and a jid. 
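# An illustrative sketch, not in the original file: a Subscription's key_name
# (a SHA-224 digest of feed URL + jid, computed in XMPPHandler below) doubles
# as the secret path segment of the hub.callback URL that superfeedr() sends.
# Assuming a hypothetical feed/jid pair, the lifecycle looks like this:
#
#     sub = Subscription(key_name=hashlib.sha224(feed + jid).hexdigest(),
#                        feed=feed, jid=jid)
#     sub.put()
#     result = superfeedr("subscribe", sub)
#     # Superfeedr replies 204 (subscribed) or 202 (verification pending);
#     # notifications then arrive as POSTs to /hubbub/<key_name>.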
49 | class Subscription(db.Model):
50 |     feed = db.LinkProperty(required=True)
51 |     jid = db.StringProperty(required=True)
52 |     created_at = db.DateTimeProperty(required=True, auto_now_add=True)
53 | 
54 | ##
55 | # The subscribe page. Useful for those who want to subscribe easily via a web page.
56 | class SubscribePage(webapp.RequestHandler):
57 | 
58 |     def get(self):
59 |         feeds = []
60 |         if self.request.get("resource"):
61 |             feeds = memcache.get(self.request.get("resource"))
62 |             if feeds is not None:
63 |                 # good
64 |                 logging.debug("Memcache hit.")
65 |             else:
66 |                 logging.debug("Memcache miss.")
67 |                 try:
68 |                     result = urlfetch.fetch(url=self.request.get("resource"), deadline=10)
69 |                     parser = LinkExtractor()
70 |                     parser.set_base_url(self.request.get("resource"))
71 |                     parser.feed(result.content)
72 |                     if parser.links:
73 |                         feeds = parser.links
74 |                     else:
75 |                         feeds = []
76 | 
77 |                     if not feeds:
78 |                         # Maybe the resource is itself a feed rather than an HTML page?
79 |                         data = feedparser.parse(result.content)
80 |                         mimeType = "application/atom+xml"
81 |                         href = self.request.get("resource")
82 |                         if data.version.startswith("atom"):  # avoids the re module, which this file never imports
83 |                             mimeType = "application/atom+xml"
84 |                         feeds = [{'title': data.feed.title, 'rel': 'self', 'type': mimeType, 'href': href}]
85 | 
86 |                 except:
87 |                     feeds = []
88 | 
89 |                 if not memcache.set(self.request.get("resource"), feeds, 86400):
90 |                     logging.error("Memcache set failed.")
91 |                 else:
92 |                     logging.debug("Memcache set.")
93 | 
94 |         self.response.out.write(template.render(os.path.join(os.path.dirname(__file__), 'templates', "subscribe.html"), {'appname': appname, 'feeds': feeds}))
95 | 
96 | ##
97 | # The web app interface
98 | class MainPage(webapp.RequestHandler):
99 | 
100 |     def get(self):
101 |         self.redirect('http://blog.superfeedr.com/notifixlight/')
102 | 
103 | ##
104 | # The HubbubSubscriber
105 | class HubbubSubscriber(webapp.RequestHandler):
106 | 
107 |     ##
108 |     # Called upon notification
109 |     def post(self, feed_sekret):
110 |         subscription = None
111 |         try: subscription = Subscription.get_by_key_name(feed_sekret)
112 |         except apiproxy_errors.OverQuotaError, error_message:
113 |             logging.error(error_message)
114 |             pass
115 |         if subscription is None:
116 |             if self.request.get("hub.mode") == "unsubscribe":
117 |                 # Let superfeedr unsubscribe this,
118 |                 # even though we have no record of it.
119 |                 self.response.set_status(200)
120 |                 self.response.out.write(self.request.get('hub.challenge'))
121 |             else:
122 |                 self.response.set_status(404)
123 |                 self.response.out.write("Sorry, no feed.")
124 |         else:
125 |             body = self.request.body.decode('utf-8')
126 |             data = feedparser.parse(self.request.body)
127 |             logging.info('Found %d entries in %s', len(data.entries), subscription.feed)
128 |             try:
129 |                 feed_title = data.feed.title
130 |             except AttributeError:
131 |                 feed_title = ''
132 |             for entry in data.entries:
133 |                 link = entry.get('link', '')
134 |                 title = entry.get('title', '')
135 |                 logging.info('Found entry with title = "%s", '
136 |                              'link = "%s"',
137 |                              title, link)
138 |                 user_address = subscription.jid
139 |                 msg = "'" + feed_title + "' : " + title + "\n" + link
140 |                 status_code = xmpp.send_message(user_address, msg)
141 |             self.response.set_status(200)
142 |             self.response.out.write("Alright. Saved.")
143 | 
144 |     def get(self, feed_sekret):
145 |         subscription = None
146 |         try: subscription = Subscription.get_by_key_name(feed_sekret)
147 |         except apiproxy_errors.OverQuotaError, error_message:
148 |             logging.error(error_message)
149 |             pass
150 |         if subscription is None:
151 |             if self.request.get("hub.mode") == "unsubscribe":
152 |                 # Let superfeedr unsubscribe this,
153 |                 # even though we have no record of it.
154 |                 self.response.set_status(200)
155 |                 self.response.out.write(self.request.get('hub.challenge'))
156 |             else:
157 |                 self.response.set_status(404)
158 |                 self.response.out.write("Sorry, no feed.")
159 |         else:
160 |             # Let's confirm to the subscriber that he'll get notifications for this feed.
161 |             user_address = subscription.jid
162 |             if self.request.get("hub.mode") == "subscribe":
163 |                 msg = "You're now subscribed to " + subscription.feed
164 |                 xmpp.send_message(user_address, msg)
165 |                 self.response.out.write(self.request.get('hub.challenge'))
166 |                 self.response.set_status(200)
167 |             elif self.request.get("hub.mode") == "unsubscribe":
168 |                 msg = "You're no longer subscribed to " + subscription.feed
169 |                 xmpp.send_message(user_address, msg)
170 |                 self.response.out.write(self.request.get('hub.challenge'))
171 |                 self.response.set_status(200)
172 | 
173 | ##
174 | # The XMPP App interface
175 | class XMPPHandler(xmpp_handlers.CommandHandler):
176 | 
177 |     # Asking to subscribe to a feed
178 |     def subscribe_command(self, message=None):
179 |         message = xmpp.Message(self.request.POST)
180 |         subscriber = message.sender.rpartition("/")[0]
181 |         subscription = Subscription(key_name=hashlib.sha224(message.arg + subscriber).hexdigest(), feed=message.arg, jid=subscriber)
182 |         subscription.put() # saves the subscription
183 |         result = superfeedr("subscribe", subscription)
184 |         if result.status_code == 204:
185 |             # logging.info("Subscription success! %s", message.arg)
186 |             message.reply("Successfully subscribed to " + message.arg + "!")
187 |         elif result.status_code == 202:
188 |             message.reply("Subscribing to " + message.arg + ", you should get a confirmation soon.")
189 |         else:
190 |             message.reply("Could not subscribe to " + message.arg + ", looks like AppEngine got a small glitch. Please try again!")
191 |             logging.error("Sorry, couldn't subscribe (Status %s - Error %s) to %s", result.status_code, result.content, message.arg)
192 | 
193 |     ##
194 |     # Asking to unsubscribe from a feed
195 |     def unsubscribe_command(self, message=None):
196 |         message = xmpp.Message(self.request.POST)
197 |         subscriber = message.sender.rpartition("/")[0]
198 |         if message.arg == "all":
199 |             query = Subscription.all().filter("jid =", subscriber).order("feed")
200 |             subscriptions = query.fetch(query.count() + 1)
201 |             # one batch delete is enough; deleting each entity in a loop first
202 |             # would just issue a redundant second delete for every subscription
203 |             db.delete(subscriptions)
204 |             message.reply("Well done! We deleted all your subscriptions!")
205 |         else:
206 |             subscription = Subscription.get_by_key_name(hashlib.sha224(message.arg + subscriber).hexdigest())
207 |             if subscription is None:
208 |                 message.reply("Looks like you were not subscribed to " + message.arg)
209 |             else:
210 |                 result = superfeedr("unsubscribe", subscription)
211 |                 subscription.delete() # deletes the subscription
212 |                 message.reply("Well done! You're no longer subscribed to " + message.arg)
213 | 
214 |     ##
215 |     # List subscriptions by page,
216 |     # 100 per page,
217 |     # page defaults to 1
218 |     def list_command(self, message=None):
219 |         message = xmpp.Message(self.request.POST)
220 |         subscriber = message.sender.rpartition("/")[0]
221 |         query = Subscription.all().filter("jid =", subscriber).order("feed")
222 |         count = query.count()
223 |         if count == 0:
224 |             message.reply("Seems you haven't subscribed to anything yet. Type\n /subscribe http://twitter.com/statuses/user_timeline/43417156.rss\nto play around.")
225 |         else:
226 |             page_index = int(message.arg or 1)
227 |             if count % 100 == 0:
228 |                 pages_count = count / 100
229 |             else:
230 |                 pages_count = count / 100 + 1
231 | 
232 |             page_index = min(page_index, pages_count)
233 |             offset = (page_index - 1) * 100
234 |             subscriptions = query.fetch(100, offset)
235 |             message.reply("You have %d subscriptions in total: page %d/%d \n" % (count, page_index, pages_count))
236 |             feed_list = [s.feed for s in subscriptions]
237 |             message.reply("\n".join(feed_list))
238 | 
239 |     ##
240 |     # Saying hello
241 |     def hello_command(self, message=None):
242 |         message = xmpp.Message(self.request.POST)
243 |         message.reply("Oh, Hai! " + appname
244 |             + " is a small app to help you subscribe to your favorite feeds and get their updates via IM. It's powered by Superfeedr (http://superfeedr.com) and its magic powers!")
245 |         message.reply("Make it better: http://github.com/superfeedr/notifixlight.")
246 |         message.reply("For more info, type /help.")
247 | 
248 |     ##
249 |     # Asking for help
250 |     def help_command(self, message=None):
251 |         message = xmpp.Message(self.request.POST)
252 |         help_msg = "It's not even alpha ready, but you could play with the following commands:\n\n" \
253 |             "/hello -> about me\n\n" \
254 |             "/subscribe <feed url>\n/unsubscribe <feed url> -> subscribe to or unsubscribe from a feed\n\n" \
255 |             "/list -> list subscriptions, defaults to page 1\n\n" \
256 |             "/help -> get this help message\n"
257 |         message.reply(help_msg)
258 |         message.reply(message.body)
259 | 
260 |     ##
261 |     # All other commands
262 |     def unhandled_command(self, message=None):
263 |         message = xmpp.Message(self.request.POST)
264 |         message.reply("Please, type /help for help.")
265 | 
266 |     ##
267 |     # Sent for any message.
268 |     def text_message(self, message=None):
269 |         message = xmpp.Message(self.request.POST)
270 |         message.reply("Echooooo (when you're done playing, type /help) > " + message.body)
271 | 
272 | application = webapp.WSGIApplication([
273 |     ('/_ah/xmpp/message/chat/', XMPPHandler),
274 |     ('/', MainPage),
275 |     ('/subscribe', SubscribePage),
276 |     ('/hubbub/(.*)', HubbubSubscriber)
277 |     ], debug=True)
278 | 
279 | def main():
280 |     run_wsgi_app(application)
281 | 
282 | if __name__ == "__main__":
283 |     main()
284 | 
285 | 
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 | 
3 | 
4 | 
5 | Notifixlite
6 | 
35 | 
36 | 
37 |

Using Jabber: Say /hello to
{{ appname }}@appspot.com*.
It's awesome, too.

38 | * Yes, you have to use a Jabber/XMPP client. Add {{ appname }}@appspot.com as a friend. 39 |

Powered by Superfeedr on Google App Engine - Take that bad code and make it better.

40 | 41 | 42 | -------------------------------------------------------------------------------- /templates/subscribe.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | Notifixlite 6 | 35 | 36 | 37 | {% for feed in feeds %} 38 |

subscribe to {{feed.title}}

39 | {% endfor %} 40 |

We use Subtome for subscriptions. Next time you see a subscribe button, you'll be able to pick {{appname}} to subscribe. 41 |

Powered by Superfeedr on Google App Engine - Take that bad code and make it better.

42 |
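A quick way to see the notification text that HubbubSubscriber.post builds is to run feedparser against a feed document locally. This is an illustrative sketch, not part of the repository; the minimal Atom document below is made up:

    import feedparser

    ATOM = """<?xml version='1.0' encoding='utf-8'?>
    <feed xmlns='http://www.w3.org/2005/Atom'>
      <title>Example feed</title>
      <entry>
        <title>Hello</title>
        <link href='http://example.com/hello'/>
      </entry>
    </feed>"""

    data = feedparser.parse(ATOM)
    feed_title = getattr(data.feed, 'title', '')
    for entry in data.entries:
        # same "'<feed title>' : <entry title>\n<entry link>" shape as main.py
        print "'" + feed_title + "' : " + entry.get('title', '') + "\n" + entry.get('link', '')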