61 | # or utidylib.
62 | TIDY_MARKUP = 0
63 |
64 | # List of Python interfaces for HTML Tidy, in order of preference. Only useful
65 | # if TIDY_MARKUP = 1
66 | PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
67 |
68 | # ---------- required modules (should come with any Python distribution) ----------
69 | import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
70 | try:
71 | from cStringIO import StringIO as _StringIO
72 | except:
73 | from StringIO import StringIO as _StringIO
74 |
75 | # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
76 |
77 | # gzip is included with most Python distributions, but may not be available if you compiled your own
78 | try:
79 | import gzip
80 | except:
81 | gzip = None
82 | try:
83 | import zlib
84 | except:
85 | zlib = None
86 |
87 | # If a real XML parser is available, feedparser will attempt to use it. feedparser has
88 | # been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
89 | # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
90 | # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
91 | try:
92 | import xml.sax
93 | xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
94 | from xml.sax.saxutils import escape as _xmlescape
95 | _XML_AVAILABLE = 1
96 | except:
97 | _XML_AVAILABLE = 0
98 | def _xmlescape(data):
99 | data = data.replace('&', '&amp;')
100 | data = data.replace('>', '&gt;')
101 | data = data.replace('<', '&lt;')
102 | return data
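# Illustrative behavior of the regex-fallback escaper above (added example):
# '&' is replaced first so the '&lt;'/'&gt;' substitutions aren't themselves
# re-escaped, e.g. _xmlescape('1 < 2 & 3 > 0') -> '1 &lt; 2 &amp; 3 &gt; 0'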
103 |
104 | # base64 support for Atom feeds that contain embedded binary data
105 | try:
106 | import base64, binascii
107 | except:
108 | base64 = binascii = None
109 |
110 | # cjkcodecs and iconv_codec provide support for more character encodings.
111 | # Both are available from http://cjkpython.i18n.org/
112 | try:
113 | import cjkcodecs.aliases
114 | except:
115 | pass
116 | try:
117 | import iconv_codec
118 | except:
119 | pass
120 |
121 | # chardet library auto-detects character encodings
122 | # Download from http://chardet.feedparser.org/
123 | try:
124 | import chardet
125 | if _debug:
126 | import chardet.constants
127 | chardet.constants._debug = 1
128 | except:
129 | chardet = None
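# Sketch of the chardet API this module relies on (added example; the exact
# confidence value is illustrative). chardet.detect() takes raw bytes and
# returns a dict with 'encoding' and 'confidence' keys:
#
#   >>> chardet.detect('\xc3\xa9tude')
#   {'encoding': 'utf-8', 'confidence': 0.9}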
130 |
131 | # ---------- don't touch these ----------
132 | class ThingsNobodyCaresAboutButMe(Exception): pass
133 | class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
134 | class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
135 | class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
136 | class UndeclaredNamespace(Exception): pass
137 |
138 | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
139 | sgmllib.special = re.compile('<!')

435 | # (inside _FeedParserMixin.unknown_starttag: re-emit inline XHTML content verbatim)
436 | if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
437 | self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
437 |
438 | # match namespaces
439 | if tag.find(':') <> -1:
440 | prefix, suffix = tag.split(':', 1)
441 | else:
442 | prefix, suffix = '', tag
443 | prefix = self.namespacemap.get(prefix, prefix)
444 | if prefix:
445 | prefix = prefix + '_'
446 |
447 | # special hack for better tracking of empty textinput/image elements in illformed feeds
448 | if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
449 | self.intextinput = 0
450 | if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
451 | self.inimage = 0
452 |
453 | # call special handler (if defined) or default handler
454 | methodname = '_start_' + prefix + suffix
455 | try:
456 | method = getattr(self, methodname)
457 | return method(attrsD)
458 | except AttributeError:
459 | return self.push(prefix + suffix, 1)
460 |
461 | def unknown_endtag(self, tag):
462 | if _debug: sys.stderr.write('end %s\n' % tag)
463 | # match namespaces
464 | if tag.find(':') <> -1:
465 | prefix, suffix = tag.split(':', 1)
466 | else:
467 | prefix, suffix = '', tag
468 | prefix = self.namespacemap.get(prefix, prefix)
469 | if prefix:
470 | prefix = prefix + '_'
471 |
472 | # call special handler (if defined) or default handler
473 | methodname = '_end_' + prefix + suffix
474 | try:
475 | method = getattr(self, methodname)
476 | method()
477 | except AttributeError:
478 | self.pop(prefix + suffix)
479 |
480 | # track inline content
481 | if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
482 | # element declared itself as escaped markup, but it isn't really
483 | self.contentparams['type'] = 'application/xhtml+xml'
484 | if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
485 | tag = tag.split(':')[-1]
486 | self.handle_data('</%s>' % tag, escape=0)
487 |
488 | # track xml:base and xml:lang going out of scope
489 | if self.basestack:
490 | self.basestack.pop()
491 | if self.basestack and self.basestack[-1]:
492 | self.baseuri = self.basestack[-1]
493 | if self.langstack:
494 | self.langstack.pop()
495 | if self.langstack: # and (self.langstack[-1] is not None):
496 | self.lang = self.langstack[-1]
497 |
498 | def handle_charref(self, ref):
499 | # called for each character reference, e.g. for '&#160;', ref will be '160'
500 | if not self.elementstack: return
501 | ref = ref.lower()
502 | if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
503 | text = '&#%s;' % ref
504 | else:
505 | if ref[0] == 'x':
506 | c = int(ref[1:], 16)
507 | else:
508 | c = int(ref)
509 | text = unichr(c).encode('utf-8')
510 | self.elementstack[-1][2].append(text)
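# Worked example of the rule above (added note): '&#160;' arrives here as
# ref='160' and appends unichr(160).encode('utf-8') == '\xc2\xa0', while
# '&#xA0;' is lowercased to 'xa0' and yields the same UTF-8 bytes.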
511 |
512 | def handle_entityref(self, ref):
513 | # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
514 | if not self.elementstack: return
515 | if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
516 | if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
517 | text = '&%s;' % ref
518 | else:
519 | # entity resolution graciously donated by Aaron Swartz
520 | def name2cp(k):
521 | import htmlentitydefs
522 | if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
523 | return htmlentitydefs.name2codepoint[k]
524 | k = htmlentitydefs.entitydefs[k]
525 | if k.startswith('&#') and k.endswith(';'):
526 | return int(k[2:-1]) # not in latin-1
527 | return ord(k)
528 | try: name2cp(ref)
529 | except KeyError: text = '&%s;' % ref
530 | else: text = unichr(name2cp(ref)).encode('utf-8')
531 | self.elementstack[-1][2].append(text)
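# For reference, the standard-library lookup that name2cp() builds on
# (added example):
#   >>> import htmlentitydefs
#   >>> htmlentitydefs.name2codepoint['copy']
#   169
#   >>> unichr(169).encode('utf-8')
#   '\xc2\xa9'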
532 |
533 | def handle_data(self, text, escape=1):
534 | # called for each block of plain text, i.e. outside of any tag and
535 | # not containing any character or entity references
536 | if not self.elementstack: return
537 | if escape and self.contentparams.get('type') == 'application/xhtml+xml':
538 | text = _xmlescape(text)
539 | self.elementstack[-1][2].append(text)
540 |
541 | def handle_comment(self, text):
542 | # called for each comment, e.g. <!--insert message here-->
543 | pass
544 |
545 | def handle_pi(self, text):
546 | # called for each processing instruction, e.g. <?instruction>
547 | pass
548 |
549 | def handle_decl(self, text):
550 | pass
551 |
552 | def parse_declaration(self, i):
553 | # override internal declaration handler to handle CDATA blocks
554 | if _debug: sys.stderr.write('entering parse_declaration\n')
555 | if self.rawdata[i:i+9] == '<![CDATA[':
556 | k = self.rawdata.find(']]>', i)
557 | if k == -1: k = len(self.rawdata)
558 | self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
559 | return k+3
560 | else:
561 | k = self.rawdata.find('>', i)
562 | return k+1
563 |
564 | def mapContentType(self, contentType):
565 | contentType = contentType.lower()
566 | if contentType == 'text':
567 | contentType = 'text/plain'
568 | elif contentType == 'html':
569 | contentType = 'text/html'
570 | elif contentType == 'xhtml':
571 | contentType = 'application/xhtml+xml'
572 | return contentType
573 |
574 | def trackNamespace(self, prefix, uri):
575 | loweruri = uri.lower()
576 | if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
577 | self.version = 'rss090'
578 | if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
579 | self.version = 'rss10'
580 | if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
581 | self.version = 'atom10'
582 | if loweruri.find('backend.userland.com/rss') <> -1:
583 | # match any backend.userland.com namespace
584 | uri = 'http://backend.userland.com/rss'
585 | loweruri = uri
586 | if self._matchnamespaces.has_key(loweruri):
587 | self.namespacemap[prefix] = self._matchnamespaces[loweruri]
588 | self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
589 | else:
590 | self.namespacesInUse[prefix or ''] = uri
591 |
592 | def resolveURI(self, uri):
593 | return _urljoin(self.baseuri or '', uri)
594 |
595 | def decodeEntities(self, element, data):
596 | return data
597 |
598 | def push(self, element, expectingText):
599 | self.elementstack.append([element, expectingText, []])
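# Added note: each elementstack entry is a 3-item list of
# [element_name, expectingText_flag, list_of_text_pieces]; pop() below joins
# the pieces back into one string and post-processes it (base64, relative
# URIs, sanitizing).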
600 |
601 | def pop(self, element, stripWhitespace=1):
602 | if not self.elementstack: return
603 | if self.elementstack[-1][0] != element: return
604 |
605 | element, expectingText, pieces = self.elementstack.pop()
606 | output = ''.join(pieces)
607 | if stripWhitespace:
608 | output = output.strip()
609 | if not expectingText: return output
610 |
611 | # decode base64 content
612 | if base64 and self.contentparams.get('base64', 0):
613 | try:
614 | output = base64.decodestring(output)
615 | except binascii.Error:
616 | pass
617 | except binascii.Incomplete:
618 | pass
619 |
620 | # resolve relative URIs
621 | if (element in self.can_be_relative_uri) and output:
622 | output = self.resolveURI(output)
623 |
624 | # decode entities within embedded markup
625 | if not self.contentparams.get('base64', 0):
626 | output = self.decodeEntities(element, output)
627 |
628 | # remove temporary cruft from contentparams
629 | try:
630 | del self.contentparams['mode']
631 | except KeyError:
632 | pass
633 | try:
634 | del self.contentparams['base64']
635 | except KeyError:
636 | pass
637 |
638 | # resolve relative URIs within embedded markup
639 | if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
640 | if element in self.can_contain_relative_uris:
641 | output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
642 |
643 | # sanitize embedded markup
644 | if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
645 | if element in self.can_contain_dangerous_markup:
646 | output = _sanitizeHTML(output, self.encoding)
647 |
648 | if self.encoding and type(output) != type(u''):
649 | try:
650 | output = unicode(output, self.encoding)
651 | except:
652 | pass
653 |
654 | # categories/tags/keywords/whatever are handled in _end_category
655 | if element == 'category':
656 | return output
657 |
658 | # store output in appropriate place(s)
659 | if self.inentry and not self.insource:
660 | if element == 'content':
661 | self.entries[-1].setdefault(element, [])
662 | contentparams = copy.deepcopy(self.contentparams)
663 | contentparams['value'] = output
664 | self.entries[-1][element].append(contentparams)
665 | elif element == 'link':
666 | self.entries[-1][element] = output
667 | if output:
668 | self.entries[-1]['links'][-1]['href'] = output
669 | else:
670 | if element == 'description':
671 | element = 'summary'
672 | self.entries[-1][element] = output
673 | if self.incontent:
674 | contentparams = copy.deepcopy(self.contentparams)
675 | contentparams['value'] = output
676 | self.entries[-1][element + '_detail'] = contentparams
677 | elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
678 | context = self._getContext()
679 | if element == 'description':
680 | element = 'subtitle'
681 | context[element] = output
682 | if element == 'link':
683 | context['links'][-1]['href'] = output
684 | elif self.incontent:
685 | contentparams = copy.deepcopy(self.contentparams)
686 | contentparams['value'] = output
687 | context[element + '_detail'] = contentparams
688 | return output
689 |
690 | def pushContent(self, tag, attrsD, defaultContentType, expectingText):
691 | self.incontent += 1
692 | self.contentparams = FeedParserDict({
693 | 'type': self.mapContentType(attrsD.get('type', defaultContentType)),
694 | 'language': self.lang,
695 | 'base': self.baseuri})
696 | self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
697 | self.push(tag, expectingText)
698 |
699 | def popContent(self, tag):
700 | value = self.pop(tag)
701 | self.incontent -= 1
702 | self.contentparams.clear()
703 | return value
704 |
705 | def _mapToStandardPrefix(self, name):
706 | colonpos = name.find(':')
707 | if colonpos <> -1:
708 | prefix = name[:colonpos]
709 | suffix = name[colonpos+1:]
710 | prefix = self.namespacemap.get(prefix, prefix)
711 | name = prefix + ':' + suffix
712 | return name
713 |
714 | def _getAttribute(self, attrsD, name):
715 | return attrsD.get(self._mapToStandardPrefix(name))
716 |
717 | def _isBase64(self, attrsD, contentparams):
718 | if attrsD.get('mode', '') == 'base64':
719 | return 1
720 | if self.contentparams['type'].startswith('text/'):
721 | return 0
722 | if self.contentparams['type'].endswith('+xml'):
723 | return 0
724 | if self.contentparams['type'].endswith('/xml'):
725 | return 0
726 | return 1
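# Decision table for _isBase64 (added, values illustrative): mode="base64"
# forces 1; 'text/*', '*/xml' and '*+xml' types return 0; anything else
# (e.g. 'image/png' inline in an Atom feed) is assumed to be base64.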
727 |
728 | def _itsAnHrefDamnIt(self, attrsD):
729 | href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
730 | if href:
731 | try:
732 | del attrsD['url']
733 | except KeyError:
734 | pass
735 | try:
736 | del attrsD['uri']
737 | except KeyError:
738 | pass
739 | attrsD['href'] = href
740 | return attrsD
741 |
742 | def _save(self, key, value):
743 | context = self._getContext()
744 | context.setdefault(key, value)
745 |
746 | def _start_rss(self, attrsD):
747 | versionmap = {'0.91': 'rss091u',
748 | '0.92': 'rss092',
749 | '0.93': 'rss093',
750 | '0.94': 'rss094'}
751 | if not self.version:
752 | attr_version = attrsD.get('version', '')
753 | version = versionmap.get(attr_version)
754 | if version:
755 | self.version = version
756 | elif attr_version.startswith('2.'):
757 | self.version = 'rss20'
758 | else:
759 | self.version = 'rss'
760 |
761 | def _start_dlhottitles(self, attrsD):
762 | self.version = 'hotrss'
763 |
764 | def _start_channel(self, attrsD):
765 | self.infeed = 1
766 | self._cdf_common(attrsD)
767 | _start_feedinfo = _start_channel
768 |
769 | def _cdf_common(self, attrsD):
770 | if attrsD.has_key('lastmod'):
771 | self._start_modified({})
772 | self.elementstack[-1][-1] = attrsD['lastmod']
773 | self._end_modified()
774 | if attrsD.has_key('href'):
775 | self._start_link({})
776 | self.elementstack[-1][-1] = attrsD['href']
777 | self._end_link()
778 |
779 | def _start_feed(self, attrsD):
780 | self.infeed = 1
781 | versionmap = {'0.1': 'atom01',
782 | '0.2': 'atom02',
783 | '0.3': 'atom03'}
784 | if not self.version:
785 | attr_version = attrsD.get('version')
786 | version = versionmap.get(attr_version)
787 | if version:
788 | self.version = version
789 | else:
790 | self.version = 'atom'
791 |
792 | def _end_channel(self):
793 | self.infeed = 0
794 | _end_feed = _end_channel
795 |
796 | def _start_image(self, attrsD):
797 | self.inimage = 1
798 | self.push('image', 0)
799 | context = self._getContext()
800 | context.setdefault('image', FeedParserDict())
801 |
802 | def _end_image(self):
803 | self.pop('image')
804 | self.inimage = 0
805 |
806 | def _start_textinput(self, attrsD):
807 | self.intextinput = 1
808 | self.push('textinput', 0)
809 | context = self._getContext()
810 | context.setdefault('textinput', FeedParserDict())
811 | _start_textInput = _start_textinput
812 |
813 | def _end_textinput(self):
814 | self.pop('textinput')
815 | self.intextinput = 0
816 | _end_textInput = _end_textinput
817 |
818 | def _start_author(self, attrsD):
819 | self.inauthor = 1
820 | self.push('author', 1)
821 | _start_managingeditor = _start_author
822 | _start_dc_author = _start_author
823 | _start_dc_creator = _start_author
824 | _start_itunes_author = _start_author
825 |
826 | def _end_author(self):
827 | self.pop('author')
828 | self.inauthor = 0
829 | self._sync_author_detail()
830 | _end_managingeditor = _end_author
831 | _end_dc_author = _end_author
832 | _end_dc_creator = _end_author
833 | _end_itunes_author = _end_author
834 |
835 | def _start_itunes_owner(self, attrsD):
836 | self.inpublisher = 1
837 | self.push('publisher', 0)
838 |
839 | def _end_itunes_owner(self):
840 | self.pop('publisher')
841 | self.inpublisher = 0
842 | self._sync_author_detail('publisher')
843 |
844 | def _start_contributor(self, attrsD):
845 | self.incontributor = 1
846 | context = self._getContext()
847 | context.setdefault('contributors', [])
848 | context['contributors'].append(FeedParserDict())
849 | self.push('contributor', 0)
850 |
851 | def _end_contributor(self):
852 | self.pop('contributor')
853 | self.incontributor = 0
854 |
855 | def _start_dc_contributor(self, attrsD):
856 | self.incontributor = 1
857 | context = self._getContext()
858 | context.setdefault('contributors', [])
859 | context['contributors'].append(FeedParserDict())
860 | self.push('name', 0)
861 |
862 | def _end_dc_contributor(self):
863 | self._end_name()
864 | self.incontributor = 0
865 |
866 | def _start_name(self, attrsD):
867 | self.push('name', 0)
868 | _start_itunes_name = _start_name
869 |
870 | def _end_name(self):
871 | value = self.pop('name')
872 | if self.inpublisher:
873 | self._save_author('name', value, 'publisher')
874 | elif self.inauthor:
875 | self._save_author('name', value)
876 | elif self.incontributor:
877 | self._save_contributor('name', value)
878 | elif self.intextinput:
879 | context = self._getContext()
880 | context['textinput']['name'] = value
881 | _end_itunes_name = _end_name
882 |
883 | def _start_width(self, attrsD):
884 | self.push('width', 0)
885 |
886 | def _end_width(self):
887 | value = self.pop('width')
888 | try:
889 | value = int(value)
890 | except:
891 | value = 0
892 | if self.inimage:
893 | context = self._getContext()
894 | context['image']['width'] = value
895 |
896 | def _start_height(self, attrsD):
897 | self.push('height', 0)
898 |
899 | def _end_height(self):
900 | value = self.pop('height')
901 | try:
902 | value = int(value)
903 | except:
904 | value = 0
905 | if self.inimage:
906 | context = self._getContext()
907 | context['image']['height'] = value
908 |
909 | def _start_url(self, attrsD):
910 | self.push('href', 1)
911 | _start_homepage = _start_url
912 | _start_uri = _start_url
913 |
914 | def _end_url(self):
915 | value = self.pop('href')
916 | if self.inauthor:
917 | self._save_author('href', value)
918 | elif self.incontributor:
919 | self._save_contributor('href', value)
920 | elif self.inimage:
921 | context = self._getContext()
922 | context['image']['href'] = value
923 | elif self.intextinput:
924 | context = self._getContext()
925 | context['textinput']['link'] = value
926 | _end_homepage = _end_url
927 | _end_uri = _end_url
928 |
929 | def _start_email(self, attrsD):
930 | self.push('email', 0)
931 | _start_itunes_email = _start_email
932 |
933 | def _end_email(self):
934 | value = self.pop('email')
935 | if self.inpublisher:
936 | self._save_author('email', value, 'publisher')
937 | elif self.inauthor:
938 | self._save_author('email', value)
939 | elif self.incontributor:
940 | self._save_contributor('email', value)
941 | _end_itunes_email = _end_email
942 |
943 | def _getContext(self):
944 | if self.insource:
945 | context = self.sourcedata
946 | elif self.inentry:
947 | context = self.entries[-1]
948 | else:
949 | context = self.feeddata
950 | return context
951 |
952 | def _save_author(self, key, value, prefix='author'):
953 | context = self._getContext()
954 | context.setdefault(prefix + '_detail', FeedParserDict())
955 | context[prefix + '_detail'][key] = value
956 | self._sync_author_detail()
957 |
958 | def _save_contributor(self, key, value):
959 | context = self._getContext()
960 | context.setdefault('contributors', [FeedParserDict()])
961 | context['contributors'][-1][key] = value
962 |
963 | def _sync_author_detail(self, key='author'):
964 | context = self._getContext()
965 | detail = context.get('%s_detail' % key)
966 | if detail:
967 | name = detail.get('name')
968 | email = detail.get('email')
969 | if name and email:
970 | context[key] = '%s (%s)' % (name, email)
971 | elif name:
972 | context[key] = name
973 | elif email:
974 | context[key] = email
975 | else:
976 | author = context.get(key)
977 | if not author: return
978 | emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
979 | if not emailmatch: return
980 | email = emailmatch.group(0)
981 | # probably a better way to do the following, but it passes all the tests
982 | author = author.replace(email, '')
983 | author = author.replace('()', '')
984 | author = author.strip()
985 | if author and (author[0] == '('):
986 | author = author[1:]
987 | if author and (author[-1] == ')'):
988 | author = author[:-1]
989 | author = author.strip()
990 | context.setdefault('%s_detail' % key, FeedParserDict())
991 | context['%s_detail' % key]['name'] = author
992 | context['%s_detail' % key]['email'] = email
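# Worked example (added, values illustrative): an RSS author of
# 'John Doe (john@example.com)' is split by the regex above into
# context['author_detail'] = {'name': 'John Doe', 'email': 'john@example.com'},
# while the first branch rebuilds context['author'] as 'name (email)' when a
# detail dict already exists.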
993 |
994 | def _start_subtitle(self, attrsD):
995 | self.pushContent('subtitle', attrsD, 'text/plain', 1)
996 | _start_tagline = _start_subtitle
997 | _start_itunes_subtitle = _start_subtitle
998 |
999 | def _end_subtitle(self):
1000 | self.popContent('subtitle')
1001 | _end_tagline = _end_subtitle
1002 | _end_itunes_subtitle = _end_subtitle
1003 |
1004 | def _start_rights(self, attrsD):
1005 | self.pushContent('rights', attrsD, 'text/plain', 1)
1006 | _start_dc_rights = _start_rights
1007 | _start_copyright = _start_rights
1008 |
1009 | def _end_rights(self):
1010 | self.popContent('rights')
1011 | _end_dc_rights = _end_rights
1012 | _end_copyright = _end_rights
1013 |
1014 | def _start_item(self, attrsD):
1015 | self.entries.append(FeedParserDict())
1016 | self.push('item', 0)
1017 | self.inentry = 1
1018 | self.guidislink = 0
1019 | id = self._getAttribute(attrsD, 'rdf:about')
1020 | if id:
1021 | context = self._getContext()
1022 | context['id'] = id
1023 | self._cdf_common(attrsD)
1024 | _start_entry = _start_item
1025 | _start_product = _start_item
1026 |
1027 | def _end_item(self):
1028 | self.pop('item')
1029 | self.inentry = 0
1030 | _end_entry = _end_item
1031 |
1032 | def _start_dc_language(self, attrsD):
1033 | self.push('language', 1)
1034 | _start_language = _start_dc_language
1035 |
1036 | def _end_dc_language(self):
1037 | self.lang = self.pop('language')
1038 | _end_language = _end_dc_language
1039 |
1040 | def _start_dc_publisher(self, attrsD):
1041 | self.push('publisher', 1)
1042 | _start_webmaster = _start_dc_publisher
1043 |
1044 | def _end_dc_publisher(self):
1045 | self.pop('publisher')
1046 | self._sync_author_detail('publisher')
1047 | _end_webmaster = _end_dc_publisher
1048 |
1049 | def _start_published(self, attrsD):
1050 | self.push('published', 1)
1051 | _start_dcterms_issued = _start_published
1052 | _start_issued = _start_published
1053 |
1054 | def _end_published(self):
1055 | value = self.pop('published')
1056 | self._save('published_parsed', _parse_date(value))
1057 | _end_dcterms_issued = _end_published
1058 | _end_issued = _end_published
1059 |
1060 | def _start_updated(self, attrsD):
1061 | self.push('updated', 1)
1062 | _start_modified = _start_updated
1063 | _start_dcterms_modified = _start_updated
1064 | _start_pubdate = _start_updated
1065 | _start_dc_date = _start_updated
1066 |
1067 | def _end_updated(self):
1068 | value = self.pop('updated')
1069 | parsed_value = _parse_date(value)
1070 | self._save('updated_parsed', parsed_value)
1071 | _end_modified = _end_updated
1072 | _end_dcterms_modified = _end_updated
1073 | _end_pubdate = _end_updated
1074 | _end_dc_date = _end_updated
1075 |
1076 | def _start_created(self, attrsD):
1077 | self.push('created', 1)
1078 | _start_dcterms_created = _start_created
1079 |
1080 | def _end_created(self):
1081 | value = self.pop('created')
1082 | self._save('created_parsed', _parse_date(value))
1083 | _end_dcterms_created = _end_created
1084 |
1085 | def _start_expirationdate(self, attrsD):
1086 | self.push('expired', 1)
1087 |
1088 | def _end_expirationdate(self):
1089 | self._save('expired_parsed', _parse_date(self.pop('expired')))
1090 |
1091 | def _start_cc_license(self, attrsD):
1092 | self.push('license', 1)
1093 | value = self._getAttribute(attrsD, 'rdf:resource')
1094 | if value:
1095 | self.elementstack[-1][2].append(value)
1096 | self.pop('license')
1097 |
1098 | def _start_creativecommons_license(self, attrsD):
1099 | self.push('license', 1)
1100 |
1101 | def _end_creativecommons_license(self):
1102 | self.pop('license')
1103 |
1104 | def _addTag(self, term, scheme, label):
1105 | context = self._getContext()
1106 | tags = context.setdefault('tags', [])
1107 | if (not term) and (not scheme) and (not label): return
1108 | value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1109 | if value not in tags:
1110 | tags.append(value)
1111 |
1112 | def _start_category(self, attrsD):
1113 | if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
1114 | term = attrsD.get('term')
1115 | scheme = attrsD.get('scheme', attrsD.get('domain'))
1116 | label = attrsD.get('label')
1117 | self._addTag(term, scheme, label)
1118 | self.push('category', 1)
1119 | _start_dc_subject = _start_category
1120 | _start_keywords = _start_category
1121 |
1122 | def _end_itunes_keywords(self):
1123 | for term in self.pop('itunes_keywords').split():
1124 | self._addTag(term, 'http://www.itunes.com/', None)
1125 |
1126 | def _start_itunes_category(self, attrsD):
1127 | self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1128 | self.push('category', 1)
1129 |
1130 | def _end_category(self):
1131 | value = self.pop('category')
1132 | if not value: return
1133 | context = self._getContext()
1134 | tags = context['tags']
1135 | if value and len(tags) and not tags[-1]['term']:
1136 | tags[-1]['term'] = value
1137 | else:
1138 | self._addTag(value, None, None)
1139 | _end_dc_subject = _end_category
1140 | _end_keywords = _end_category
1141 | _end_itunes_category = _end_category
1142 |
1143 | def _start_cloud(self, attrsD):
1144 | self._getContext()['cloud'] = FeedParserDict(attrsD)
1145 |
1146 | def _start_link(self, attrsD):
1147 | attrsD.setdefault('rel', 'alternate')
1148 | attrsD.setdefault('type', 'text/html')
1149 | attrsD = self._itsAnHrefDamnIt(attrsD)
1150 | if attrsD.has_key('href'):
1151 | attrsD['href'] = self.resolveURI(attrsD['href'])
1152 | expectingText = self.infeed or self.inentry or self.insource
1153 | context = self._getContext()
1154 | context.setdefault('links', [])
1155 | context['links'].append(FeedParserDict(attrsD))
1156 | if attrsD['rel'] == 'enclosure':
1157 | self._start_enclosure(attrsD)
1158 | if attrsD.has_key('href'):
1159 | expectingText = 0
1160 | if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1161 | context['link'] = attrsD['href']
1162 | else:
1163 | self.push('link', expectingText)
1164 | _start_producturl = _start_link
1165 |
1166 | def _end_link(self):
1167 | value = self.pop('link')
1168 | context = self._getContext()
1169 | if self.intextinput:
1170 | context['textinput']['link'] = value
1171 | if self.inimage:
1172 | context['image']['link'] = value
1173 | _end_producturl = _end_link
1174 |
1175 | def _start_guid(self, attrsD):
1176 | self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1177 | self.push('id', 1)
1178 |
1179 | def _end_guid(self):
1180 | value = self.pop('id')
1181 | self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1182 | if self.guidislink:
1183 | # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1184 | # and only if the item doesn't already have a link element
1185 | self._save('link', value)
1186 |
1187 | def _start_title(self, attrsD):
1188 | self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1189 | _start_dc_title = _start_title
1190 | _start_media_title = _start_title
1191 |
1192 | def _end_title(self):
1193 | value = self.popContent('title')
1194 | context = self._getContext()
1195 | if self.intextinput:
1196 | context['textinput']['title'] = value
1197 | elif self.inimage:
1198 | context['image']['title'] = value
1199 | _end_dc_title = _end_title
1200 | _end_media_title = _end_title
1201 |
1202 | def _start_description(self, attrsD):
1203 | context = self._getContext()
1204 | if context.has_key('summary'):
1205 | self._summaryKey = 'content'
1206 | self._start_content(attrsD)
1207 | else:
1208 | self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1209 |
1210 | def _start_abstract(self, attrsD):
1211 | self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1212 |
1213 | def _end_description(self):
1214 | if self._summaryKey == 'content':
1215 | self._end_content()
1216 | else:
1217 | value = self.popContent('description')
1218 | context = self._getContext()
1219 | if self.intextinput:
1220 | context['textinput']['description'] = value
1221 | elif self.inimage:
1222 | context['image']['description'] = value
1223 | self._summaryKey = None
1224 | _end_abstract = _end_description
1225 |
1226 | def _start_info(self, attrsD):
1227 | self.pushContent('info', attrsD, 'text/plain', 1)
1228 | _start_feedburner_browserfriendly = _start_info
1229 |
1230 | def _end_info(self):
1231 | self.popContent('info')
1232 | _end_feedburner_browserfriendly = _end_info
1233 |
1234 | def _start_generator(self, attrsD):
1235 | if attrsD:
1236 | attrsD = self._itsAnHrefDamnIt(attrsD)
1237 | if attrsD.has_key('href'):
1238 | attrsD['href'] = self.resolveURI(attrsD['href'])
1239 | self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1240 | self.push('generator', 1)
1241 |
1242 | def _end_generator(self):
1243 | value = self.pop('generator')
1244 | context = self._getContext()
1245 | if context.has_key('generator_detail'):
1246 | context['generator_detail']['name'] = value
1247 |
1248 | def _start_admin_generatoragent(self, attrsD):
1249 | self.push('generator', 1)
1250 | value = self._getAttribute(attrsD, 'rdf:resource')
1251 | if value:
1252 | self.elementstack[-1][2].append(value)
1253 | self.pop('generator')
1254 | self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1255 |
1256 | def _start_admin_errorreportsto(self, attrsD):
1257 | self.push('errorreportsto', 1)
1258 | value = self._getAttribute(attrsD, 'rdf:resource')
1259 | if value:
1260 | self.elementstack[-1][2].append(value)
1261 | self.pop('errorreportsto')
1262 |
1263 | def _start_summary(self, attrsD):
1264 | context = self._getContext()
1265 | if context.has_key('summary'):
1266 | self._summaryKey = 'content'
1267 | self._start_content(attrsD)
1268 | else:
1269 | self._summaryKey = 'summary'
1270 | self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
1271 | _start_itunes_summary = _start_summary
1272 |
1273 | def _end_summary(self):
1274 | if self._summaryKey == 'content':
1275 | self._end_content()
1276 | else:
1277 | self.popContent(self._summaryKey or 'summary')
1278 | self._summaryKey = None
1279 | _end_itunes_summary = _end_summary
1280 |
1281 | def _start_enclosure(self, attrsD):
1282 | attrsD = self._itsAnHrefDamnIt(attrsD)
1283 | self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
1284 | href = attrsD.get('href')
1285 | if href:
1286 | context = self._getContext()
1287 | if not context.get('id'):
1288 | context['id'] = href
1289 |
1290 | def _start_source(self, attrsD):
1291 | self.insource = 1
1292 |
1293 | def _end_source(self):
1294 | self.insource = 0
1295 | self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1296 | self.sourcedata.clear()
1297 |
1298 | def _start_content(self, attrsD):
1299 | self.pushContent('content', attrsD, 'text/plain', 1)
1300 | src = attrsD.get('src')
1301 | if src:
1302 | self.contentparams['src'] = src
1303 | self.push('content', 1)
1304 |
1305 | def _start_prodlink(self, attrsD):
1306 | self.pushContent('content', attrsD, 'text/html', 1)
1307 |
1308 | def _start_body(self, attrsD):
1309 | self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
1310 | _start_xhtml_body = _start_body
1311 |
1312 | def _start_content_encoded(self, attrsD):
1313 | self.pushContent('content', attrsD, 'text/html', 1)
1314 | _start_fullitem = _start_content_encoded
1315 |
1316 | def _end_content(self):
1317 | copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
1318 | value = self.popContent('content')
1319 | if copyToDescription:
1320 | self._save('description', value)
1321 | _end_body = _end_content
1322 | _end_xhtml_body = _end_content
1323 | _end_content_encoded = _end_content
1324 | _end_fullitem = _end_content
1325 | _end_prodlink = _end_content
1326 |
1327 | def _start_itunes_image(self, attrsD):
1328 | self.push('itunes_image', 0)
1329 | self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1330 | _start_itunes_link = _start_itunes_image
1331 |
1332 | def _end_itunes_block(self):
1333 | value = self.pop('itunes_block', 0)
1334 | self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1335 |
1336 | def _end_itunes_explicit(self):
1337 | value = self.pop('itunes_explicit', 0)
1338 | self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
1339 |
1340 | if _XML_AVAILABLE:
1341 | class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
1342 | def __init__(self, baseuri, baselang, encoding):
1343 | if _debug: sys.stderr.write('trying StrictFeedParser\n')
1344 | xml.sax.handler.ContentHandler.__init__(self)
1345 | _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1346 | self.bozo = 0
1347 | self.exc = None
1348 |
1349 | def startPrefixMapping(self, prefix, uri):
1350 | self.trackNamespace(prefix, uri)
1351 |
1352 | def startElementNS(self, name, qname, attrs):
1353 | namespace, localname = name
1354 | lowernamespace = str(namespace or '').lower()
1355 | if lowernamespace.find('backend.userland.com/rss') <> -1:
1356 | # match any backend.userland.com namespace
1357 | namespace = 'http://backend.userland.com/rss'
1358 | lowernamespace = namespace
1359 | if qname and qname.find(':') > 0:
1360 | givenprefix = qname.split(':')[0]
1361 | else:
1362 | givenprefix = None
1363 | prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1364 | if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
1365 | raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
1366 | if prefix:
1367 | localname = prefix + ':' + localname
1368 | localname = str(localname).lower()
1369 | if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
1370 |
1371 | # qname implementation is horribly broken in Python 2.1 (it
1372 | # doesn't report any), and slightly broken in Python 2.2 (it
1373 | # doesn't report the xml: namespace). So we match up namespaces
1374 | # with a known list first, and then possibly override them with
1375 | # the qnames the SAX parser gives us (if indeed it gives us any
1376 | # at all). Thanks to MatejC for helping me test this and
1377 | # tirelessly telling me that it didn't work yet.
1378 | attrsD = {}
1379 | for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
1380 | lowernamespace = (namespace or '').lower()
1381 | prefix = self._matchnamespaces.get(lowernamespace, '')
1382 | if prefix:
1383 | attrlocalname = prefix + ':' + attrlocalname
1384 | attrsD[str(attrlocalname).lower()] = attrvalue
1385 | for qname in attrs.getQNames():
1386 | attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
1387 | self.unknown_starttag(localname, attrsD.items())
1388 |
1389 | def characters(self, text):
1390 | self.handle_data(text)
1391 |
1392 | def endElementNS(self, name, qname):
1393 | namespace, localname = name
1394 | lowernamespace = str(namespace or '').lower()
1395 | if qname and qname.find(':') > 0:
1396 | givenprefix = qname.split(':')[0]
1397 | else:
1398 | givenprefix = ''
1399 | prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1400 | if prefix:
1401 | localname = prefix + ':' + localname
1402 | localname = str(localname).lower()
1403 | self.unknown_endtag(localname)
1404 |
1405 | def error(self, exc):
1406 | self.bozo = 1
1407 | self.exc = exc
1408 |
1409 | def fatalError(self, exc):
1410 | self.error(exc)
1411 | raise exc
1412 |
1413 | class _BaseHTMLProcessor(sgmllib.SGMLParser):
1414 | elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
1415 | 'img', 'input', 'isindex', 'link', 'meta', 'param']
1416 |
1417 | def __init__(self, encoding):
1418 | self.encoding = encoding
1419 | if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
1420 | sgmllib.SGMLParser.__init__(self)
1421 |
1422 | def reset(self):
1423 | self.pieces = []
1424 | sgmllib.SGMLParser.reset(self)
1425 |
1426 | def _shorttag_replace(self, match):
1427 | tag = match.group(1)
1428 | if tag in self.elements_no_end_tag:
1429 | return '<' + tag + ' />'
1430 | else:
1431 | return '<' + tag + '></' + tag + '>'
1432 |
1433 | def feed(self, data):
1434 | data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
1435 | #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
1436 | data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
1437 | data = data.replace('&#39;', "'")
1438 | data = data.replace('&#34;', '"')
1439 | if self.encoding and type(data) == type(u''):
1440 | data = data.encode(self.encoding)
1441 | sgmllib.SGMLParser.feed(self, data)
1442 |
1443 | def normalize_attrs(self, attrs):
1444 | # utility method to be called by descendants
1445 | attrs = [(k.lower(), v) for k, v in attrs]
1446 | attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1447 | return attrs
1448 |
1449 | def unknown_starttag(self, tag, attrs):
1450 | # called for each start tag
1451 | # attrs is a list of (attr, value) tuples
1452 | # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
1453 | if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
1454 | uattrs = []
1455 | # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
1456 | for key, value in attrs:
1457 | if type(value) != type(u''):
1458 | value = unicode(value, self.encoding)
1459 | uattrs.append((unicode(key, self.encoding), value))
1460 | strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
1461 | if tag in self.elements_no_end_tag:
1462 | self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
1463 | else:
1464 | self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
1465 |
1466 | def unknown_endtag(self, tag):
1467 | # called for each end tag, e.g. for </pre>, tag will be 'pre'
1468 | # Reconstruct the original end tag.
1469 | if tag not in self.elements_no_end_tag:
1470 | self.pieces.append("</%(tag)s>" % locals())
1471 |
1472 | def handle_charref(self, ref):
1473 | # called for each character reference, e.g. for '&#160;', ref will be '160'
1474 | # Reconstruct the original character reference.
1475 | self.pieces.append('&#%(ref)s;' % locals())
1476 |
1477 | def handle_entityref(self, ref):
1478 | # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
1479 | # Reconstruct the original entity reference.
1480 | self.pieces.append('&%(ref)s;' % locals())
1481 |
1482 | def handle_data(self, text):
1483 | # called for each block of plain text, i.e. outside of any tag and
1484 | # not containing any character or entity references
1485 | # Store the original text verbatim.
1486 | if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
1487 | self.pieces.append(text)
1488 |
1489 | def handle_comment(self, text):
1490 | # called for each HTML comment, e.g. <!-- insert message here -->
1491 | # Reconstruct the original comment.
1492 | self.pieces.append('<!--%(text)s-->' % locals())
1493 |
1494 | def handle_pi(self, text):
1495 | # called for each processing instruction, e.g. <?instruction>
1496 | # Reconstruct original processing instruction.
1497 | self.pieces.append('<?%(text)s>' % locals())
1498 |
1499 | def handle_decl(self, text):
1500 | # called for the DOCTYPE, if present, e.g.
1501 | # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1502 | # "http://www.w3.org/TR/html4/loose.dtd">
1503 | # Reconstruct original DOCTYPE
1504 | self.pieces.append('<!%(text)s>' % locals())
1505 |
1506 | _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
1507 | def _scan_name(self, i, declstartpos):
1508 | rawdata = self.rawdata
1509 | n = len(rawdata)
1510 | if i == n:
1511 | return None, -1
1512 | m = self._new_declname_match(rawdata, i)
1513 | if m:
1514 | s = m.group()
1515 | name = s.strip()
1516 | if (i + len(s)) == n:
1517 | return None, -1 # end of buffer
1518 | return name.lower(), m.end()
1519 | else:
1520 | self.handle_data(rawdata)
1521 | # self.updatepos(declstartpos, i)
1522 | return None, -1
1523 |
1524 | def output(self):
1525 | '''Return processed HTML as a single string'''
1526 | return ''.join([str(p) for p in self.pieces])
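# Minimal round-trip sketch (added example; output is normalized, e.g. <br>
# becomes <br />):
#   p = _BaseHTMLProcessor('utf-8')
#   p.feed('<p class="x">one &amp; two<br></p>')
#   p.output()  # -> '<p class="x">one &amp; two<br /></p>'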
1527 |
1528 | class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
1529 | def __init__(self, baseuri, baselang, encoding):
1530 | sgmllib.SGMLParser.__init__(self)
1531 | _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1532 |
1533 | def decodeEntities(self, element, data):
1534 | data = data.replace('&#60;', '&lt;')
1535 | data = data.replace('&#x3c;', '&lt;')
1536 | data = data.replace('&#62;', '&gt;')
1537 | data = data.replace('&#x3e;', '&gt;')
1538 | data = data.replace('&#38;', '&amp;')
1539 | data = data.replace('&#x26;', '&amp;')
1540 | data = data.replace('&#34;', '&quot;')
1541 | data = data.replace('&#x22;', '&quot;')
1542 | data = data.replace('&#39;', '&apos;')
1543 | data = data.replace('&#x27;', '&apos;')
1544 | if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
1545 | data = data.replace('&lt;', '<')
1546 | data = data.replace('&gt;', '>')
1547 | data = data.replace('&amp;', '&')
1548 | data = data.replace('&quot;', '"')
1549 | data = data.replace('&apos;', "'")
1550 | return data
1551 |
1552 | class _RelativeURIResolver(_BaseHTMLProcessor):
1553 | relative_uris = [('a', 'href'),
1554 | ('applet', 'codebase'),
1555 | ('area', 'href'),
1556 | ('blockquote', 'cite'),
1557 | ('body', 'background'),
1558 | ('del', 'cite'),
1559 | ('form', 'action'),
1560 | ('frame', 'longdesc'),
1561 | ('frame', 'src'),
1562 | ('iframe', 'longdesc'),
1563 | ('iframe', 'src'),
1564 | ('head', 'profile'),
1565 | ('img', 'longdesc'),
1566 | ('img', 'src'),
1567 | ('img', 'usemap'),
1568 | ('input', 'src'),
1569 | ('input', 'usemap'),
1570 | ('ins', 'cite'),
1571 | ('link', 'href'),
1572 | ('object', 'classid'),
1573 | ('object', 'codebase'),
1574 | ('object', 'data'),
1575 | ('object', 'usemap'),
1576 | ('q', 'cite'),
1577 | ('script', 'src')]
1578 |
1579 | def __init__(self, baseuri, encoding):
1580 | _BaseHTMLProcessor.__init__(self, encoding)
1581 | self.baseuri = baseuri
1582 |
1583 | def resolveURI(self, uri):
1584 | return _urljoin(self.baseuri, uri)
1585 |
1586 | def unknown_starttag(self, tag, attrs):
1587 | attrs = self.normalize_attrs(attrs)
1588 | attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
1589 | _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1590 |
1591 | def _resolveRelativeURIs(htmlSource, baseURI, encoding):
1592 | if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
1593 | p = _RelativeURIResolver(baseURI, encoding)
1594 | p.feed(htmlSource)
1595 | return p.output()
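# Usage sketch (added; values illustrative):
#   _resolveRelativeURIs('<a href="/about">x</a>', 'http://example.org/feed', 'utf-8')
#   # -> '<a href="http://example.org/about">x</a>'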
1596 |
1597 | class _HTMLSanitizer(_BaseHTMLProcessor):
1598 | acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
1599 | 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
1600 | 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
1601 | 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
1602 | 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
1603 | 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
1604 | 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
1605 | 'thead', 'tr', 'tt', 'u', 'ul', 'var']
1606 |
1607 | acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
1608 | 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
1609 | 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
1610 | 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
1611 | 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
1612 | 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
1613 | 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
1614 | 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
1615 | 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
1616 | 'usemap', 'valign', 'value', 'vspace', 'width']
1617 |
1618 | unacceptable_elements_with_end_tag = ['script', 'applet']
1619 |
1620 | def reset(self):
1621 | _BaseHTMLProcessor.reset(self)
1622 | self.unacceptablestack = 0
1623 |
1624 | def unknown_starttag(self, tag, attrs):
1625 | if not tag in self.acceptable_elements:
1626 | if tag in self.unacceptable_elements_with_end_tag:
1627 | self.unacceptablestack += 1
1628 | return
1629 | attrs = self.normalize_attrs(attrs)
1630 | attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
1631 | _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1632 |
1633 | def unknown_endtag(self, tag):
1634 | if not tag in self.acceptable_elements:
1635 | if tag in self.unacceptable_elements_with_end_tag:
1636 | self.unacceptablestack -= 1
1637 | return
1638 | _BaseHTMLProcessor.unknown_endtag(self, tag)
1639 |
1640 | def handle_pi(self, text):
1641 | pass
1642 |
1643 | def handle_decl(self, text):
1644 | pass
1645 |
1646 | def handle_data(self, text):
1647 | if not self.unacceptablestack:
1648 | _BaseHTMLProcessor.handle_data(self, text)
1649 |
1650 | def _sanitizeHTML(htmlSource, encoding):
1651 | p = _HTMLSanitizer(encoding)
1652 | p.feed(htmlSource)
1653 | data = p.output()
1654 | if TIDY_MARKUP:
1655 | # loop through list of preferred Tidy interfaces looking for one that's installed,
1656 | # then set up a common _tidy function to wrap the interface-specific API.
1657 | _tidy = None
1658 | for tidy_interface in PREFERRED_TIDY_INTERFACES:
1659 | try:
1660 | if tidy_interface == "uTidy":
1661 | from tidy import parseString as _utidy
1662 | def _tidy(data, **kwargs):
1663 | return str(_utidy(data, **kwargs))
1664 | break
1665 | elif tidy_interface == "mxTidy":
1666 | from mx.Tidy import Tidy as _mxtidy
1667 | def _tidy(data, **kwargs):
1668 | nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
1669 | return data
1670 | break
1671 | except:
1672 | pass
1673 | if _tidy:
1674 | utf8 = type(data) == type(u'')
1675 | if utf8:
1676 | data = data.encode('utf-8')
1677 | data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
1678 | if utf8:
1679 | data = unicode(data, 'utf-8')
1680 | if data.count('<body'):
1681 | data = data.split('<body', 1)[1]
1682 | if data.count('>'):
1683 | data = data.split('>', 1)[1]
1684 | if data.count('</body'):
1685 | data = data.split('</body', 1)[0]
1686 | data = data.strip().replace('\r\n', '\n')
1687 | return data

1727 | # (inside class _FeedURLHandler: retry a failed basic-auth request with
1728 | # digest auth; hack contributed by Aaron Swartz)
1729 | def http_error_401(self, req, fp, code, msg, headers):
1730 | host = urlparse.urlparse(req.get_full_url())[1]
1731 | try:
1732 | assert sys.version.split()[0] >= '2.3.3'
1733 | assert base64 != None
1734 | user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
1735 | realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
1736 | self.add_password(realm, host, user, passw)
1737 | retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
1738 | self.reset_retry_count()
1739 | return retry
1740 | except:
1741 | return self.http_error_default(req, fp, code, msg, headers)
1742 |
1743 | def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
1744 | """URL, filename, or string --> stream
1745 |
1746 | This function lets you define parsers that take any input source
1747 | (URL, pathname to local or network file, or actual data as a string)
1748 | and deal with it in a uniform manner. Returned object is guaranteed
1749 | to have all the basic stdio read methods (read, readline, readlines).
1750 | Just .close() the object when you're done with it.
1751 |
1752 | If the etag argument is supplied, it will be used as the value of an
1753 | If-None-Match request header.
1754 |
1755 | If the modified argument is supplied, it must be a tuple of 9 integers
1756 | as returned by gmtime() in the standard Python time module. This MUST
1757 | be in GMT (Greenwich Mean Time). The formatted date/time will be used
1758 | as the value of an If-Modified-Since request header.
1759 |
1760 | If the agent argument is supplied, it will be used as the value of a
1761 | User-Agent request header.
1762 |
1763 | If the referrer argument is supplied, it will be used as the value of a
1764 | Referer[sic] request header.
1765 |
1766 | If handlers is supplied, it is a list of handlers used to build a
1767 | urllib2 opener.
1768 | """
1769 |
1770 | if hasattr(url_file_stream_or_string, 'read'):
1771 | return url_file_stream_or_string
1772 |
1773 | if url_file_stream_or_string == '-':
1774 | return sys.stdin
1775 |
1776 | if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
1777 | if not agent:
1778 | agent = USER_AGENT
1779 | # test for inline user:password for basic auth
1780 | auth = None
1781 | if base64:
1782 | urltype, rest = urllib.splittype(url_file_stream_or_string)
1783 | realhost, rest = urllib.splithost(rest)
1784 | if realhost:
1785 | user_passwd, realhost = urllib.splituser(realhost)
1786 | if user_passwd:
1787 | url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
1788 | auth = base64.encodestring(user_passwd).strip()
1789 | # try to open with urllib2 (to use optional headers)
1790 | request = urllib2.Request(url_file_stream_or_string)
1791 | request.add_header('User-Agent', agent)
1792 | if etag:
1793 | request.add_header('If-None-Match', etag)
1794 | if modified:
1795 | # format into an RFC 1123-compliant timestamp. We can't use
1796 | # time.strftime() since the %a and %b directives can be affected
1797 | # by the current locale, but RFC 2616 states that dates must be
1798 | # in English.
1799 | short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
1800 | months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
1801 | request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
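# e.g. modified=(1994, 11, 6, 8, 49, 37, 6, 310, 0) produces the locale-proof
# header value 'Sun, 06 Nov 1994 08:49:37 GMT' (the classic RFC 2616 example
# date; values illustrative)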
1802 | if referrer:
1803 | request.add_header('Referer', referrer)
1804 | if gzip and zlib:
1805 | request.add_header('Accept-encoding', 'gzip, deflate')
1806 | elif gzip:
1807 | request.add_header('Accept-encoding', 'gzip')
1808 | elif zlib:
1809 | request.add_header('Accept-encoding', 'deflate')
1810 | else:
1811 | request.add_header('Accept-encoding', '')
1812 | if auth:
1813 | request.add_header('Authorization', 'Basic %s' % auth)
1814 | if ACCEPT_HEADER:
1815 | request.add_header('Accept', ACCEPT_HEADER)
1816 | request.add_header('A-IM', 'feed') # RFC 3229 support
1817 | opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
1818 | opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
1819 | try:
1820 | return opener.open(request)
1821 | finally:
1822 | opener.close() # JohnD
1823 |
1824 | # try to open with native open function (if url_file_stream_or_string is a filename)
1825 | try:
1826 | return open(url_file_stream_or_string)
1827 | except:
1828 | pass
1829 |
1830 | # treat url_file_stream_or_string as string
1831 | return _StringIO(str(url_file_stream_or_string))
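# Typical call (added example; URL and empty handler list are illustrative):
#   f = _open_resource('http://example.org/atom10.xml', None, None, USER_AGENT, None, [])
#   data = f.read()
#   f.close()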
1832 |
1833 | _date_handlers = []
1834 | def registerDateHandler(func):
1835 | '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
1836 | _date_handlers.insert(0, func)
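# Sketch of a user-supplied handler (added; the DD/MM/YYYY format is
# hypothetical). Handlers take a string and return a 9-tuple date in GMT;
# the registration call is left commented out here because registered
# handlers run before the built-in ones below.
def _parse_date_dmy_example(dateString):
    '''Parse DD/MM/YYYY dates, assumed to be midnight GMT (illustrative)'''
    m = re.match(r'^(\d{2})/(\d{2})/(\d{4})$', dateString)
    if not m: return
    day, month, year = map(int, m.groups())
    return (year, month, day, 0, 0, 0, 0, 0, 0)
#registerDateHandler(_parse_date_dmy_example)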
1837 |
1838 | # ISO-8601 date parsing routines written by Fazal Majid.
1839 | # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
1840 | # parser is beyond the scope of feedparser and would be a worthwhile addition
1841 | # to the Python library.
1842 | # A single regular expression cannot parse ISO 8601 date formats into groups
1843 | # as the standard is highly irregular (for instance is 030104 2003-01-04 or
1844 | # 0301-04-01), so we use templates instead.
1845 | # Please note the order in templates is significant because we need a
1846 | # greedy match.
1847 | _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
1848 | 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
1849 | '-YY-?MM', '-OOO', '-YY',
1850 | '--MM-?DD', '--MM',
1851 | '---DD',
1852 | 'CC', '']
1853 | _iso8601_re = [
1854 | tmpl.replace(
1855 | 'YYYY', r'(?P<year>\d{4})').replace(
1856 | 'YY', r'(?P<year>\d\d)').replace(
1857 | 'MM', r'(?P<month>[01]\d)').replace(
1858 | 'DD', r'(?P<day>[0123]\d)').replace(
1859 | 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
1860 | 'CC', r'(?P<century>\d\d$)')
1861 | + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
1862 | + r'(:(?P<second>\d{2}))?'
1863 | + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
1864 | for tmpl in _iso8601_tmpl]
1865 | del tmpl
1866 | _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
1867 | del regex
1868 | def _parse_date_iso8601(dateString):
1869 | '''Parse a variety of ISO-8601-compatible formats like 20040105'''
1870 | m = None
1871 | for _iso8601_match in _iso8601_matches:
1872 | m = _iso8601_match(dateString)
1873 | if m: break
1874 | if not m: return
1875 | if m.span() == (0, 0): return
1876 | params = m.groupdict()
1877 | ordinal = params.get('ordinal', 0)
1878 | if ordinal:
1879 | ordinal = int(ordinal)
1880 | else:
1881 | ordinal = 0
1882 | year = params.get('year', '--')
1883 | if not year or year == '--':
1884 | year = time.gmtime()[0]
1885 | elif len(year) == 2:
1886 | # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
1887 | year = 100 * int(time.gmtime()[0] / 100) + int(year)
1888 | else:
1889 | year = int(year)
1890 | month = params.get('month', '-')
1891 | if not month or month == '-':
1892 | # ordinals are NOT normalized by mktime, we simulate them
1893 | # by setting month=1, day=ordinal
1894 | if ordinal:
1895 | month = 1
1896 | else:
1897 | month = time.gmtime()[1]
1898 | month = int(month)
1899 | day = params.get('day', 0)
1900 | if not day:
1901 | # see above
1902 | if ordinal:
1903 | day = ordinal
1904 | elif params.get('century', 0) or \
1905 | params.get('year', 0) or params.get('month', 0):
1906 | day = 1
1907 | else:
1908 | day = time.gmtime()[2]
1909 | else:
1910 | day = int(day)
1911 | # special case of the century - is the first year of the 21st century
1912 | # 2000 or 2001 ? The debate goes on...
1913 | if 'century' in params.keys():
1914 | year = (int(params['century']) - 1) * 100 + 1
1915 | # in ISO 8601 most fields are optional
1916 | for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
1917 | if not params.get(field, None):
1918 | params[field] = 0
1919 | hour = int(params.get('hour', 0))
1920 | minute = int(params.get('minute', 0))
1921 | second = int(params.get('second', 0))
1922 | # weekday is normalized by mktime(), we can ignore it
1923 | weekday = 0
1924 | # daylight savings is complex, but not needed for feedparser's purposes
1925 | # as time zones, if specified, include mention of whether it is active
1926 | # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
1927 | # and most implementations have DST bugs
1928 | daylight_savings_flag = 0
1929 | tm = [year, month, day, hour, minute, second, weekday,
1930 | ordinal, daylight_savings_flag]
1931 | # ISO 8601 time zone adjustments
1932 | tz = params.get('tz')
1933 | if tz and tz != 'Z':
1934 | if tz[0] == '-':
1935 | tm[3] += int(params.get('tzhour', 0))
1936 | tm[4] += int(params.get('tzmin', 0))
1937 | elif tz[0] == '+':
1938 | tm[3] -= int(params.get('tzhour', 0))
1939 | tm[4] -= int(params.get('tzmin', 0))
1940 | else:
1941 | return None
1942 | # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
1943 | # which is guaranteed to normalize d/m/y/h/m/s.
1944 | # Many implementations have bugs, but we'll pretend they don't.
1945 | return time.localtime(time.mktime(tm))
1946 | registerDateHandler(_parse_date_iso8601)
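# Added note: the templates above are ordered longest-first so the greedy
# match wins, e.g. both _parse_date_iso8601('2003-12-31T10:14:55Z') and
# _parse_date_iso8601('20031231') return 9-tuple struct_time values.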
1947 |
1948 | # 8-bit date handling routines written by ytrewq1.
1949 | _korean_year = u'\ub144' # b3e2 in euc-kr
1950 | _korean_month = u'\uc6d4' # bff9 in euc-kr
1951 | _korean_day = u'\uc77c' # c0cf in euc-kr
1952 | _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
1953 | _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
1954 |
1955 | _korean_onblog_date_re = \
1956 | re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
1957 | (_korean_year, _korean_month, _korean_day))
1958 | _korean_nate_date_re = \
1959 | re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
1960 | (_korean_am, _korean_pm))
1961 | def _parse_date_onblog(dateString):
1962 | '''Parse a string according to the OnBlog 8-bit date format'''
1963 | m = _korean_onblog_date_re.match(dateString)
1964 | if not m: return
1965 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1966 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1967 | 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
1968 | 'zonediff': '+09:00'}
1969 | if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
1970 | return _parse_date_w3dtf(w3dtfdate)
1971 | registerDateHandler(_parse_date_onblog)
1972 |
1973 | def _parse_date_nate(dateString):
1974 | '''Parse a string according to the Nate 8-bit date format'''
1975 | m = _korean_nate_date_re.match(dateString)
1976 | if not m: return
1977 | hour = int(m.group(5))
1978 | ampm = m.group(4)
1979 | if (ampm == _korean_pm):
1980 | hour += 12
1981 | hour = str(hour)
1982 | if len(hour) == 1:
1983 | hour = '0' + hour
1984 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1985 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1986 | 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
1987 | 'zonediff': '+09:00'}
1988 | if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
1989 | return _parse_date_w3dtf(w3dtfdate)
1990 | registerDateHandler(_parse_date_nate)
1991 |
1992 | _mssql_date_re = \
1993 | re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
1994 | def _parse_date_mssql(dateString):
1995 | '''Parse a string according to the MS SQL date format'''
1996 | m = _mssql_date_re.match(dateString)
1997 | if not m: return
1998 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1999 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
2000 | 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
2001 | 'zonediff': '+09:00'}
2002 | if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
2003 | return _parse_date_w3dtf(w3dtfdate)
2004 | registerDateHandler(_parse_date_mssql)
2005 |
2006 | # Unicode strings for Greek date strings
2007 | _greek_months = \
2008 | { \
2009 | u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
2010 | u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
2011 | u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
2012 | u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
2013 | u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
2014 | u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
2015 | u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
2016 | u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
2017 | u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
2018 | u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
2019 | u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
2020 | u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
2021 | u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
2022 | u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
2023 | u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
2024 | u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
2025 | u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
2026 | u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
2027 | u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
2028 | }
2029 |
2030 | _greek_wdays = \
2031 | { \
2032 | u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
2033 | u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
2034 | u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
2035 | u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
2036 | u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
2037 | u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
2038 | u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
2039 | }
2040 |
2041 | _greek_date_format_re = \
2042 | re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
2043 |
2044 | def _parse_date_greek(dateString):
2045 | '''Parse a string according to a Greek 8-bit date format.'''
2046 | m = _greek_date_format_re.match(dateString)
2047 | if not m: return
2048 | try:
2049 | wday = _greek_wdays[m.group(1)]
2050 | month = _greek_months[m.group(3)]
2051 | except:
2052 | return
2053 | rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
2054 | {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
2055 | 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
2056 | 'zonediff': m.group(8)}
2057 | if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
2058 | return _parse_date_rfc822(rfc822date)
2059 | registerDateHandler(_parse_date_greek)
2060 |
2061 | # Unicode strings for Hungarian date strings
2062 | _hungarian_months = \
2063 | { \
2064 | u'janu\u00e1r': u'01', # e1 in iso-8859-2
2065 | u'febru\u00e1ri': u'02', # e1 in iso-8859-2
2066 | u'm\u00e1rcius': u'03', # e1 in iso-8859-2
2067 | u'\u00e1prilis': u'04', # e1 in iso-8859-2
2068 | u'm\u00e1ujus': u'05', # e1 in iso-8859-2
2069 | u'j\u00fanius': u'06', # fa in iso-8859-2
2070 | u'j\u00falius': u'07', # fa in iso-8859-2
2071 | u'augusztus': u'08',
2072 | u'szeptember': u'09',
2073 | u'okt\u00f3ber': u'10', # f3 in iso-8859-2
2074 | u'november': u'11',
2075 | u'december': u'12',
2076 | }
2077 |
2078 | _hungarian_date_format_re = \
2079 | re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
2080 |
2081 | def _parse_date_hungarian(dateString):
2082 | '''Parse a string according to a Hungarian 8-bit date format.'''
2083 | m = _hungarian_date_format_re.match(dateString)
2084 | if not m: return
2085 | try:
2086 | month = _hungarian_months[m.group(2)]
2087 | day = m.group(3)
2088 | if len(day) == 1:
2089 | day = '0' + day
2090 | hour = m.group(4)
2091 | if len(hour) == 1:
2092 | hour = '0' + hour
2093 | except:
2094 | return
2095 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
2096 | {'year': m.group(1), 'month': month, 'day': day,\
2097 | 'hour': hour, 'minute': m.group(5),\
2098 | 'zonediff': m.group(6)}
2099 | if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
2100 | return _parse_date_w3dtf(w3dtfdate)
2101 | registerDateHandler(_parse_date_hungarian)
2102 |
2103 | # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
2104 | # Drake and licensed under the Python license. Removed all range checking
2105 | # for month, day, hour, minute, and second, since mktime will normalize
2106 | # these later
2107 | def _parse_date_w3dtf(dateString):
2108 | def __extract_date(m):
2109 | year = int(m.group('year'))
2110 | if year < 100:
2111 | year = 100 * int(time.gmtime()[0] / 100) + int(year)
2112 | if year < 1000:
2113 | return 0, 0, 0
2114 | julian = m.group('julian')
2115 | if julian:
2116 | julian = int(julian)
2117 | month = julian / 30 + 1
2118 | day = julian % 30 + 1
2119 | jday = None
2120 | while jday != julian:
2121 | t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
2122 | jday = time.gmtime(t)[-2]
2123 | diff = abs(jday - julian)
2124 | if jday > julian:
2125 | if diff < day:
2126 | day = day - diff
2127 | else:
2128 | month = month - 1
2129 | day = 31
2130 | elif jday < julian:
2131 | if day + diff < 28:
2132 | day = day + diff
2133 | else:
2134 | month = month + 1
2135 | return year, month, day
2136 | month = m.group('month')
2137 | day = 1
2138 | if month is None:
2139 | month = 1
2140 | else:
2141 | month = int(month)
2142 | day = m.group('day')
2143 | if day:
2144 | day = int(day)
2145 | else:
2146 | day = 1
2147 | return year, month, day
2148 |
2149 | def __extract_time(m):
2150 | if not m:
2151 | return 0, 0, 0
2152 | hours = m.group('hours')
2153 | if not hours:
2154 | return 0, 0, 0
2155 | hours = int(hours)
2156 | minutes = int(m.group('minutes'))
2157 | seconds = m.group('seconds')
2158 | if seconds:
2159 | seconds = int(seconds)
2160 | else:
2161 | seconds = 0
2162 | return hours, minutes, seconds
2163 |
2164 | def __extract_tzd(m):
2165 | '''Return the Time Zone Designator as an offset in seconds from UTC.'''
2166 | if not m:
2167 | return 0
2168 | tzd = m.group('tzd')
2169 | if not tzd:
2170 | return 0
2171 | if tzd == 'Z':
2172 | return 0
2173 | hours = int(m.group('tzdhours'))
2174 | minutes = m.group('tzdminutes')
2175 | if minutes:
2176 | minutes = int(minutes)
2177 | else:
2178 | minutes = 0
2179 | offset = (hours*60 + minutes) * 60
2180 | if tzd[0] == '+':
2181 | return -offset
2182 | return offset
2183 |
2184 | __date_re = ('(?P<year>\d\d\d\d)'
2185 | '(?:(?P<dsep>-|)'
2186 | '(?:(?P<julian>\d\d\d)'
2187 | '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
2188 | __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
2189 | __tzd_rx = re.compile(__tzd_re)
2190 | __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
2191 | '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
2192 | + __tzd_re)
2193 | __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
2194 | __datetime_rx = re.compile(__datetime_re)
2195 | m = __datetime_rx.match(dateString)
2196 | if (m is None) or (m.group() != dateString): return
2197 | gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
2198 | if gmt[0] == 0: return
2199 | return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
2200 | registerDateHandler(_parse_date_w3dtf)
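# Illustrative usage (an annotation, not part of the original source):
#   _parse_date_w3dtf('2003-12-31T10:14:55Z')  # full date/time, UTC
#   _parse_date_w3dtf('2003-12-31')            # date only; time defaults to 00:00:00
#   _parse_date_w3dtf('2003-335')              # year plus ordinal ('julian') day
# Each call returns a 9-tuple in GMT, or None when the whole string fails to
# match __datetime_rx.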
2201 |
2202 | def _parse_date_rfc822(dateString):
2203 | '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
2204 | data = dateString.split()
2205 | if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
2206 | del data[0]
2207 | if len(data) == 4:
2208 | s = data[3]
2209 | i = s.find('+')
2210 | if i > 0:
2211 | data[3:] = [s[:i], s[i+1:]]
2212 | else:
2213 | data.append('')
2214 | dateString = " ".join(data)
2215 | if len(data) < 5:
2216 | dateString += ' 00:00:00 GMT'
2217 | tm = rfc822.parsedate_tz(dateString)
2218 | if tm:
2219 | return time.gmtime(rfc822.mktime_tz(tm))
2220 | # rfc822.py defines several time zones, but we define some extra ones.
2221 | # 'ET' is equivalent to 'EST', etc.
2222 | _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
2223 | rfc822._timezones.update(_additional_timezones)
2224 | registerDateHandler(_parse_date_rfc822)
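# Illustrative usage (an annotation, not part of the original source): the
# normalization above lets rfc822.parsedate_tz cope with slightly odd input:
#   _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')  # standard RFC 822
#   _parse_date_rfc822('01 Jan 2004')  # no time given; ' 00:00:00 GMT' is appended
# and the _additional_timezones update makes 'ET' parse the same as 'EST', etc.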
2225 |
2226 | def _parse_date(dateString):
2227 | '''Parses a variety of date formats into a 9-tuple in GMT'''
2228 | for handler in _date_handlers:
2229 | try:
2230 | date9tuple = handler(dateString)
2231 | if not date9tuple: continue
2232 | if len(date9tuple) != 9:
2233 | if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
2234 | raise ValueError
2235 | map(int, date9tuple)
2236 | return date9tuple
2237 | except Exception, e:
2238 | if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
2239 | pass
2240 | return None
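# Illustrative usage (an annotation, not part of the original source),
# assuming registerDateHandler prepends to _date_handlers as in stock
# feedparser, so handlers registered later are tried first:
#   registerDateHandler(my_custom_parser)  # hypothetical handler, tried first
#   _parse_date('2003-12-31T10:14:55Z')    # -> 9-tuple in GMT, or None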
2241 |
2242 | def _getCharacterEncoding(http_headers, xml_data):
2243 | '''Get the character encoding of the XML document
2244 |
2245 | http_headers is a dictionary
2246 | xml_data is a raw string (not Unicode)
2247 |
2248 | This is so much trickier than it sounds, it's not even funny.
2249 | According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
2250 | is application/xml, application/*+xml,
2251 | application/xml-external-parsed-entity, or application/xml-dtd,
2252 | the encoding given in the charset parameter of the HTTP Content-Type
2253 | takes precedence over the encoding given in the XML prefix within the
2254 | document, and defaults to 'utf-8' if neither are specified. But, if
2255 | the HTTP Content-Type is text/xml, text/*+xml, or
2256 | text/xml-external-parsed-entity, the encoding given in the XML prefix
2257 | within the document is ALWAYS IGNORED and only the encoding given in
2258 | the charset parameter of the HTTP Content-Type header should be
2259 | respected, and it defaults to 'us-ascii' if not specified.
2260 |
2261 | Furthermore, discussion on the atom-syntax mailing list with the
2262 | author of RFC 3023 leads me to the conclusion that any document
2263 | served with a Content-Type of text/* and no charset parameter
2264 | must be treated as us-ascii. (We now do this.) And also that it
2265 | must always be flagged as non-well-formed. (We now do this too.)
2266 |
2267 | If Content-Type is unspecified (input was local file or non-HTTP source)
2268 | or unrecognized (server just got it totally wrong), then go by the
2269 | encoding given in the XML prefix of the document and default to
2270 | 'iso-8859-1' as per the HTTP specification (RFC 2616).
2271 |
2272 | Then, assuming we didn't find a character encoding in the HTTP headers
2273 | (and the HTTP Content-type allowed us to look in the body), we need
2274 | to sniff the first few bytes of the XML data and try to determine
2275 | whether the encoding is ASCII-compatible. Section F of the XML
2276 | specification shows the way here:
2277 | http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
2278 |
2279 | If the sniffed encoding is not ASCII-compatible, we need to make it
2280 | ASCII compatible so that we can sniff further into the XML declaration
2281 | to find the encoding attribute, which will tell us the true encoding.
2282 |
2283 | Of course, none of this guarantees that we will be able to parse the
2284 | feed in the declared character encoding (assuming it was declared
2285 | correctly, which many are not). CJKCodecs and iconv_codec help a lot;
2286 | you should definitely install them if you can.
2287 | http://cjkpython.i18n.org/
2288 | '''
2289 |
2290 | def _parseHTTPContentType(content_type):
2291 | '''takes HTTP Content-Type header and returns (content type, charset)
2292 |
2293 | If no charset is specified, returns (content type, '')
2294 | If no content type is specified, returns ('', '')
2295 | Both return parameters are guaranteed to be lowercase strings
2296 | '''
2297 | content_type = content_type or ''
2298 | content_type, params = cgi.parse_header(content_type)
2299 | return content_type, params.get('charset', '').replace("'", '')
2300 |
2301 | sniffed_xml_encoding = ''
2302 | xml_encoding = ''
2303 | true_encoding = ''
2304 | http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
2305 | # Must sniff for non-ASCII-compatible character encodings before
2306 | # searching for XML declaration. This heuristic is defined in
2307 | # section F of the XML specification:
2308 | # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
2309 | try:
2310 | if xml_data[:4] == '\x4c\x6f\xa7\x94':
2311 | # EBCDIC
2312 | xml_data = _ebcdic_to_ascii(xml_data)
2313 | elif xml_data[:4] == '\x00\x3c\x00\x3f':
2314 | # UTF-16BE
2315 | sniffed_xml_encoding = 'utf-16be'
2316 | xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
2317 | elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
2318 | # UTF-16BE with BOM
2319 | sniffed_xml_encoding = 'utf-16be'
2320 | xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
2321 | elif xml_data[:4] == '\x3c\x00\x3f\x00':
2322 | # UTF-16LE
2323 | sniffed_xml_encoding = 'utf-16le'
2324 | xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
2325 | elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
2326 | # UTF-16LE with BOM
2327 | sniffed_xml_encoding = 'utf-16le'
2328 | xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
2329 | elif xml_data[:4] == '\x00\x00\x00\x3c':
2330 | # UTF-32BE
2331 | sniffed_xml_encoding = 'utf-32be'
2332 | xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
2333 | elif xml_data[:4] == '\x3c\x00\x00\x00':
2334 | # UTF-32LE
2335 | sniffed_xml_encoding = 'utf-32le'
2336 | xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
2337 | elif xml_data[:4] == '\x00\x00\xfe\xff':
2338 | # UTF-32BE with BOM
2339 | sniffed_xml_encoding = 'utf-32be'
2340 | xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
2341 | elif xml_data[:4] == '\xff\xfe\x00\x00':
2342 | # UTF-32LE with BOM
2343 | sniffed_xml_encoding = 'utf-32le'
2344 | xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
2345 | elif xml_data[:3] == '\xef\xbb\xbf':
2346 | # UTF-8 with BOM
2347 | sniffed_xml_encoding = 'utf-8'
2348 | xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
2349 | else:
2350 | # ASCII-compatible
2351 | pass
2352 | xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
2353 | except:
2354 | xml_encoding_match = None
2355 | if xml_encoding_match:
2356 | xml_encoding = xml_encoding_match.groups()[0].lower()
2357 | if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
2358 | xml_encoding = sniffed_xml_encoding
2359 | acceptable_content_type = 0
2360 | application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
2361 | text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
2362 | if (http_content_type in application_content_types) or \
2363 | (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
2364 | acceptable_content_type = 1
2365 | true_encoding = http_encoding or xml_encoding or 'utf-8'
2366 | elif (http_content_type in text_content_types) or \
2367 | (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
2368 | acceptable_content_type = 1
2369 | true_encoding = http_encoding or 'us-ascii'
2370 | elif http_content_type.startswith('text/'):
2371 | true_encoding = http_encoding or 'us-ascii'
2372 | elif http_headers and (not http_headers.has_key('content-type')):
2373 | true_encoding = xml_encoding or 'iso-8859-1'
2374 | else:
2375 | true_encoding = xml_encoding or 'utf-8'
2376 | return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
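# Worked examples of the decision table above (an annotation, not part of the
# original source); each row assumes an XML declaration of encoding='utf-16'
# and shows the resulting true_encoding:
#   Content-Type: application/atom+xml                -> 'utf-16' (XML decl honored)
#   Content-Type: application/xml; charset=iso-8859-2 -> 'iso-8859-2' (HTTP wins)
#   Content-Type: text/xml                            -> 'us-ascii' (XML decl ignored)
#   HTTP response with no Content-Type header         -> 'utf-16', else 'iso-8859-1'
#   local file (no HTTP headers at all)               -> 'utf-16', else 'utf-8'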
2377 |
2378 | def _toUTF8(data, encoding):
2379 | '''Changes an XML data stream on the fly to specify a new encoding
2380 |
2381 | data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
2382 | encoding is a string recognized by encodings.aliases
2383 | '''
2384 | if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
2385 | # strip Byte Order Mark (if present)
2386 | if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
2387 | if _debug:
2388 | sys.stderr.write('stripping BOM\n')
2389 | if encoding != 'utf-16be':
2390 | sys.stderr.write('trying utf-16be instead\n')
2391 | encoding = 'utf-16be'
2392 | data = data[2:]
2393 | elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
2394 | if _debug:
2395 | sys.stderr.write('stripping BOM\n')
2396 | if encoding != 'utf-16le':
2397 | sys.stderr.write('trying utf-16le instead\n')
2398 | encoding = 'utf-16le'
2399 | data = data[2:]
2400 | elif data[:3] == '\xef\xbb\xbf':
2401 | if _debug:
2402 | sys.stderr.write('stripping BOM\n')
2403 | if encoding != 'utf-8':
2404 | sys.stderr.write('trying utf-8 instead\n')
2405 | encoding = 'utf-8'
2406 | data = data[3:]
2407 | elif data[:4] == '\x00\x00\xfe\xff':
2408 | if _debug:
2409 | sys.stderr.write('stripping BOM\n')
2410 | if encoding != 'utf-32be':
2411 | sys.stderr.write('trying utf-32be instead\n')
2412 | encoding = 'utf-32be'
2413 | data = data[4:]
2414 | elif data[:4] == '\xff\xfe\x00\x00':
2415 | if _debug:
2416 | sys.stderr.write('stripping BOM\n')
2417 | if encoding != 'utf-32le':
2418 | sys.stderr.write('trying utf-32le instead\n')
2419 | encoding = 'utf-32le'
2420 | data = data[4:]
2421 | newdata = unicode(data, encoding)
2422 | if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
2423 | declmatch = re.compile('^<\?xml[^>]*?>')
2424 | newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
2425 | if declmatch.search(newdata):
2426 | newdata = declmatch.sub(newdecl, newdata)
2427 | else:
2428 | newdata = newdecl + u'\n' + newdata
2429 | return newdata.encode('utf-8')
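# Illustrative usage (an annotation, not part of the original source): a
# UTF-16LE document with a BOM comes out as UTF-8 with a fresh declaration:
#   _toUTF8('\xff\xfe<\x00f\x00e\x00e\x00d\x00/\x00>\x00', 'utf-16le')
#   # -> "<?xml version='1.0' encoding='utf-8'?>\n<feed/>"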
2430 |
2431 | def _stripDoctype(data):
2432 | '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2433 |
2434 | rss_version may be 'rss091n' or None
2435 | stripped_data is the same XML document, minus the DOCTYPE
2436 | '''
2437 | entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2438 | data = entity_pattern.sub('', data)
2439 | doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2440 | doctype_results = doctype_pattern.findall(data)
2441 | doctype = doctype_results and doctype_results[0] or ''
2442 | if doctype.lower().count('netscape'):
2443 | version = 'rss091n'
2444 | else:
2445 | version = None
2446 | data = doctype_pattern.sub('', data)
2447 | return version, data
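# Illustrative usage (an annotation, not part of the original source):
#   _stripDoctype('<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"><rss/>')
#   # -> ('rss091n', '<rss/>')
# Any other (or absent) DOCTYPE yields (None, stripped_data).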
2448 |
2449 | def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2450 | '''Parse a feed from a URL, file, stream, or string'''
2451 | result = FeedParserDict()
2452 | result['feed'] = FeedParserDict()
2453 | result['entries'] = []
2454 | if _XML_AVAILABLE:
2455 | result['bozo'] = 0
2456 | if type(handlers) == types.InstanceType:
2457 | handlers = [handlers]
2458 | try:
2459 | f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2460 | data = f.read()
2461 | except Exception, e:
2462 | result['bozo'] = 1
2463 | result['bozo_exception'] = e
2464 | data = ''
2465 | f = None
2466 |
2467 | # if feed is gzip-compressed, decompress it
2468 | if f and data and hasattr(f, 'headers'):
2469 | if gzip and f.headers.get('content-encoding', '') == 'gzip':
2470 | try:
2471 | data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2472 | except Exception, e:
2473 | # Some feeds claim to be gzipped but they're not, so
2474 | # we get garbage. Ideally, we should re-request the
2475 | # feed without the 'Accept-encoding: gzip' header,
2476 | # but we don't.
2477 | result['bozo'] = 1
2478 | result['bozo_exception'] = e
2479 | data = ''
2480 | elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2481 | try:
2482 | data = zlib.decompress(data, -zlib.MAX_WBITS)
2483 | except Exception, e:
2484 | result['bozo'] = 1
2485 | result['bozo_exception'] = e
2486 | data = ''
2487 |
2488 | # save HTTP headers
2489 | if hasattr(f, 'info'):
2490 | info = f.info()
2491 | result['etag'] = info.getheader('ETag')
2492 | last_modified = info.getheader('Last-Modified')
2493 | if last_modified:
2494 | result['modified'] = _parse_date(last_modified)
2495 | if hasattr(f, 'url'):
2496 | result['href'] = f.url
2497 | result['status'] = 200
2498 | if hasattr(f, 'status'):
2499 | result['status'] = f.status
2500 | if hasattr(f, 'headers'):
2501 | result['headers'] = f.headers.dict
2502 | if hasattr(f, 'close'):
2503 | f.close()
2504 |
2505 | # there are four encodings to keep track of:
2506 | # - http_encoding is the encoding declared in the Content-Type HTTP header
2507 | # - xml_encoding is the encoding declared in the <?xml declaration
2673 | #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support; changed
2674 | # project name
2675 | #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
2676 | # removed unnecessary urllib code -- urllib2 should always be available anyway;
2677 | # return actual url, status, and full HTTP headers (as result['url'],
2678 | # result['status'], and result['headers']) if parsing a remote feed over HTTP --
2679 | # this should pass all the HTTP tests at ;
2680 | # added the latest namespace-of-the-week for RSS 2.0
2681 | #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
2682 | # User-Agent (otherwise urllib2 sends two, which confuses some servers)
2683 | #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
2684 | # inline and as used in some RSS 2.0 feeds
2685 | #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
2686 | # textInput, and also to return the character encoding (if specified)
2687 | #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
2688 | # nested divs within content (JohnD); fixed missing sys import (JohanS);
2689 | # fixed regular expression to capture XML character encoding (Andrei);
2690 | # added support for Atom 0.3-style links; fixed bug with textInput tracking;
2691 | # added support for cloud (MartijnP); added support for multiple
2692 | # category/dc:subject (MartijnP); normalize content model: 'description' gets
2693 | # description (which can come from description, summary, or full content if no
2694 | # description), 'content' gets dict of base/language/type/value (which can come
2695 | # from content:encoded, xhtml:body, content, or fullitem);
2696 | # fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
2697 | # tracking; fixed bug tracking unknown tags; fixed bug tracking content when
2698 | # element is not in default namespace (like Pocketsoap feed);
2699 | # resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
2700 | # wfw:commentRSS; resolve relative URLs within embedded HTML markup in
2701 | # description, xhtml:body, content, content:encoded, title, subtitle,
2702 | # summary, info, tagline, and copyright; added support for pingback and
2703 | # trackback namespaces
2704 | #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
2705 | # namespaces, as opposed to 2.6 when I said I did but didn't really;
2706 | # sanitize HTML markup within some elements; added mxTidy support (if
2707 | # installed) to tidy HTML markup within some elements; fixed indentation
2708 | # bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
2709 | # (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
2710 | # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
2711 | # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
2712 | # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
2713 | #2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
2714 | # leak not closing url opener (JohnD); added dc:publisher support (MarekK);
2715 | # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
2716 | #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
2717 | # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
2718 | # fixed relative URI processing for guid (skadz); added ICBM support; added
2719 | # base64 support
2720 | #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
2721 | # blogspot.com sites); added _debug variable
2722 | #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
2723 | #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
2724 | # added several new supported namespaces; fixed bug tracking naked markup in
2725 | # description; added support for enclosure; added support for source; re-added
2726 | # support for cloud which got dropped somehow; added support for expirationDate
2727 | #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
2728 | # xml:base URI, one for documents that don't define one explicitly and one for
2729 | # documents that define an outer and an inner xml:base that goes out of scope
2730 | # before the end of the document
2731 | #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
2732 | #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
2733 | # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
2734 | # added support for creativeCommons:license and cc:license; added support for
2735 | # full Atom content model in title, tagline, info, copyright, summary; fixed bug
2736 | # with gzip encoding (not always telling server we support it when we do)
2737 | #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
2738 | # (dictionary of 'name', 'url', 'email'); map author to author_detail if author
2739 | # contains name + email address
2740 | #3.0b8 - 1/28/2004 - MAP - added support for contributor
2741 | #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
2742 | # support for summary
2743 | #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
2744 | # xml.utils.iso8601
2745 | #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
2746 | # dangerous markup; fiddled with decodeEntities (not right); liberalized
2747 | # date parsing even further
2748 | #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
2749 | # added support to Atom 0.2 subtitle; added support for Atom content model
2750 | # in copyright; better sanitizing of dangerous HTML elements with end tags
2751 | # (script, frameset)
2752 | #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
2753 | # etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
2754 | #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
2755 | # Python 2.1
2756 | #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
2757 | # fixed bug capturing author and contributor URL; fixed bug resolving relative
2758 | # links in author and contributor URL; fixed bug resolving relative links in
2759 | # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
2760 | # namespace tests, and included them permanently in the test suite with his
2761 | # permission; fixed namespace handling under Python 2.1
2762 | #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
2763 | #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
2764 | #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
2765 | # use libxml2 (if available)
2766 | #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
2767 | # name was in parentheses; removed ultra-problematic mxTidy support; patch to
2768 | # workaround crash in PyXML/expat when encountering invalid entities
2769 | # (MarkMoraes); support for textinput/textInput
2770 | #3.0b20 - 4/7/2004 - MAP - added CDF support
2771 | #3.0b21 - 4/14/2004 - MAP - added Hot RSS support
2772 | #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
2773 | # results dict; changed results dict to allow getting values with results.key
2774 | # as well as results[key]; work around embedded illformed HTML with half
2775 | # a DOCTYPE; work around malformed Content-Type header; if character encoding
2776 | # is wrong, try several common ones before falling back to regexes (if this
2777 | # works, bozo_exception is set to CharacterEncodingOverride); fixed character
2778 | # encoding issues in BaseHTMLProcessor by tracking encoding and converting
2779 | # from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
2780 | # convert each value in results to Unicode (if possible), even if using
2781 | # regex-based parsing
2782 | #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
2783 | # high-bit characters in attributes in embedded HTML in description (thanks
2784 | # Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
2785 | # FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
2786 | # about a mapped key
2787 | #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
2788 | # results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
2789 | # cause the same encoding to be tried twice (even if it failed the first time);
2790 | # fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
2791 | # better textinput and image tracking in illformed RSS 1.0 feeds
2792 | #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
2793 | # my blink tag tests
2794 | #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
2795 | # failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
2796 | # duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
2797 | # added support for image; refactored parse() fallback logic to try other
2798 | # encodings if SAX parsing fails (previously it would only try other encodings
2799 | # if re-encoding failed); remove unichr madness in normalize_attrs now that
2800 | # we're properly tracking encoding in and out of BaseHTMLProcessor; set
2801 | # feed.language from root-level xml:lang; set entry.id from rdf:about;
2802 | # send Accept header
2803 | #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
2804 | # iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
2805 | # windows-1252); fixed regression that could cause the same encoding to be
2806 | # tried twice (even if it failed the first time)
2807 | #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
2808 | # recover from malformed content-type header parameter with no equals sign
2809 | # ('text/xml; charset:iso-8859-1')
2810 | #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
2811 | # to Unicode equivalents in illformed feeds (aaronsw); added and
2812 | # passed tests for converting character entities to Unicode equivalents
2813 | # in illformed feeds (aaronsw); test for valid parsers when setting
2814 | # XML_AVAILABLE; make version and encoding available when server returns
2815 | # a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
2816 | # digest auth or proxy support); add code to parse username/password
2817 | # out of url and send as basic authentication; expose downloading-related
2818 | # exceptions in bozo_exception (aaronsw); added __contains__ method to
2819 | # FeedParserDict (aaronsw); added publisher_detail (aaronsw)
2820 | #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
2821 | # convert feed to UTF-8 before passing to XML parser; completely revamped
2822 | # logic for determining character encoding and attempting XML parsing
2823 | # (much faster); increased default timeout to 20 seconds; test for presence
2824 | # of Location header on redirects; added tests for many alternate character
2825 | # encodings; support various EBCDIC encodings; support UTF-16BE and
2826 | # UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
2827 | # UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
2828 | # XML parsers are available; added support for 'Content-encoding: deflate';
2829 | # send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
2830 | # are available
2831 | #3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
2832 | # problem tracking xml:base and xml:lang if element declares it, child
2833 | # doesn't, first grandchild redeclares it, and second grandchild doesn't;
2834 | # refactored date parsing; defined public registerDateHandler so callers
2835 | # can add support for additional date formats at runtime; added support
2836 | # for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
2837 | # zopeCompatibilityHack() which turns FeedParserDict into a regular
2838 | # dictionary, required for Zope compatibility, and also makes command-
2839 | # line debugging easier because pprint module formats real dictionaries
2840 | # better than dictionary-like objects; added NonXMLContentType exception,
2841 | # which is stored in bozo_exception when a feed is served with a non-XML
2842 | # media type such as 'text/plain'; respect Content-Language as default
2843 | # language if no xml:lang is present; cloud dict is now FeedParserDict;
2844 | # generator dict is now FeedParserDict; better tracking of xml:lang,
2845 | # including support for xml:lang='' to unset the current language;
2846 | # recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
2847 | # namespace; don't overwrite final status on redirects (scenarios:
2848 | # redirecting to a URL that returns 304, redirecting to a URL that
2849 | # redirects to another URL with a different type of redirect); add
2850 | # support for HTTP 303 redirects
2851 | #4.0 - MAP - support for relative URIs in xml:base attribute; fixed
2852 | # encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
2853 | # support for Atom 1.0; support for iTunes extensions; new 'tags' for
2854 | # categories/keywords/etc. as array of dict
2855 | # {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
2856 | # terminology; parse RFC 822-style dates with no time; lots of other
2857 | # bug fixes
2858 | #4.1 - MAP - removed socket timeout; added support for chardet library
--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
1 | indexes:
2 |
3 | - kind: Subscription
4 | properties:
5 | - name: jid
6 | - name: created_at
7 | direction: desc
8 |
9 | - kind: Subscription
10 | properties:
11 | - name: jid
12 | - name: feed
13 | direction: asc
14 |
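# Annotation (not autogenerated): the second index above backs the composite
# query issued by list_command and unsubscribe_command in main.py, i.e.
#   Subscription.all().filter("jid =", subscriber).order("feed")
# The first (jid + created_at desc) covers jid-scoped queries ordered by
# recency, should any handler need them.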
15 | # AUTOGENERATED
16 |
17 | # This index.yaml is automatically updated whenever the dev_appserver
18 | # detects that a new type of query is run. If you want to manage the
19 | # index.yaml file manually, remove the above marker line (the line
20 | # saying "# AUTOGENERATED"). If you want to manage some indexes
21 | # manually, move them above the marker line. The index.yaml file is
22 | # automatically uploaded to the admin console when you next deploy
23 | # your application using appcfg.py.
24 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import hashlib
3 | import base64
4 | import urllib
5 | import logging
6 | import feedparser
7 | from google.appengine.api import xmpp
8 | from google.appengine.ext import webapp
9 | from google.appengine.ext.webapp.util import run_wsgi_app
10 | from google.appengine.ext.webapp import xmpp_handlers
11 | from google.appengine.ext.webapp import template
12 | from google.appengine.ext import db
13 | from google.appengine.api import urlfetch
14 | from google.appengine.runtime import apiproxy_errors
15 | from google.appengine.api.app_identity import get_application_id
16 | from google.appengine.api import memcache
17 |
18 | import extractlinks
19 | from extractlinks import LinkExtractor
20 |
21 | SUPERFEEDR_LOGIN = ""
22 | SUPERFEEDR_PASSWORD = ""
23 | appname = get_application_id()
24 |
25 | ##
26 | # the function that sends subscriptions/unsubscriptions to Superfeedr
27 | def superfeedr(mode, subscription):
28 | post_data = {
29 | 'hub.mode' : mode,
30 | 'hub.callback' : "http://" + appname + ".appspot.com/hubbub/" + subscription.key().name(),
31 | 'hub.topic' : subscription.feed,
32 | 'hub.verify' : 'async',
33 | 'hub.verify_token' : '',
34 | }
35 | base64string = base64.encodestring('%s:%s' % (SUPERFEEDR_LOGIN, SUPERFEEDR_PASSWORD))[:-1]
36 | form_data = urllib.urlencode(post_data)
37 | result = urlfetch.fetch(url="http://superfeedr.com/hubbub",
38 | payload=form_data,
39 | method=urlfetch.POST,
40 | headers={"Authorization": "Basic "+ base64string, 'Content-Type': 'application/x-www-form-urlencoded'},
41 | deadline=10)
42 | # logging.info('Result of %s to %s => %s (%d)',mode, subscription.feed, result.content, result.status_code )
43 |
44 | return result
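##
# Annotation (not part of the original source): Superfeedr answers 204 when
# the (un)subscription is applied immediately and 202 when the 'async'
# verification is still pending; subscribe_command below branches on exactly
# those two status codes, e.g.
#   result = superfeedr("subscribe", subscription)
#   if result.status_code in (202, 204): ...  # accepted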
45 |
46 |
47 | ##
48 | # The subscription model that matches a feed and a jid.
49 | class Subscription(db.Model):
50 | feed = db.LinkProperty(required=True)
51 | jid = db.StringProperty(required=True)
52 | created_at = db.DateTimeProperty(required=True, auto_now_add=True)
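# Annotation (not part of the original source): each Subscription is keyed by
# key_name = hashlib.sha224(feed + jid).hexdigest() (see subscribe_command), so
# the /hubbub/<key_name> callback URL doubles as a per-user, per-feed secret
# and HubbubSubscriber can look it up with a single get_by_key_name() call.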
53 |
54 | ##
55 | # The subscribe page. Useful for those who want to subscribe easily via a web page
56 | class SubscribePage(webapp.RequestHandler):
57 |
58 | def get(self):
59 | feeds = []
60 | if self.request.get("resource"):
61 | feeds = memcache.get(self.request.get("resource"))
62 | if feeds is not None:
63 | # good
64 | logging.debug("Memcache hit.")
65 | else:
66 | logging.debug("Memcache miss.")
67 | try:
68 | result = urlfetch.fetch(url=self.request.get("resource"), deadline=10)
69 | parser = LinkExtractor()
70 | parser.set_base_url(self.request.get("resource"))
71 | parser.feed(result.content)
72 | if parser.links:
73 | feeds = parser.links
74 | else:
75 | feeds = []
76 |
77 | if not feeds:
78 | # Let's check if by any chance this is actually not a feed?
79 | data = feedparser.parse(result.content)
80 | mimeType = "application/atom+xml"
81 | href = self.request.get("resource")
82 | if data.version.startswith("atom"): # avoids re.match; the re module isn't imported here
83 | mimeType = "application/atom+xml"
84 | feeds = [{'title': data.feed.title, 'rel': 'self', 'type': mimeType, 'href': href}]
85 |
86 | except:
87 | feeds = []
88 |
89 | if not memcache.set(self.request.get("resource"), feeds, 86400):
90 | logging.error("Memcache set failed.")
91 | else:
92 | logging.debug("Memcache set.")
93 |
94 | self.response.out.write(template.render(os.path.join(os.path.dirname(__file__), 'templates', "subscribe.html"), {'appname': appname, 'feeds': feeds}))
95 |
96 | ##
97 | # The web app interface
98 | class MainPage(webapp.RequestHandler):
99 |
100 | def get(self):
101 | self.redirect('http://blog.superfeedr.com/notifixlight/')
102 |
103 | ##
104 | # The HubbubSubscriber
105 | class HubbubSubscriber(webapp.RequestHandler):
106 |
107 | ##
108 | # Called upon notification
109 | def post(self, feed_sekret):
110 | subscription = None
111 | try: subscription = Subscription.get_by_key_name(feed_sekret)
112 | except apiproxy_errors.OverQuotaError, error_message:
113 | logging.error(error_message)
114 | pass
115 | if(subscription == None):
116 | if self.request.get("hub.mode") == "unsubscribe" :
117 | # Let superfeedr unsubscribe this.
118 | # Even though we have no record of it.
119 | self.response.set_status(200)
120 | self.response.out.write(self.request.get('hub.challenge'))
121 | else:
122 | self.response.set_status(404)
123 | self.response.out.write("Sorry, no feed.");
124 | else:
125 | body = self.request.body.decode('utf-8')
126 | data = feedparser.parse(self.request.body)
127 | logging.info('Found %d entries in %s', len(data.entries), subscription.feed)
128 | try:
129 | feed_title = data.feed.title
130 | except AttributeError:
131 | feed_title = ''
132 | for entry in data.entries:
133 | link = entry.get('link', '')
134 | title = entry.get('title', '')
135 | logging.info('Found entry with title = "%s", '
136 | 'link = "%s"',
137 | title, link)
138 | user_address = subscription.jid
139 | msg = "'" + feed_title + "' : " + title + "\n" + link
140 | status_code = xmpp.send_message(user_address, msg)
141 | self.response.set_status(200)
142 | self.response.out.write("Alright. Saved.");
143 |
144 | def get(self, feed_sekret):
145 | subscription = None
146 | try: subscription = Subscription.get_by_key_name(feed_sekret)
147 | except apiproxy_errors.OverQuotaError, error_message:
148 | logging.error(error_message)
149 | pass
150 | if(subscription == None):
151 | if self.request.get("hub.mode") == "unsubscribe" :
152 | # Let superfeedr unsubscribe this.
153 | # Even though we have no record of it.
154 | self.response.set_status(200)
155 | self.response.out.write(self.request.get('hub.challenge'))
156 | else:
157 | self.response.set_status(404)
158 | self.response.out.write("Sorry, no feed.");
159 | else:
160 | # Let's confirm to the subscriber that he'll get notifications for this feed.
161 | user_address = subscription.jid
162 | if(self.request.get("hub.mode") == "subscribe"):
163 | msg = "You're now subscribed to " + subscription.feed
164 | xmpp.send_message(user_address, msg)
165 | self.response.out.write(self.request.get('hub.challenge'))
166 | self.response.set_status(200)
167 | elif(self.request.get("hub.mode") == "unsubscribe"):
168 | msg = "You're no longer subscribed to " + subscription.feed
169 | xmpp.send_message(user_address, msg)
170 | self.response.out.write(self.request.get('hub.challenge'))
171 | self.response.set_status(200)
172 |
173 | ##
174 | # The XMPP App interface
175 | class XMPPHandler(xmpp_handlers.CommandHandler):
176 |
177 | # Asking to subscribe to a feed
178 | def subscribe_command(self, message=None):
179 | message = xmpp.Message(self.request.POST)
180 | subscriber = message.sender.rpartition("/")[0]
181 | subscription = Subscription(key_name=hashlib.sha224(message.arg + subscriber).hexdigest(), feed=message.arg, jid=subscriber)
182 | subscription.put() # saves the subscription
183 | result = superfeedr("subscribe", subscription)
184 | if result.status_code == 204:
185 | # logging.info("Subscription success! %s", message.arg)
186 | message.reply("Successfully subscribed to " + message.arg + "!")
187 | elif result.status_code == 202:
188 | message.reply("Subscribing to " + message.arg + ", you should get a confirmation soon.")
189 | else:
190 | message.reply("Could not subscribe to " + message.arg + ", looks like AppEngine got a small glitch. Please try again!")
191 | logging.error("Sorry, couldn't subscribe to %s (status %s): %s", message.arg, result.status_code, result.content)
192 |
193 | ##
194 | # Asking to unsubscribe to a feed
195 | def unsubscribe_command(self, message=None):
196 | message = xmpp.Message(self.request.POST)
197 | subscriber = message.sender.rpartition("/")[0]
198 | if message.arg == "all":
199 | query = Subscription.all().filter("jid =",subscriber).order("feed")
200 | subscriptions = query.fetch(query.count() + 1)
201 | # batch-delete in one RPC; previously each entity was deleted twice
202 | # (once in a loop, once again via db.delete)
203 | db.delete(subscriptions)
204 | message.reply("Well done! We deleted all your subscriptions!")
205 | else :
206 | subscription = Subscription.get_by_key_name(hashlib.sha224(message.arg + subscriber).hexdigest())
207 | if(subscription == None):
208 | message.reply("Looks like you were not subscribed to " + message.arg)
209 | else:
210 | result = superfeedr("unsubscribe", subscription)
211 | subscription.delete() # deletes the subscription
212 | message.reply("Well done! You're no longer subscribed to " + message.arg)
213 |
214 | ##
215 | # List subscriptions by page
216 | # 100/page
217 | # page default to 1
218 | def list_command(self, message=None):
219 | message = xmpp.Message(self.request.POST)
220 | subscriber = message.sender.rpartition("/")[0]
221 | query = Subscription.all().filter("jid =",subscriber).order("feed")
222 | count = query.count()
223 | if count == 0:
224 | message.reply("Seems you haven't subscribed to anything yet. Type\n /subscribe http://twitter.com/statuses/user_timeline/43417156.rss\nto play around.")
225 | else:
226 | page_index = int(message.arg or 1)
227 | if count%100 == 0:
228 | pages_count = count/100
229 | else:
230 | pages_count = count/100 + 1
231 |
232 | page_index = min(page_index, pages_count)
233 | offset = (page_index - 1) * 100
234 | subscriptions = query.fetch(100, offset)
235 | message.reply("You have %d subscriptions in total: page %d/%d \n" % (count,page_index,pages_count))
236 | feed_list = [s.feed for s in subscriptions]
237 | message.reply("\n".join(feed_list))
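# Annotation (not part of the original source): the paging above is plain
# Python 2 integer division; e.g. with count = 250, pages_count = 250/100 + 1
# = 3, and '/list 2' calls query.fetch(100, 100), i.e. subscriptions 101-200.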
238 |
239 | ##
240 | # Asking for help
241 | def hello_command(self, message=None):
242 | message = xmpp.Message(self.request.POST)
243 | message.reply("Oh, Hai! " + appname
244 | + " is a small app to help you subscribe to your favorite feeds and get their updates via IM. It's powered by Superfeedr (http://superfeedr.com) and its magic powers! ")
245 | message.reply("Make it better: http://github.com/superfeedr/notifixlight.")
246 | message.reply("For more info, type /help.")
247 |
248 | ##
249 | # Asking for help
250 | def help_command(self, message=None):
251 | message = xmpp.Message(self.request.POST)
252 | help_msg = "It's not even alpha ready, but you can play with the following commands:\n\n" \
253 | "/hello -> about me\n\n" \
254 | "/subscribe <feed url>\n/unsubscribe <feed url> -> subscribe or unsubscribe to a feed\n\n" \
255 | "/list <page> -> list subscriptions, defaults to page 1\n\n" \
256 | "/help -> get this help message\n"
257 | message.reply(help_msg)
258 | message.reply(message.body)
259 |
260 | ##
261 | # All other commands
262 | def unhandled_command(self, message=None):
263 | message = xmpp.Message(self.request.POST)
264 | message.reply("Please, type /help for help.")
265 |
266 | ##
267 | # Sent for any message.
268 | def text_message(self, message=None):
269 | message = xmpp.Message(self.request.POST)
270 | message.reply("Echooooo (when you're done playing, type /help) > " + message.body)
271 |
272 | application = webapp.WSGIApplication([
273 | ('/_ah/xmpp/message/chat/', XMPPHandler),
274 | ('/', MainPage),
275 | ('/subscribe', SubscribePage),
276 | ('/hubbub/(.*)', HubbubSubscriber)
277 | ], debug=True)
278 |
279 | def main():
280 | run_wsgi_app(application)
281 |
282 | if __name__ == "__main__":
283 | main()
284 |
285 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 | Notifixlite
6 |
35 |
36 |
37 |
38 | * Yes, you have to use a Jabber/XMPP client. Add {{ appname }}@appspot.com as a friend.
39 | Powered by Superfeedr on Google App Engine - Take that bad code and make it better.
40 |
41 |
42 |
--------------------------------------------------------------------------------
/templates/subscribe.html:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 | Notifixlite
6 |
35 |
36 |
37 | {% for feed in feeds %}
38 |
39 | {% endfor %}
40 | We use Subtome for subscriptions. Next time you see a subscribe button, you'll be able to pick {{appname}} to subscribe.
41 |
Powered by Superfeedr on Google App Engine - Take that bad code and make it better.
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------