':
844 | depth = 0
845 | for piece in pieces[:-1]:
846 | if piece.startswith(''):
847 | depth -= 1
848 | if depth == 0:
849 | break
850 | elif piece.startswith('<') and not piece.endswith('/>'):
851 | depth += 1
852 | else:
853 | pieces = pieces[1:-1]
854 |
855 | # Ensure each piece is a str for Python 3
856 | for (i, v) in enumerate(pieces):
857 | if not isinstance(v, unicode):
858 | pieces[i] = v.decode('utf-8')
859 |
860 | output = u''.join(pieces)
861 | if stripWhitespace:
862 | output = output.strip()
863 | if not expectingText:
864 | return output
865 |
866 | # decode base64 content
867 | if base64 and self.contentparams.get('base64', 0):
868 | try:
869 | output = _base64decode(output)
870 | except binascii.Error:
871 | pass
872 | except binascii.Incomplete:
873 | pass
874 | except TypeError:
875 | # In Python 3, base64 takes and outputs bytes, not str
876 | # This may not be the most correct way to accomplish this
877 | output = _base64decode(output.encode('utf-8')).decode('utf-8')
878 |
879 | # resolve relative URIs
880 | if (element in self.can_be_relative_uri) and output:
881 | output = self.resolveURI(output)
882 |
883 | # decode entities within embedded markup
884 | if not self.contentparams.get('base64', 0):
885 | output = self.decodeEntities(element, output)
886 |
887 | # some feed formats require consumers to guess
888 | # whether the content is html or plain text
889 | if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
890 | if self.lookslikehtml(output):
891 | self.contentparams['type'] = u'text/html'
892 |
893 | # remove temporary cruft from contentparams
894 | try:
895 | del self.contentparams['mode']
896 | except KeyError:
897 | pass
898 | try:
899 | del self.contentparams['base64']
900 | except KeyError:
901 | pass
902 |
903 | is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
904 | # resolve relative URIs within embedded markup
905 | if is_htmlish and RESOLVE_RELATIVE_URIS:
906 | if element in self.can_contain_relative_uris:
907 | output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))
908 |
909 | # sanitize embedded markup
910 | if is_htmlish and SANITIZE_HTML:
911 | if element in self.can_contain_dangerous_markup:
912 | output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))
913 |
914 | if self.encoding and not isinstance(output, unicode):
915 | output = output.decode(self.encoding, 'ignore')
916 |
917 | # address common error where people take data that is already
918 | # utf-8, presume that it is iso-8859-1, and re-encode it.
919 | if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
920 | try:
921 | output = output.encode('iso-8859-1').decode('utf-8')
922 | except (UnicodeEncodeError, UnicodeDecodeError):
923 | pass
924 |
925 | # map win-1252 extensions to the proper code points
926 | if isinstance(output, unicode):
927 | output = output.translate(_cp1252)
928 |
929 | # categories/tags/keywords/whatever are handled in _end_category
930 | if element == 'category':
931 | return output
932 |
933 | if element == 'title' and -1 < self.title_depth <= self.depth:
934 | return output
935 |
936 | # store output in appropriate place(s)
937 | if self.inentry and not self.insource:
938 | if element == 'content':
939 | self.entries[-1].setdefault(element, [])
940 | contentparams = copy.deepcopy(self.contentparams)
941 | contentparams['value'] = output
942 | self.entries[-1][element].append(contentparams)
943 | elif element == 'link':
944 | if not self.inimage:
945 | # query variables in urls in link elements are improperly
946 | # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
947 | # unhandled character references. fix this special case.
948 | output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
949 | self.entries[-1][element] = output
950 | if output:
951 | self.entries[-1]['links'][-1]['href'] = output
952 | else:
953 | if element == 'description':
954 | element = 'summary'
955 | old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
956 | if old_value_depth is None or self.depth <= old_value_depth:
957 | self.property_depth_map[self.entries[-1]][element] = self.depth
958 | self.entries[-1][element] = output
959 | if self.incontent:
960 | contentparams = copy.deepcopy(self.contentparams)
961 | contentparams['value'] = output
962 | self.entries[-1][element + '_detail'] = contentparams
963 | elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
964 | context = self._getContext()
965 | if element == 'description':
966 | element = 'subtitle'
967 | context[element] = output
968 | if element == 'link':
969 | # fix query variables; see above for the explanation
970 | output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
971 | context[element] = output
972 | context['links'][-1]['href'] = output
973 | elif self.incontent:
974 | contentparams = copy.deepcopy(self.contentparams)
975 | contentparams['value'] = output
976 | context[element + '_detail'] = contentparams
977 | return output
978 |
979 | def pushContent(self, tag, attrsD, defaultContentType, expectingText):
980 | self.incontent += 1
981 | if self.lang:
982 | self.lang=self.lang.replace('_','-')
983 | self.contentparams = FeedParserDict({
984 | 'type': self.mapContentType(attrsD.get('type', defaultContentType)),
985 | 'language': self.lang,
986 | 'base': self.baseuri})
987 | self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
988 | self.push(tag, expectingText)
989 |
990 | def popContent(self, tag):
991 | value = self.pop(tag)
992 | self.incontent -= 1
993 | self.contentparams.clear()
994 | return value
995 |
996 | # a number of elements in a number of RSS variants are nominally plain
997 | # text, but this is routinely ignored. This is an attempt to detect
998 | # the most common cases. As false positives often result in silent
999 | # data loss, this function errs on the conservative side.
1000 | @staticmethod
1001 | def lookslikehtml(s):
1002 | # must have a close tag or an entity reference to qualify
1003 | if not (re.search(r'(\w+)>',s) or re.search("?\w+;",s)):
1004 | return
1005 |
1006 | # all tags must be in a restricted subset of valid HTML tags
1007 | if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
1008 | re.findall(r'?(\w+)',s)):
1009 | return
1010 |
1011 | # all entities must have been defined as valid HTML entities
1012 | if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
1013 | return
1014 |
1015 | return 1
1016 |
1017 | def _mapToStandardPrefix(self, name):
1018 | colonpos = name.find(':')
1019 | if colonpos <> -1:
1020 | prefix = name[:colonpos]
1021 | suffix = name[colonpos+1:]
1022 | prefix = self.namespacemap.get(prefix, prefix)
1023 | name = prefix + ':' + suffix
1024 | return name
1025 |
1026 | def _getAttribute(self, attrsD, name):
1027 | return attrsD.get(self._mapToStandardPrefix(name))
1028 |
1029 | def _isBase64(self, attrsD, contentparams):
1030 | if attrsD.get('mode', '') == 'base64':
1031 | return 1
1032 | if self.contentparams['type'].startswith(u'text/'):
1033 | return 0
1034 | if self.contentparams['type'].endswith(u'+xml'):
1035 | return 0
1036 | if self.contentparams['type'].endswith(u'/xml'):
1037 | return 0
1038 | return 1
1039 |
1040 | def _itsAnHrefDamnIt(self, attrsD):
1041 | href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
1042 | if href:
1043 | try:
1044 | del attrsD['url']
1045 | except KeyError:
1046 | pass
1047 | try:
1048 | del attrsD['uri']
1049 | except KeyError:
1050 | pass
1051 | attrsD['href'] = href
1052 | return attrsD
1053 |
1054 | def _save(self, key, value, overwrite=False):
1055 | context = self._getContext()
1056 | if overwrite:
1057 | context[key] = value
1058 | else:
1059 | context.setdefault(key, value)
1060 |
1061 | def _start_rss(self, attrsD):
1062 | versionmap = {'0.91': u'rss091u',
1063 | '0.92': u'rss092',
1064 | '0.93': u'rss093',
1065 | '0.94': u'rss094'}
1066 | #If we're here then this is an RSS feed.
1067 | #If we don't have a version or have a version that starts with something
1068 | #other than RSS then there's been a mistake. Correct it.
1069 | if not self.version or not self.version.startswith(u'rss'):
1070 | attr_version = attrsD.get('version', '')
1071 | version = versionmap.get(attr_version)
1072 | if version:
1073 | self.version = version
1074 | elif attr_version.startswith('2.'):
1075 | self.version = u'rss20'
1076 | else:
1077 | self.version = u'rss'
1078 |
1079 | def _start_channel(self, attrsD):
1080 | self.infeed = 1
1081 | self._cdf_common(attrsD)
1082 |
1083 | def _cdf_common(self, attrsD):
1084 | if 'lastmod' in attrsD:
1085 | self._start_modified({})
1086 | self.elementstack[-1][-1] = attrsD['lastmod']
1087 | self._end_modified()
1088 | if 'href' in attrsD:
1089 | self._start_link({})
1090 | self.elementstack[-1][-1] = attrsD['href']
1091 | self._end_link()
1092 |
1093 | def _start_feed(self, attrsD):
1094 | self.infeed = 1
1095 | versionmap = {'0.1': u'atom01',
1096 | '0.2': u'atom02',
1097 | '0.3': u'atom03'}
1098 | if not self.version:
1099 | attr_version = attrsD.get('version')
1100 | version = versionmap.get(attr_version)
1101 | if version:
1102 | self.version = version
1103 | else:
1104 | self.version = u'atom'
1105 |
1106 | def _end_channel(self):
1107 | self.infeed = 0
1108 | _end_feed = _end_channel
1109 |
1110 | def _start_image(self, attrsD):
1111 | context = self._getContext()
1112 | if not self.inentry:
1113 | context.setdefault('image', FeedParserDict())
1114 | self.inimage = 1
1115 | self.title_depth = -1
1116 | self.push('image', 0)
1117 |
1118 | def _end_image(self):
1119 | self.pop('image')
1120 | self.inimage = 0
1121 |
1122 | def _start_textinput(self, attrsD):
1123 | context = self._getContext()
1124 | context.setdefault('textinput', FeedParserDict())
1125 | self.intextinput = 1
1126 | self.title_depth = -1
1127 | self.push('textinput', 0)
1128 | _start_textInput = _start_textinput
1129 |
1130 | def _end_textinput(self):
1131 | self.pop('textinput')
1132 | self.intextinput = 0
1133 | _end_textInput = _end_textinput
1134 |
1135 | def _start_author(self, attrsD):
1136 | self.inauthor = 1
1137 | self.push('author', 1)
1138 | # Append a new FeedParserDict when expecting an author
1139 | context = self._getContext()
1140 | context.setdefault('authors', [])
1141 | context['authors'].append(FeedParserDict())
1142 | _start_managingeditor = _start_author
1143 | _start_dc_author = _start_author
1144 | _start_dc_creator = _start_author
1145 | _start_itunes_author = _start_author
1146 |
1147 | def _end_author(self):
1148 | self.pop('author')
1149 | self.inauthor = 0
1150 | self._sync_author_detail()
1151 | _end_managingeditor = _end_author
1152 | _end_dc_author = _end_author
1153 | _end_dc_creator = _end_author
1154 | _end_itunes_author = _end_author
1155 |
1156 | def _start_itunes_owner(self, attrsD):
1157 | self.inpublisher = 1
1158 | self.push('publisher', 0)
1159 |
1160 | def _end_itunes_owner(self):
1161 | self.pop('publisher')
1162 | self.inpublisher = 0
1163 | self._sync_author_detail('publisher')
1164 |
1165 | def _start_contributor(self, attrsD):
1166 | self.incontributor = 1
1167 | context = self._getContext()
1168 | context.setdefault('contributors', [])
1169 | context['contributors'].append(FeedParserDict())
1170 | self.push('contributor', 0)
1171 |
1172 | def _end_contributor(self):
1173 | self.pop('contributor')
1174 | self.incontributor = 0
1175 |
1176 | def _start_dc_contributor(self, attrsD):
1177 | self.incontributor = 1
1178 | context = self._getContext()
1179 | context.setdefault('contributors', [])
1180 | context['contributors'].append(FeedParserDict())
1181 | self.push('name', 0)
1182 |
1183 | def _end_dc_contributor(self):
1184 | self._end_name()
1185 | self.incontributor = 0
1186 |
1187 | def _start_name(self, attrsD):
1188 | self.push('name', 0)
1189 | _start_itunes_name = _start_name
1190 |
1191 | def _end_name(self):
1192 | value = self.pop('name')
1193 | if self.inpublisher:
1194 | self._save_author('name', value, 'publisher')
1195 | elif self.inauthor:
1196 | self._save_author('name', value)
1197 | elif self.incontributor:
1198 | self._save_contributor('name', value)
1199 | elif self.intextinput:
1200 | context = self._getContext()
1201 | context['name'] = value
1202 | _end_itunes_name = _end_name
1203 |
1204 | def _start_width(self, attrsD):
1205 | self.push('width', 0)
1206 |
1207 | def _end_width(self):
1208 | value = self.pop('width')
1209 | try:
1210 | value = int(value)
1211 | except ValueError:
1212 | value = 0
1213 | if self.inimage:
1214 | context = self._getContext()
1215 | context['width'] = value
1216 |
1217 | def _start_height(self, attrsD):
1218 | self.push('height', 0)
1219 |
1220 | def _end_height(self):
1221 | value = self.pop('height')
1222 | try:
1223 | value = int(value)
1224 | except ValueError:
1225 | value = 0
1226 | if self.inimage:
1227 | context = self._getContext()
1228 | context['height'] = value
1229 |
1230 | def _start_url(self, attrsD):
1231 | self.push('href', 1)
1232 | _start_homepage = _start_url
1233 | _start_uri = _start_url
1234 |
1235 | def _end_url(self):
1236 | value = self.pop('href')
1237 | if self.inauthor:
1238 | self._save_author('href', value)
1239 | elif self.incontributor:
1240 | self._save_contributor('href', value)
1241 | _end_homepage = _end_url
1242 | _end_uri = _end_url
1243 |
1244 | def _start_email(self, attrsD):
1245 | self.push('email', 0)
1246 | _start_itunes_email = _start_email
1247 |
1248 | def _end_email(self):
1249 | value = self.pop('email')
1250 | if self.inpublisher:
1251 | self._save_author('email', value, 'publisher')
1252 | elif self.inauthor:
1253 | self._save_author('email', value)
1254 | elif self.incontributor:
1255 | self._save_contributor('email', value)
1256 | _end_itunes_email = _end_email
1257 |
1258 | def _getContext(self):
1259 | if self.insource:
1260 | context = self.sourcedata
1261 | elif self.inimage and 'image' in self.feeddata:
1262 | context = self.feeddata['image']
1263 | elif self.intextinput:
1264 | context = self.feeddata['textinput']
1265 | elif self.inentry:
1266 | context = self.entries[-1]
1267 | else:
1268 | context = self.feeddata
1269 | return context
1270 |
1271 | def _save_author(self, key, value, prefix='author'):
1272 | context = self._getContext()
1273 | context.setdefault(prefix + '_detail', FeedParserDict())
1274 | context[prefix + '_detail'][key] = value
1275 | self._sync_author_detail()
1276 | context.setdefault('authors', [FeedParserDict()])
1277 | context['authors'][-1][key] = value
1278 |
1279 | def _save_contributor(self, key, value):
1280 | context = self._getContext()
1281 | context.setdefault('contributors', [FeedParserDict()])
1282 | context['contributors'][-1][key] = value
1283 |
1284 | def _sync_author_detail(self, key='author'):
1285 | context = self._getContext()
1286 | detail = context.get('%s_detail' % key)
1287 | if detail:
1288 | name = detail.get('name')
1289 | email = detail.get('email')
1290 | if name and email:
1291 | context[key] = u'%s (%s)' % (name, email)
1292 | elif name:
1293 | context[key] = name
1294 | elif email:
1295 | context[key] = email
1296 | else:
1297 | author, email = context.get(key), None
1298 | if not author:
1299 | return
1300 | emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
1301 | if emailmatch:
1302 | email = emailmatch.group(0)
1303 | # probably a better way to do the following, but it passes all the tests
1304 | author = author.replace(email, u'')
1305 | author = author.replace(u'()', u'')
1306 | author = author.replace(u'<>', u'')
1307 | author = author.replace(u'<>', u'')
1308 | author = author.strip()
1309 | if author and (author[0] == u'('):
1310 | author = author[1:]
1311 | if author and (author[-1] == u')'):
1312 | author = author[:-1]
1313 | author = author.strip()
1314 | if author or email:
1315 | context.setdefault('%s_detail' % key, FeedParserDict())
1316 | if author:
1317 | context['%s_detail' % key]['name'] = author
1318 | if email:
1319 | context['%s_detail' % key]['email'] = email
1320 |
1321 | def _start_subtitle(self, attrsD):
1322 | self.pushContent('subtitle', attrsD, u'text/plain', 1)
1323 | _start_tagline = _start_subtitle
1324 | _start_itunes_subtitle = _start_subtitle
1325 |
1326 | def _end_subtitle(self):
1327 | self.popContent('subtitle')
1328 | _end_tagline = _end_subtitle
1329 | _end_itunes_subtitle = _end_subtitle
1330 |
1331 | def _start_rights(self, attrsD):
1332 | self.pushContent('rights', attrsD, u'text/plain', 1)
1333 | _start_dc_rights = _start_rights
1334 | _start_copyright = _start_rights
1335 |
1336 | def _end_rights(self):
1337 | self.popContent('rights')
1338 | _end_dc_rights = _end_rights
1339 | _end_copyright = _end_rights
1340 |
1341 | def _start_item(self, attrsD):
1342 | self.entries.append(FeedParserDict())
1343 | self.push('item', 0)
1344 | self.inentry = 1
1345 | self.guidislink = 0
1346 | self.title_depth = -1
1347 | self.psc_chapters_counter = 0
1348 | id = self._getAttribute(attrsD, 'rdf:about')
1349 | if id:
1350 | context = self._getContext()
1351 | context['id'] = id
1352 | self._cdf_common(attrsD)
1353 | _start_entry = _start_item
1354 |
1355 | def _end_item(self):
1356 | self.pop('item')
1357 | self.inentry = 0
1358 | _end_entry = _end_item
1359 |
1360 | def _start_dc_language(self, attrsD):
1361 | self.push('language', 1)
1362 | _start_language = _start_dc_language
1363 |
1364 | def _end_dc_language(self):
1365 | self.lang = self.pop('language')
1366 | _end_language = _end_dc_language
1367 |
1368 | def _start_dc_publisher(self, attrsD):
1369 | self.push('publisher', 1)
1370 | _start_webmaster = _start_dc_publisher
1371 |
1372 | def _end_dc_publisher(self):
1373 | self.pop('publisher')
1374 | self._sync_author_detail('publisher')
1375 | _end_webmaster = _end_dc_publisher
1376 |
1377 | def _start_published(self, attrsD):
1378 | self.push('published', 1)
1379 | _start_dcterms_issued = _start_published
1380 | _start_issued = _start_published
1381 | _start_pubdate = _start_published
1382 |
1383 | def _end_published(self):
1384 | value = self.pop('published')
1385 | self._save('published_parsed', _parse_date(value), overwrite=True)
1386 | _end_dcterms_issued = _end_published
1387 | _end_issued = _end_published
1388 | _end_pubdate = _end_published
1389 |
1390 | def _start_updated(self, attrsD):
1391 | self.push('updated', 1)
1392 | _start_modified = _start_updated
1393 | _start_dcterms_modified = _start_updated
1394 | _start_dc_date = _start_updated
1395 | _start_lastbuilddate = _start_updated
1396 |
1397 | def _end_updated(self):
1398 | value = self.pop('updated')
1399 | parsed_value = _parse_date(value)
1400 | self._save('updated_parsed', parsed_value, overwrite=True)
1401 | _end_modified = _end_updated
1402 | _end_dcterms_modified = _end_updated
1403 | _end_dc_date = _end_updated
1404 | _end_lastbuilddate = _end_updated
1405 |
1406 | def _start_created(self, attrsD):
1407 | self.push('created', 1)
1408 | _start_dcterms_created = _start_created
1409 |
1410 | def _end_created(self):
1411 | value = self.pop('created')
1412 | self._save('created_parsed', _parse_date(value), overwrite=True)
1413 | _end_dcterms_created = _end_created
1414 |
1415 | def _start_expirationdate(self, attrsD):
1416 | self.push('expired', 1)
1417 |
1418 | def _end_expirationdate(self):
1419 | self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
1420 |
1421 | # geospatial location, or "where", from georss.org
1422 |
1423 | def _start_georssgeom(self, attrsD):
1424 | self.push('geometry', 0)
1425 | context = self._getContext()
1426 | context['where'] = FeedParserDict()
1427 |
1428 | _start_georss_point = _start_georssgeom
1429 | _start_georss_line = _start_georssgeom
1430 | _start_georss_polygon = _start_georssgeom
1431 | _start_georss_box = _start_georssgeom
1432 |
1433 | def _save_where(self, geometry):
1434 | context = self._getContext()
1435 | context['where'].update(geometry)
1436 |
1437 | def _end_georss_point(self):
1438 | geometry = _parse_georss_point(self.pop('geometry'))
1439 | if geometry:
1440 | self._save_where(geometry)
1441 |
1442 | def _end_georss_line(self):
1443 | geometry = _parse_georss_line(self.pop('geometry'))
1444 | if geometry:
1445 | self._save_where(geometry)
1446 |
1447 | def _end_georss_polygon(self):
1448 | this = self.pop('geometry')
1449 | geometry = _parse_georss_polygon(this)
1450 | if geometry:
1451 | self._save_where(geometry)
1452 |
1453 | def _end_georss_box(self):
1454 | geometry = _parse_georss_box(self.pop('geometry'))
1455 | if geometry:
1456 | self._save_where(geometry)
1457 |
1458 | def _start_where(self, attrsD):
1459 | self.push('where', 0)
1460 | context = self._getContext()
1461 | context['where'] = FeedParserDict()
1462 | _start_georss_where = _start_where
1463 |
1464 | def _parse_srs_attrs(self, attrsD):
1465 | srsName = attrsD.get('srsname')
1466 | try:
1467 | srsDimension = int(attrsD.get('srsdimension', '2'))
1468 | except ValueError:
1469 | srsDimension = 2
1470 | context = self._getContext()
1471 | context['where']['srsName'] = srsName
1472 | context['where']['srsDimension'] = srsDimension
1473 |
1474 | def _start_gml_point(self, attrsD):
1475 | self._parse_srs_attrs(attrsD)
1476 | self.ingeometry = 1
1477 | self.push('geometry', 0)
1478 |
1479 | def _start_gml_linestring(self, attrsD):
1480 | self._parse_srs_attrs(attrsD)
1481 | self.ingeometry = 'linestring'
1482 | self.push('geometry', 0)
1483 |
1484 | def _start_gml_polygon(self, attrsD):
1485 | self._parse_srs_attrs(attrsD)
1486 | self.push('geometry', 0)
1487 |
1488 | def _start_gml_exterior(self, attrsD):
1489 | self.push('geometry', 0)
1490 |
1491 | def _start_gml_linearring(self, attrsD):
1492 | self.ingeometry = 'polygon'
1493 | self.push('geometry', 0)
1494 |
1495 | def _start_gml_pos(self, attrsD):
1496 | self.push('pos', 0)
1497 |
1498 | def _end_gml_pos(self):
1499 | this = self.pop('pos')
1500 | context = self._getContext()
1501 | srsName = context['where'].get('srsName')
1502 | srsDimension = context['where'].get('srsDimension', 2)
1503 | swap = True
1504 | if srsName and "EPSG" in srsName:
1505 | epsg = int(srsName.split(":")[-1])
1506 | swap = bool(epsg in _geogCS)
1507 | geometry = _parse_georss_point(this, swap=swap, dims=srsDimension)
1508 | if geometry:
1509 | self._save_where(geometry)
1510 |
1511 | def _start_gml_poslist(self, attrsD):
1512 | self.push('pos', 0)
1513 |
1514 | def _end_gml_poslist(self):
1515 | this = self.pop('pos')
1516 | context = self._getContext()
1517 | srsName = context['where'].get('srsName')
1518 | srsDimension = context['where'].get('srsDimension', 2)
1519 | swap = True
1520 | if srsName and "EPSG" in srsName:
1521 | epsg = int(srsName.split(":")[-1])
1522 | swap = bool(epsg in _geogCS)
1523 | geometry = _parse_poslist(
1524 | this, self.ingeometry, swap=swap, dims=srsDimension)
1525 | if geometry:
1526 | self._save_where(geometry)
1527 |
1528 | def _end_geom(self):
1529 | self.ingeometry = 0
1530 | self.pop('geometry')
1531 | _end_gml_point = _end_geom
1532 | _end_gml_linestring = _end_geom
1533 | _end_gml_linearring = _end_geom
1534 | _end_gml_exterior = _end_geom
1535 | _end_gml_polygon = _end_geom
1536 |
1537 | def _end_where(self):
1538 | self.pop('where')
1539 | _end_georss_where = _end_where
1540 |
1541 | # end geospatial
1542 |
1543 | def _start_cc_license(self, attrsD):
1544 | context = self._getContext()
1545 | value = self._getAttribute(attrsD, 'rdf:resource')
1546 | attrsD = FeedParserDict()
1547 | attrsD['rel'] = u'license'
1548 | if value:
1549 | attrsD['href']=value
1550 | context.setdefault('links', []).append(attrsD)
1551 |
1552 | def _start_creativecommons_license(self, attrsD):
1553 | self.push('license', 1)
1554 | _start_creativeCommons_license = _start_creativecommons_license
1555 |
1556 | def _end_creativecommons_license(self):
1557 | value = self.pop('license')
1558 | context = self._getContext()
1559 | attrsD = FeedParserDict()
1560 | attrsD['rel'] = u'license'
1561 | if value:
1562 | attrsD['href'] = value
1563 | context.setdefault('links', []).append(attrsD)
1564 | del context['license']
1565 | _end_creativeCommons_license = _end_creativecommons_license
1566 |
1567 | def _addTag(self, term, scheme, label):
1568 | context = self._getContext()
1569 | tags = context.setdefault('tags', [])
1570 | if (not term) and (not scheme) and (not label):
1571 | return
1572 | value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1573 | if value not in tags:
1574 | tags.append(value)
1575 |
1576 | def _start_category(self, attrsD):
1577 | term = attrsD.get('term')
1578 | scheme = attrsD.get('scheme', attrsD.get('domain'))
1579 | label = attrsD.get('label')
1580 | self._addTag(term, scheme, label)
1581 | self.push('category', 1)
1582 | _start_dc_subject = _start_category
1583 | _start_keywords = _start_category
1584 |
1585 | def _start_media_category(self, attrsD):
1586 | attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
1587 | self._start_category(attrsD)
1588 |
1589 | def _end_itunes_keywords(self):
1590 | for term in self.pop('itunes_keywords').split(','):
1591 | if term.strip():
1592 | self._addTag(term.strip(), u'http://www.itunes.com/', None)
1593 |
1594 | def _start_itunes_category(self, attrsD):
1595 | self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
1596 | self.push('category', 1)
1597 |
1598 | def _end_category(self):
1599 | value = self.pop('category')
1600 | if not value:
1601 | return
1602 | context = self._getContext()
1603 | tags = context['tags']
1604 | if value and len(tags) and not tags[-1]['term']:
1605 | tags[-1]['term'] = value
1606 | else:
1607 | self._addTag(value, None, None)
1608 | _end_dc_subject = _end_category
1609 | _end_keywords = _end_category
1610 | _end_itunes_category = _end_category
1611 | _end_media_category = _end_category
1612 |
1613 | def _start_cloud(self, attrsD):
1614 | self._getContext()['cloud'] = FeedParserDict(attrsD)
1615 |
1616 | def _start_link(self, attrsD):
1617 | attrsD.setdefault('rel', u'alternate')
1618 | if attrsD['rel'] == u'self':
1619 | attrsD.setdefault('type', u'application/atom+xml')
1620 | else:
1621 | attrsD.setdefault('type', u'text/html')
1622 | context = self._getContext()
1623 | attrsD = self._itsAnHrefDamnIt(attrsD)
1624 | if 'href' in attrsD:
1625 | attrsD['href'] = self.resolveURI(attrsD['href'])
1626 | expectingText = self.infeed or self.inentry or self.insource
1627 | context.setdefault('links', [])
1628 | if not (self.inentry and self.inimage):
1629 | context['links'].append(FeedParserDict(attrsD))
1630 | if 'href' in attrsD:
1631 | expectingText = 0
1632 | if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1633 | context['link'] = attrsD['href']
1634 | else:
1635 | self.push('link', expectingText)
1636 |
1637 | def _end_link(self):
1638 | value = self.pop('link')
1639 |
1640 | def _start_guid(self, attrsD):
1641 | self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1642 | self.push('id', 1)
1643 | _start_id = _start_guid
1644 |
1645 | def _end_guid(self):
1646 | value = self.pop('id')
1647 | self._save('guidislink', self.guidislink and 'link' not in self._getContext())
1648 | if self.guidislink:
1649 | # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1650 | # and only if the item doesn't already have a link element
1651 | self._save('link', value)
1652 | _end_id = _end_guid
1653 |
1654 | def _start_title(self, attrsD):
1655 | if self.svgOK:
1656 | return self.unknown_starttag('title', attrsD.items())
1657 | self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1658 | _start_dc_title = _start_title
1659 | _start_media_title = _start_title
1660 |
1661 | def _end_title(self):
1662 | if self.svgOK:
1663 | return
1664 | value = self.popContent('title')
1665 | if not value:
1666 | return
1667 | self.title_depth = self.depth
1668 | _end_dc_title = _end_title
1669 |
1670 | def _end_media_title(self):
1671 | title_depth = self.title_depth
1672 | self._end_title()
1673 | self.title_depth = title_depth
1674 |
1675 | def _start_description(self, attrsD):
1676 | context = self._getContext()
1677 | if 'summary' in context:
1678 | self._summaryKey = 'content'
1679 | self._start_content(attrsD)
1680 | else:
1681 | self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
1682 | _start_dc_description = _start_description
1683 | _start_media_description = _start_description
1684 |
1685 | def _start_abstract(self, attrsD):
1686 | self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1687 |
1688 | def _end_description(self):
1689 | if self._summaryKey == 'content':
1690 | self._end_content()
1691 | else:
1692 | value = self.popContent('description')
1693 | self._summaryKey = None
1694 | _end_abstract = _end_description
1695 | _end_dc_description = _end_description
1696 | _end_media_description = _end_description
1697 |
1698 | def _start_info(self, attrsD):
1699 | self.pushContent('info', attrsD, u'text/plain', 1)
1700 | _start_feedburner_browserfriendly = _start_info
1701 |
1702 | def _end_info(self):
1703 | self.popContent('info')
1704 | _end_feedburner_browserfriendly = _end_info
1705 |
1706 | def _start_generator(self, attrsD):
1707 | if attrsD:
1708 | attrsD = self._itsAnHrefDamnIt(attrsD)
1709 | if 'href' in attrsD:
1710 | attrsD['href'] = self.resolveURI(attrsD['href'])
1711 | self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1712 | self.push('generator', 1)
1713 |
1714 | def _end_generator(self):
1715 | value = self.pop('generator')
1716 | context = self._getContext()
1717 | if 'generator_detail' in context:
1718 | context['generator_detail']['name'] = value
1719 |
1720 | def _start_admin_generatoragent(self, attrsD):
1721 | self.push('generator', 1)
1722 | value = self._getAttribute(attrsD, 'rdf:resource')
1723 | if value:
1724 | self.elementstack[-1][2].append(value)
1725 | self.pop('generator')
1726 | self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1727 |
1728 | def _start_admin_errorreportsto(self, attrsD):
1729 | self.push('errorreportsto', 1)
1730 | value = self._getAttribute(attrsD, 'rdf:resource')
1731 | if value:
1732 | self.elementstack[-1][2].append(value)
1733 | self.pop('errorreportsto')
1734 |
1735 | def _start_summary(self, attrsD):
1736 | context = self._getContext()
1737 | if 'summary' in context:
1738 | self._summaryKey = 'content'
1739 | self._start_content(attrsD)
1740 | else:
1741 | self._summaryKey = 'summary'
1742 | self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
1743 | _start_itunes_summary = _start_summary
1744 |
1745 | def _end_summary(self):
1746 | if self._summaryKey == 'content':
1747 | self._end_content()
1748 | else:
1749 | self.popContent(self._summaryKey or 'summary')
1750 | self._summaryKey = None
1751 | _end_itunes_summary = _end_summary
1752 |
1753 | def _start_enclosure(self, attrsD):
1754 | attrsD = self._itsAnHrefDamnIt(attrsD)
1755 | context = self._getContext()
1756 | attrsD['rel'] = u'enclosure'
1757 | context.setdefault('links', []).append(FeedParserDict(attrsD))
1758 |
1759 | def _start_source(self, attrsD):
1760 | if 'url' in attrsD:
1761 | # This means that we're processing a source element from an RSS 2.0 feed
1762 | self.sourcedata['href'] = attrsD[u'url']
1763 | self.push('source', 1)
1764 | self.insource = 1
1765 | self.title_depth = -1
1766 |
1767 | def _end_source(self):
1768 | self.insource = 0
1769 | value = self.pop('source')
1770 | if value:
1771 | self.sourcedata['title'] = value
1772 | self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1773 | self.sourcedata.clear()
1774 |
1775 | def _start_content(self, attrsD):
1776 | self.pushContent('content', attrsD, u'text/plain', 1)
1777 | src = attrsD.get('src')
1778 | if src:
1779 | self.contentparams['src'] = src
1780 | self.push('content', 1)
1781 |
1782 | def _start_body(self, attrsD):
1783 | self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
1784 | _start_xhtml_body = _start_body
1785 |
1786 | def _start_content_encoded(self, attrsD):
1787 | self.pushContent('content', attrsD, u'text/html', 1)
1788 | _start_fullitem = _start_content_encoded
1789 |
1790 | def _end_content(self):
1791 | copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
1792 | value = self.popContent('content')
1793 | if copyToSummary:
1794 | self._save('summary', value)
1795 |
1796 | _end_body = _end_content
1797 | _end_xhtml_body = _end_content
1798 | _end_content_encoded = _end_content
1799 | _end_fullitem = _end_content
1800 |
1801 | def _start_itunes_image(self, attrsD):
1802 | self.push('itunes_image', 0)
1803 | if attrsD.get('href'):
1804 | self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1805 | elif attrsD.get('url'):
1806 | self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
1807 | _start_itunes_link = _start_itunes_image
1808 |
1809 | def _end_itunes_block(self):
1810 | value = self.pop('itunes_block', 0)
1811 | self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1812 |
1813 | def _end_itunes_explicit(self):
1814 | value = self.pop('itunes_explicit', 0)
1815 | # Convert 'yes' -> True, 'clean' to False, and any other value to None
1816 | # False and None both evaluate as False, so the difference can be ignored
1817 | # by applications that only need to know if the content is explicit.
1818 | self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
1819 |
1820 | def _start_media_group(self, attrsD):
1821 | # don't do anything, but don't break the enclosed tags either
1822 | pass
1823 |
1824 | def _start_media_credit(self, attrsD):
1825 | context = self._getContext()
1826 | context.setdefault('media_credit', [])
1827 | context['media_credit'].append(attrsD)
1828 | self.push('credit', 1)
1829 |
1830 | def _end_media_credit(self):
1831 | credit = self.pop('credit')
1832 | if credit != None and len(credit.strip()) != 0:
1833 | context = self._getContext()
1834 | context['media_credit'][-1]['content'] = credit
1835 |
1836 | def _start_media_restriction(self, attrsD):
1837 | context = self._getContext()
1838 | context.setdefault('media_restriction', attrsD)
1839 | self.push('restriction', 1)
1840 |
1841 | def _end_media_restriction(self):
1842 | restriction = self.pop('restriction')
1843 | if restriction != None and len(restriction.strip()) != 0:
1844 | context = self._getContext()
1845 | context['media_restriction']['content'] = restriction
1846 |
1847 | def _start_media_license(self, attrsD):
1848 | context = self._getContext()
1849 | context.setdefault('media_license', attrsD)
1850 | self.push('license', 1)
1851 |
1852 | def _end_media_license(self):
1853 | license = self.pop('license')
1854 | if license != None and len(license.strip()) != 0:
1855 | context = self._getContext()
1856 | context['media_license']['content'] = license
1857 |
1858 | def _start_media_content(self, attrsD):
1859 | context = self._getContext()
1860 | context.setdefault('media_content', [])
1861 | context['media_content'].append(attrsD)
1862 |
1863 | def _start_media_thumbnail(self, attrsD):
1864 | context = self._getContext()
1865 | context.setdefault('media_thumbnail', [])
1866 | self.push('url', 1) # new
1867 | context['media_thumbnail'].append(attrsD)
1868 |
1869 | def _end_media_thumbnail(self):
1870 | url = self.pop('url')
1871 | context = self._getContext()
1872 | if url != None and len(url.strip()) != 0:
1873 | if 'url' not in context['media_thumbnail'][-1]:
1874 | context['media_thumbnail'][-1]['url'] = url
1875 |
1876 | def _start_media_player(self, attrsD):
1877 | self.push('media_player', 0)
1878 | self._getContext()['media_player'] = FeedParserDict(attrsD)
1879 |
1880 | def _end_media_player(self):
1881 | value = self.pop('media_player')
1882 | context = self._getContext()
1883 | context['media_player']['content'] = value
1884 |
1885 | def _start_newlocation(self, attrsD):
1886 | self.push('newlocation', 1)
1887 |
1888 | def _end_newlocation(self):
1889 | url = self.pop('newlocation')
1890 | context = self._getContext()
1891 | # don't set newlocation if the context isn't right
1892 | if context is not self.feeddata:
1893 | return
1894 | context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
1895 |
1896 | def _start_psc_chapters(self, attrsD):
1897 | version = self._getAttribute(attrsD, 'version')
1898 | if version == '1.1' and self.psc_chapters_counter == 0:
1899 | self.psc_chapters_counter += 1
1900 | attrsD['chapters'] = []
1901 | self._getContext()['psc_chapters'] = FeedParserDict(attrsD)
1902 |
1903 | def _end_psc_chapters(self):
1904 | version = self._getContext()['psc_chapters']['version']
1905 | if version == '1.1':
1906 | self.psc_chapters_counter += 1
1907 |
1908 | def _start_psc_chapter(self, attrsD):
1909 | if self.psc_chapters_counter == 1:
1910 | start = self._getAttribute(attrsD, 'start')
1911 | attrsD['start_parsed'] = _parse_psc_chapter_start(start)
1912 |
1913 | context = self._getContext()['psc_chapters']
1914 | context['chapters'].append(FeedParserDict(attrsD))
1915 |
1916 |
1917 | if _XML_AVAILABLE:
1918 | class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
1919 | def __init__(self, baseuri, baselang, encoding):
1920 | xml.sax.handler.ContentHandler.__init__(self)
1921 | _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1922 | self.bozo = 0
1923 | self.exc = None
1924 | self.decls = {}
1925 |
1926 | def startPrefixMapping(self, prefix, uri):
1927 | if not uri:
1928 | return
1929 | # Jython uses '' instead of None; standardize on None
1930 | prefix = prefix or None
1931 | self.trackNamespace(prefix, uri)
1932 | if prefix and uri == 'http://www.w3.org/1999/xlink':
1933 | self.decls['xmlns:' + prefix] = uri
1934 |
1935 | def startElementNS(self, name, qname, attrs):
1936 | namespace, localname = name
1937 | lowernamespace = str(namespace or '').lower()
1938 | if lowernamespace.find(u'backend.userland.com/rss') <> -1:
1939 | # match any backend.userland.com namespace
1940 | namespace = u'http://backend.userland.com/rss'
1941 | lowernamespace = namespace
1942 | if qname and qname.find(':') > 0:
1943 | givenprefix = qname.split(':')[0]
1944 | else:
1945 | givenprefix = None
1946 | prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1947 | if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
1948 | raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
1949 | localname = str(localname).lower()
1950 |
1951 | # qname implementation is horribly broken in Python 2.1 (it
1952 | # doesn't report any), and slightly broken in Python 2.2 (it
1953 | # doesn't report the xml: namespace). So we match up namespaces
1954 | # with a known list first, and then possibly override them with
1955 | # the qnames the SAX parser gives us (if indeed it gives us any
1956 | # at all). Thanks to MatejC for helping me test this and
1957 | # tirelessly telling me that it didn't work yet.
1958 | attrsD, self.decls = self.decls, {}
1959 | if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
1960 | attrsD['xmlns']=namespace
1961 | if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
1962 | attrsD['xmlns']=namespace
1963 |
1964 | if prefix:
1965 | localname = prefix.lower() + ':' + localname
1966 | elif namespace and not qname: #Expat
1967 | for name,value in self.namespacesInUse.items():
1968 | if name and value == namespace:
1969 | localname = name + ':' + localname
1970 | break
1971 |
1972 | for (namespace, attrlocalname), attrvalue in attrs.items():
1973 | lowernamespace = (namespace or '').lower()
1974 | prefix = self._matchnamespaces.get(lowernamespace, '')
1975 | if prefix:
1976 | attrlocalname = prefix + ':' + attrlocalname
1977 | attrsD[str(attrlocalname).lower()] = attrvalue
1978 | for qname in attrs.getQNames():
1979 | attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
1980 | localname = str(localname).lower()
1981 | self.unknown_starttag(localname, attrsD.items())
1982 |
1983 | def characters(self, text):
1984 | self.handle_data(text)
1985 |
1986 | def endElementNS(self, name, qname):
1987 | namespace, localname = name
1988 | lowernamespace = str(namespace or '').lower()
1989 | if qname and qname.find(':') > 0:
1990 | givenprefix = qname.split(':')[0]
1991 | else:
1992 | givenprefix = ''
1993 | prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1994 | if prefix:
1995 | localname = prefix + ':' + localname
1996 | elif namespace and not qname: #Expat
1997 | for name,value in self.namespacesInUse.items():
1998 | if name and value == namespace:
1999 | localname = name + ':' + localname
2000 | break
2001 | localname = str(localname).lower()
2002 | self.unknown_endtag(localname)
2003 |
2004 | def error(self, exc):
2005 | self.bozo = 1
2006 | self.exc = exc
2007 |
2008 | # drv_libxml2 calls warning() in some cases
2009 | warning = error
2010 |
2011 | def fatalError(self, exc):
2012 | self.error(exc)
2013 | raise exc
2014 |
2015 | class _BaseHTMLProcessor(sgmllib.SGMLParser):
2016 | special = re.compile('''[<>'"]''')
2017 | bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
2018 | elements_no_end_tag = set([
2019 | 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
2020 | 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
2021 | 'source', 'track', 'wbr'
2022 | ])
2023 |
2024 | def __init__(self, encoding, _type):
2025 | self.encoding = encoding
2026 | self._type = _type
2027 | sgmllib.SGMLParser.__init__(self)
2028 |
2029 | def reset(self):
2030 | self.pieces = []
2031 | sgmllib.SGMLParser.reset(self)
2032 |
2033 | def _shorttag_replace(self, match):
2034 | tag = match.group(1)
2035 | if tag in self.elements_no_end_tag:
2036 | return '<' + tag + ' />'
2037 | else:
2038 | return '<' + tag + '>' + tag + '>'
2039 |
2040 | # By declaring these methods and overriding their compiled code
2041 | # with the code from sgmllib, the original code will execute in
2042 | # feedparser's scope instead of sgmllib's. This means that the
2043 | # `tagfind` and `charref` regular expressions will be found as
2044 | # they're declared above, not as they're declared in sgmllib.
2045 | def goahead(self, i):
2046 | pass
2047 | goahead.func_code = sgmllib.SGMLParser.goahead.func_code
2048 |
2049 | def __parse_starttag(self, i):
2050 | pass
2051 | __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
2052 |
2053 | def parse_starttag(self,i):
2054 | j = self.__parse_starttag(i)
2055 | if self._type == 'application/xhtml+xml':
2056 | if j>2 and self.rawdata[j-2:j]=='/>':
2057 | self.unknown_endtag(self.lasttag)
2058 | return j
2059 |
2060 | def feed(self, data):
2061 | data = re.compile(r'\s]+?)\s*/>', self._shorttag_replace, data)
2063 | data = data.replace(''', "'")
2064 | data = data.replace('"', '"')
2065 | try:
2066 | bytes
2067 | if bytes is str:
2068 | raise NameError
2069 | self.encoding = self.encoding + u'_INVALID_PYTHON_3'
2070 | except NameError:
2071 | if self.encoding and isinstance(data, unicode):
2072 | data = data.encode(self.encoding)
2073 | sgmllib.SGMLParser.feed(self, data)
2074 | sgmllib.SGMLParser.close(self)
2075 |
2076 | def normalize_attrs(self, attrs):
2077 | if not attrs:
2078 | return attrs
2079 | # utility method to be called by descendants
2080 | attrs = dict([(k.lower(), v) for k, v in attrs]).items()
2081 | attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
2082 | attrs.sort()
2083 | return attrs
2084 |
2085 | def unknown_starttag(self, tag, attrs):
2086 | # called for each start tag
2087 | # attrs is a list of (attr, value) tuples
2088 | # e.g. for
, tag='pre', attrs=[('class', 'screen')]
2089 | uattrs = []
2090 | strattrs=''
2091 | if attrs:
2092 | for key, value in attrs:
2093 | value=value.replace('>','>').replace('<','<').replace('"','"')
2094 | value = self.bare_ampersand.sub("&", value)
2095 | # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
2096 | if not isinstance(value, unicode):
2097 | value = value.decode(self.encoding, 'ignore')
2098 | try:
2099 | # Currently, in Python 3 the key is already a str, and cannot be decoded again
2100 | uattrs.append((unicode(key, self.encoding), value))
2101 | except TypeError:
2102 | uattrs.append((key, value))
2103 | strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
2104 | if self.encoding:
2105 | try:
2106 | strattrs = strattrs.encode(self.encoding)
2107 | except (UnicodeEncodeError, LookupError):
2108 | pass
2109 | if tag in self.elements_no_end_tag:
2110 | self.pieces.append('<%s%s />' % (tag, strattrs))
2111 | else:
2112 | self.pieces.append('<%s%s>' % (tag, strattrs))
2113 |
2114 | def unknown_endtag(self, tag):
2115 | # called for each end tag, e.g. for
, tag will be 'pre'
2116 | # Reconstruct the original end tag.
2117 | if tag not in self.elements_no_end_tag:
2118 | self.pieces.append("%s>" % tag)
2119 |
2120 | def handle_charref(self, ref):
2121 | # called for each character reference, e.g. for ' ', ref will be '160'
2122 | # Reconstruct the original character reference.
2123 | ref = ref.lower()
2124 | if ref.startswith('x'):
2125 | value = int(ref[1:], 16)
2126 | else:
2127 | value = int(ref)
2128 |
2129 | if value in _cp1252:
2130 | self.pieces.append('%s;' % hex(ord(_cp1252[value]))[1:])
2131 | else:
2132 | self.pieces.append('%s;' % ref)
2133 |
2134 | def handle_entityref(self, ref):
2135 | # called for each entity reference, e.g. for '©', ref will be 'copy'
2136 | # Reconstruct the original entity reference.
2137 | if ref in name2codepoint or ref == 'apos':
2138 | self.pieces.append('&%s;' % ref)
2139 | else:
2140 | self.pieces.append('&%s' % ref)
2141 |
2142 | def handle_data(self, text):
2143 | # called for each block of plain text, i.e. outside of any tag and
2144 | # not containing any character or entity references
2145 | # Store the original text verbatim.
2146 | self.pieces.append(text)
2147 |
2148 | def handle_comment(self, text):
2149 | # called for each HTML comment, e.g.
2150 | # Reconstruct the original comment.
2151 | self.pieces.append('' % text)
2152 |
2153 | def handle_pi(self, text):
2154 | # called for each processing instruction, e.g.
2155 | # Reconstruct original processing instruction.
2156 | self.pieces.append('%s>' % text)
2157 |
2158 | def handle_decl(self, text):
2159 | # called for the DOCTYPE, if present, e.g.
2160 | #
2162 | # Reconstruct original DOCTYPE
2163 | self.pieces.append('' % text)
2164 |
2165 | _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
2166 | def _scan_name(self, i, declstartpos):
2167 | rawdata = self.rawdata
2168 | n = len(rawdata)
2169 | if i == n:
2170 | return None, -1
2171 | m = self._new_declname_match(rawdata, i)
2172 | if m:
2173 | s = m.group()
2174 | name = s.strip()
2175 | if (i + len(s)) == n:
2176 | return None, -1 # end of buffer
2177 | return name.lower(), m.end()
2178 | else:
2179 | self.handle_data(rawdata)
2180 | # self.updatepos(declstartpos, i)
2181 | return None, -1
2182 |
2183 | def convert_charref(self, name):
2184 | return '%s;' % name
2185 |
2186 | def convert_entityref(self, name):
2187 | return '&%s;' % name
2188 |
2189 | def output(self):
2190 | '''Return processed HTML as a single string'''
2191 | return ''.join([str(p) for p in self.pieces])
2192 |
2193 | def parse_declaration(self, i):
2194 | try:
2195 | return sgmllib.SGMLParser.parse_declaration(self, i)
2196 | except sgmllib.SGMLParseError:
2197 | # escape the doctype declaration and continue parsing
2198 | self.handle_data('<')
2199 | return i+1
2200 |
2201 | class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
2202 | def __init__(self, baseuri, baselang, encoding, entities):
2203 | sgmllib.SGMLParser.__init__(self)
2204 | _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
2205 | _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
2206 | self.entities=entities
2207 |
2208 | def decodeEntities(self, element, data):
2209 | data = data.replace('<', '<')
2210 | data = data.replace('<', '<')
2211 | data = data.replace('<', '<')
2212 | data = data.replace('>', '>')
2213 | data = data.replace('>', '>')
2214 | data = data.replace('>', '>')
2215 | data = data.replace('&', '&')
2216 | data = data.replace('&', '&')
2217 | data = data.replace('"', '"')
2218 | data = data.replace('"', '"')
2219 | data = data.replace(''', ''')
2220 | data = data.replace(''', ''')
2221 | if not self.contentparams.get('type', u'xml').endswith(u'xml'):
2222 | data = data.replace('<', '<')
2223 | data = data.replace('>', '>')
2224 | data = data.replace('&', '&')
2225 | data = data.replace('"', '"')
2226 | data = data.replace(''', "'")
2227 | return data
2228 |
2229 | def strattrs(self, attrs):
2230 | return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs])
2231 |
2232 | class _RelativeURIResolver(_BaseHTMLProcessor):
2233 | relative_uris = set([('a', 'href'),
2234 | ('applet', 'codebase'),
2235 | ('area', 'href'),
2236 | ('blockquote', 'cite'),
2237 | ('body', 'background'),
2238 | ('del', 'cite'),
2239 | ('form', 'action'),
2240 | ('frame', 'longdesc'),
2241 | ('frame', 'src'),
2242 | ('iframe', 'longdesc'),
2243 | ('iframe', 'src'),
2244 | ('head', 'profile'),
2245 | ('img', 'longdesc'),
2246 | ('img', 'src'),
2247 | ('img', 'usemap'),
2248 | ('input', 'src'),
2249 | ('input', 'usemap'),
2250 | ('ins', 'cite'),
2251 | ('link', 'href'),
2252 | ('object', 'classid'),
2253 | ('object', 'codebase'),
2254 | ('object', 'data'),
2255 | ('object', 'usemap'),
2256 | ('q', 'cite'),
2257 | ('script', 'src'),
2258 | ('video', 'poster')])
2259 |
2260 | def __init__(self, baseuri, encoding, _type):
2261 | _BaseHTMLProcessor.__init__(self, encoding, _type)
2262 | self.baseuri = baseuri
2263 |
2264 | def resolveURI(self, uri):
2265 | return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
2266 |
2267 | def unknown_starttag(self, tag, attrs):
2268 | attrs = self.normalize_attrs(attrs)
2269 | attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
2270 | _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
2271 |
2272 | def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
2273 | if not _SGML_AVAILABLE:
2274 | return htmlSource
2275 |
2276 | p = _RelativeURIResolver(baseURI, encoding, _type)
2277 | p.feed(htmlSource)
2278 | return p.output()
2279 |
2280 | def _makeSafeAbsoluteURI(base, rel=None):
2281 | # bail if ACCEPTABLE_URI_SCHEMES is empty
2282 | if not ACCEPTABLE_URI_SCHEMES:
2283 | try:
2284 | return _urljoin(base, rel or u'')
2285 | except ValueError:
2286 | return u''
2287 | if not base:
2288 | return rel or u''
2289 | if not rel:
2290 | try:
2291 | scheme = urlparse.urlparse(base)[0]
2292 | except ValueError:
2293 | return u''
2294 | if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
2295 | return base
2296 | return u''
2297 | try:
2298 | uri = _urljoin(base, rel)
2299 | except ValueError:
2300 | return u''
2301 | if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
2302 | return u''
2303 | return uri
2304 |
2305 | class _HTMLSanitizer(_BaseHTMLProcessor):
2306 | acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
2307 | 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
2308 | 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
2309 | 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
2310 | 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
2311 | 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
2312 | 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
2313 | 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
2314 | 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
2315 | 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
2316 | 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
2317 | 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
2318 | 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])
2319 |
2320 | acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
2321 | 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
2322 | 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
2323 | 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
2324 | 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
2325 | 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
2326 | 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
2327 | 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
2328 | 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
2329 | 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
2330 | 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
2331 | 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
2332 | 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
2333 | 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
2334 | 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
2335 | 'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
2336 | 'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
2337 | 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
2338 | 'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
2339 | 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
2340 | 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
2341 | 'width', 'wrap', 'xml:lang'])
2342 |
2343 | unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])
2344 |
2345 | acceptable_css_properties = set(['azimuth', 'background-color',
2346 | 'border-bottom-color', 'border-collapse', 'border-color',
2347 | 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
2348 | 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
2349 | 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
2350 | 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
2351 | 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
2352 | 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
2353 | 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
2354 | 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
2355 | 'white-space', 'width'])
2356 |
2357 | # survey of common keywords found in feeds
2358 | acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue',
2359 | 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
2360 | 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
2361 | 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
2362 | 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
2363 | 'transparent', 'underline', 'white', 'yellow'])
2364 |
2365 | valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
2366 | '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
2367 |
2368 | mathml_elements = set(['annotation', 'annotation-xml', 'maction', 'math',
2369 | 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
2370 | 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
2371 | 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
2372 | 'munderover', 'none', 'semantics'])
2373 |
2374 | mathml_attributes = set(['actiontype', 'align', 'columnalign', 'columnalign',
2375 | 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
2376 | 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
2377 | 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
2378 | 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
2379 | 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
2380 | 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
2381 | 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
2382 | 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'])
2383 |
2384 | # svgtiny - foreignObject + linearGradient + radialGradient + stop
2385 | svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion',
2386 | 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
2387 | 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
2388 | 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
2389 | 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
2390 | 'svg', 'switch', 'text', 'title', 'tspan', 'use'])
2391 |
2392 | # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
2393 | svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic',
2394 | 'arabic-form', 'ascent', 'attributeName', 'attributeType',
2395 | 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
2396 | 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
2397 | 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
2398 | 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
2399 | 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
2400 | 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
2401 | 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
2402 | 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
2403 | 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
2404 | 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
2405 | 'overline-position', 'overline-thickness', 'panose-1', 'path',
2406 | 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
2407 | 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
2408 | 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
2409 | 'stop-color', 'stop-opacity', 'strikethrough-position',
2410 | 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
2411 | 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
2412 | 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
2413 | 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
2414 | 'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
2415 | 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
2416 | 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
2417 | 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
2418 | 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
2419 | 'y2', 'zoomAndPan'])
2420 |
2421 | svg_attr_map = None
2422 | svg_elem_map = None
2423 |
2424 | acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule',
2425 | 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
2426 | 'stroke-opacity'])
2427 |
2428 | def reset(self):
2429 | _BaseHTMLProcessor.reset(self)
2430 | self.unacceptablestack = 0
2431 | self.mathmlOK = 0
2432 | self.svgOK = 0
2433 |
2434 | def unknown_starttag(self, tag, attrs):
2435 | acceptable_attributes = self.acceptable_attributes
2436 | keymap = {}
2437 | if not tag in self.acceptable_elements or self.svgOK:
2438 | if tag in self.unacceptable_elements_with_end_tag:
2439 | self.unacceptablestack += 1
2440 |
2441 | # add implicit namespaces to html5 inline svg/mathml
2442 | if self._type.endswith('html'):
2443 | if not dict(attrs).get('xmlns'):
2444 | if tag=='svg':
2445 | attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
2446 | if tag=='math':
2447 | attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
2448 |
2449 | # not otherwise acceptable, perhaps it is MathML or SVG?
2450 | if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
2451 | self.mathmlOK += 1
2452 | if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
2453 | self.svgOK += 1
2454 |
2455 | # chose acceptable attributes based on tag class, else bail
2456 | if self.mathmlOK and tag in self.mathml_elements:
2457 | acceptable_attributes = self.mathml_attributes
2458 | elif self.svgOK and tag in self.svg_elements:
2459 | # for most vocabularies, lowercasing is a good idea. Many
2460 | # svg elements, however, are camel case
2461 | if not self.svg_attr_map:
2462 | lower=[attr.lower() for attr in self.svg_attributes]
2463 | mix=[a for a in self.svg_attributes if a not in lower]
2464 | self.svg_attributes = lower
2465 | self.svg_attr_map = dict([(a.lower(),a) for a in mix])
2466 |
2467 | lower=[attr.lower() for attr in self.svg_elements]
2468 | mix=[a for a in self.svg_elements if a not in lower]
2469 | self.svg_elements = lower
2470 | self.svg_elem_map = dict([(a.lower(),a) for a in mix])
2471 | acceptable_attributes = self.svg_attributes
2472 | tag = self.svg_elem_map.get(tag,tag)
2473 | keymap = self.svg_attr_map
2474 | elif not tag in self.acceptable_elements:
2475 | return
2476 |
2477 | # declare xlink namespace, if needed
2478 | if self.mathmlOK or self.svgOK:
2479 | if filter(lambda (n,v): n.startswith('xlink:'),attrs):
2480 | if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
2481 | attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
2482 |
2483 | clean_attrs = []
2484 | for key, value in self.normalize_attrs(attrs):
2485 | if key in acceptable_attributes:
2486 | key=keymap.get(key,key)
2487 | # make sure the uri uses an acceptable uri scheme
2488 | if key == u'href':
2489 | value = _makeSafeAbsoluteURI(value)
2490 | clean_attrs.append((key,value))
2491 | elif key=='style':
2492 | clean_value = self.sanitize_style(value)
2493 | if clean_value:
2494 | clean_attrs.append((key,clean_value))
2495 | _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
2496 |
2497 | def unknown_endtag(self, tag):
2498 | if not tag in self.acceptable_elements:
2499 | if tag in self.unacceptable_elements_with_end_tag:
2500 | self.unacceptablestack -= 1
2501 | if self.mathmlOK and tag in self.mathml_elements:
2502 | if tag == 'math' and self.mathmlOK:
2503 | self.mathmlOK -= 1
2504 | elif self.svgOK and tag in self.svg_elements:
2505 | tag = self.svg_elem_map.get(tag,tag)
2506 | if tag == 'svg' and self.svgOK:
2507 | self.svgOK -= 1
2508 | else:
2509 | return
2510 | _BaseHTMLProcessor.unknown_endtag(self, tag)
2511 |
2512 | def handle_pi(self, text):
2513 | pass
2514 |
2515 | def handle_decl(self, text):
2516 | pass
2517 |
2518 | def handle_data(self, text):
2519 | if not self.unacceptablestack:
2520 | _BaseHTMLProcessor.handle_data(self, text)
2521 |
2522 | def sanitize_style(self, style):
2523 | # disallow urls
2524 | style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
2525 |
2526 | # gauntlet
2527 | if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
2528 | return ''
2529 | # This replaced a regexp that used re.match and was prone to pathological back-tracking.
2530 | if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
2531 | return ''
2532 |
2533 | clean = []
2534 | for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
2535 | if not value:
2536 | continue
2537 | if prop.lower() in self.acceptable_css_properties:
2538 | clean.append(prop + ': ' + value + ';')
2539 | elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
2540 | for keyword in value.split():
2541 | if not keyword in self.acceptable_css_keywords and \
2542 | not self.valid_css_values.match(keyword):
2543 | break
2544 | else:
2545 | clean.append(prop + ': ' + value + ';')
2546 | elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
2547 | clean.append(prop + ': ' + value + ';')
2548 |
2549 | return ' '.join(clean)
2550 |
2551 | def parse_comment(self, i, report=1):
2552 | ret = _BaseHTMLProcessor.parse_comment(self, i, report)
2553 | if ret >= 0:
2554 | return ret
2555 | # if ret == -1, this may be a malicious attempt to circumvent
2556 | # sanitization, or a page-destroying unclosed comment
2557 | match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
2558 | if match:
2559 | return match.end()
2560 | # unclosed comment; deliberately fail to handle_data()
2561 | return len(self.rawdata)
2562 |
2563 |
2564 | def _sanitizeHTML(htmlSource, encoding, _type):
2565 | if not _SGML_AVAILABLE:
2566 | return htmlSource
2567 | p = _HTMLSanitizer(encoding, _type)
2568 | htmlSource = htmlSource.replace(' stream
2619 |
2620 | This function lets you define parsers that take any input source
2621 | (URL, pathname to local or network file, or actual data as a string)
2622 | and deal with it in a uniform manner. Returned object is guaranteed
2623 | to have all the basic stdio read methods (read, readline, readlines).
2624 | Just .close() the object when you're done with it.
2625 |
2626 | If the etag argument is supplied, it will be used as the value of an
2627 | If-None-Match request header.
2628 |
2629 | If the modified argument is supplied, it can be a tuple of 9 integers
2630 | (as returned by gmtime() in the standard Python time module) or a date
2631 | string in any format supported by feedparser. Regardless, it MUST
2632 | be in GMT (Greenwich Mean Time). It will be reformatted into an
2633 | RFC 1123-compliant date and used as the value of an If-Modified-Since
2634 | request header.
2635 |
2636 | If the agent argument is supplied, it will be used as the value of a
2637 | User-Agent request header.
2638 |
2639 | If the referrer argument is supplied, it will be used as the value of a
2640 | Referer[sic] request header.
2641 |
2642 | If handlers is supplied, it is a list of handlers used to build a
2643 | urllib2 opener.
2644 |
2645 | if request_headers is supplied it is a dictionary of HTTP request headers
2646 | that will override the values generated by FeedParser.
2647 | """
2648 |
2649 | if hasattr(url_file_stream_or_string, 'read'):
2650 | return url_file_stream_or_string
2651 |
2652 | if isinstance(url_file_stream_or_string, basestring) \
2653 | and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
2654 | # Deal with the feed URI scheme
2655 | if url_file_stream_or_string.startswith('feed:http'):
2656 | url_file_stream_or_string = url_file_stream_or_string[5:]
2657 | elif url_file_stream_or_string.startswith('feed:'):
2658 | url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
2659 | if not agent:
2660 | agent = USER_AGENT
2661 | # Test for inline user:password credentials for HTTP basic auth
2662 | auth = None
2663 | if base64 and not url_file_stream_or_string.startswith('ftp:'):
2664 | urltype, rest = urllib.splittype(url_file_stream_or_string)
2665 | realhost, rest = urllib.splithost(rest)
2666 | if realhost:
2667 | user_passwd, realhost = urllib.splituser(realhost)
2668 | if user_passwd:
2669 | url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
2670 | auth = base64.standard_b64encode(user_passwd).strip()
2671 |
2672 | # iri support
2673 | if isinstance(url_file_stream_or_string, unicode):
2674 | url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
2675 |
2676 | # try to open with urllib2 (to use optional headers)
2677 | request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
2678 | opener = urllib2.build_opener(*tuple(handlers + [_FeedURLHandler()]))
2679 | opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
2680 | try:
2681 | return opener.open(request)
2682 | finally:
2683 | opener.close() # JohnD
2684 |
2685 | # try to open with native open function (if url_file_stream_or_string is a filename)
2686 | try:
2687 | return open(url_file_stream_or_string, 'rb')
2688 | except (IOError, UnicodeEncodeError, TypeError):
2689 | # if url_file_stream_or_string is a unicode object that
2690 | # cannot be converted to the encoding returned by
2691 | # sys.getfilesystemencoding(), a UnicodeEncodeError
2692 | # will be thrown
2693 | # If url_file_stream_or_string is a string that contains NULL
2694 | # (such as an XML document encoded in UTF-32), TypeError will
2695 | # be thrown.
2696 | pass
2697 |
2698 | # treat url_file_stream_or_string as string
2699 | if isinstance(url_file_stream_or_string, unicode):
2700 | return _StringIO(url_file_stream_or_string.encode('utf-8'))
2701 | return _StringIO(url_file_stream_or_string)
2702 |
2703 | def _convert_to_idn(url):
2704 | """Convert a URL to IDN notation"""
2705 | # this function should only be called with a unicode string
2706 | # strategy: if the host cannot be encoded in ascii, then
2707 | # it'll be necessary to encode it in idn form
2708 | parts = list(urlparse.urlsplit(url))
2709 | try:
2710 | parts[1].encode('ascii')
2711 | except UnicodeEncodeError:
2712 | # the url needs to be converted to idn notation
2713 | host = parts[1].rsplit(':', 1)
2714 | newhost = []
2715 | port = u''
2716 | if len(host) == 2:
2717 | port = host.pop()
2718 | for h in host[0].split('.'):
2719 | newhost.append(h.encode('idna').decode('utf-8'))
2720 | parts[1] = '.'.join(newhost)
2721 | if port:
2722 | parts[1] += ':' + port
2723 | return urlparse.urlunsplit(parts)
2724 | else:
2725 | return url
2726 |
2727 | def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
2728 | request = urllib2.Request(url)
2729 | request.add_header('User-Agent', agent)
2730 | if etag:
2731 | request.add_header('If-None-Match', etag)
2732 | if isinstance(modified, basestring):
2733 | modified = _parse_date(modified)
2734 | elif isinstance(modified, datetime.datetime):
2735 | modified = modified.utctimetuple()
2736 | if modified:
2737 | # format into an RFC 1123-compliant timestamp. We can't use
2738 | # time.strftime() since the %a and %b directives can be affected
2739 | # by the current locale, but RFC 2616 states that dates must be
2740 | # in English.
2741 | short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
2742 | months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
2743 | request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
2744 | if referrer:
2745 | request.add_header('Referer', referrer)
2746 | if gzip and zlib:
2747 | request.add_header('Accept-encoding', 'gzip, deflate')
2748 | elif gzip:
2749 | request.add_header('Accept-encoding', 'gzip')
2750 | elif zlib:
2751 | request.add_header('Accept-encoding', 'deflate')
2752 | else:
2753 | request.add_header('Accept-encoding', '')
2754 | if auth:
2755 | request.add_header('Authorization', 'Basic %s' % auth)
2756 | if ACCEPT_HEADER:
2757 | request.add_header('Accept', ACCEPT_HEADER)
2758 | # use this for whatever -- cookies, special headers, etc
2759 | # [('Cookie','Something'),('x-special-header','Another Value')]
2760 | for header_name, header_value in request_headers.items():
2761 | request.add_header(header_name, header_value)
2762 | request.add_header('A-IM', 'feed') # RFC 3229 support
2763 | return request
2764 |
2765 | def _parse_psc_chapter_start(start):
2766 | FORMAT = r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$'
2767 |
2768 | m = re.compile(FORMAT).match(start)
2769 | if m is None:
2770 | return None
2771 |
2772 | _, h, m, s, _, ms = m.groups()
2773 | h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0))
2774 | return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000)
2775 |
2776 | _date_handlers = []
2777 | def registerDateHandler(func):
2778 | '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
2779 | _date_handlers.insert(0, func)
2780 |
2781 | # ISO-8601 date parsing routines written by Fazal Majid.
2782 | # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2783 | # parser is beyond the scope of feedparser and would be a worthwhile addition
2784 | # to the Python library.
2785 | # A single regular expression cannot parse ISO 8601 date formats into groups
2786 | # as the standard is highly irregular (for instance is 030104 2003-01-04 or
2787 | # 0301-04-01), so we use templates instead.
2788 | # Please note the order in templates is significant because we need a
2789 | # greedy match.
2790 | _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
2791 | 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
2792 | '-YY-?MM', '-OOO', '-YY',
2793 | '--MM-?DD', '--MM',
2794 | '---DD',
2795 | 'CC', '']
2796 | _iso8601_re = [
2797 | tmpl.replace(
2798 | 'YYYY', r'(?P
\d{4})').replace(
2799 | 'YY', r'(?P\d\d)').replace(
2800 | 'MM', r'(?P[01]\d)').replace(
2801 | 'DD', r'(?P[0123]\d)').replace(
2802 | 'OOO', r'(?P[0123]\d\d)').replace(
2803 | 'CC', r'(?P\d\d$)')
2804 | + r'(T?(?P\d{2}):(?P\d{2})'
2805 | + r'(:(?P\d{2}))?'
2806 | + r'(\.(?P\d+))?'
2807 | + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?'
2808 | for tmpl in _iso8601_tmpl]
2809 | try:
2810 | del tmpl
2811 | except NameError:
2812 | pass
2813 | _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
2814 | try:
2815 | del regex
2816 | except NameError:
2817 | pass
2818 |
2819 | def _parse_date_iso8601(dateString):
2820 | '''Parse a variety of ISO-8601-compatible formats like 20040105'''
2821 | m = None
2822 | for _iso8601_match in _iso8601_matches:
2823 | m = _iso8601_match(dateString)
2824 | if m:
2825 | break
2826 | if not m:
2827 | return
2828 | if m.span() == (0, 0):
2829 | return
2830 | params = m.groupdict()
2831 | ordinal = params.get('ordinal', 0)
2832 | if ordinal:
2833 | ordinal = int(ordinal)
2834 | else:
2835 | ordinal = 0
2836 | year = params.get('year', '--')
2837 | if not year or year == '--':
2838 | year = time.gmtime()[0]
2839 | elif len(year) == 2:
2840 | # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
2841 | year = 100 * int(time.gmtime()[0] / 100) + int(year)
2842 | else:
2843 | year = int(year)
2844 | month = params.get('month', '-')
2845 | if not month or month == '-':
2846 | # ordinals are NOT normalized by mktime, we simulate them
2847 | # by setting month=1, day=ordinal
2848 | if ordinal:
2849 | month = 1
2850 | else:
2851 | month = time.gmtime()[1]
2852 | month = int(month)
2853 | day = params.get('day', 0)
2854 | if not day:
2855 | # see above
2856 | if ordinal:
2857 | day = ordinal
2858 | elif params.get('century', 0) or \
2859 | params.get('year', 0) or params.get('month', 0):
2860 | day = 1
2861 | else:
2862 | day = time.gmtime()[2]
2863 | else:
2864 | day = int(day)
2865 | # special case of the century - is the first year of the 21st century
2866 | # 2000 or 2001 ? The debate goes on...
2867 | if 'century' in params:
2868 | year = (int(params['century']) - 1) * 100 + 1
2869 | # in ISO 8601 most fields are optional
2870 | for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
2871 | if not params.get(field, None):
2872 | params[field] = 0
2873 | hour = int(params.get('hour', 0))
2874 | minute = int(params.get('minute', 0))
2875 | second = int(float(params.get('second', 0)))
2876 | # weekday is normalized by mktime(), we can ignore it
2877 | weekday = 0
2878 | daylight_savings_flag = -1
2879 | tm = [year, month, day, hour, minute, second, weekday,
2880 | ordinal, daylight_savings_flag]
2881 | # ISO 8601 time zone adjustments
2882 | tz = params.get('tz')
2883 | if tz and tz != 'Z':
2884 | if tz[0] == '-':
2885 | tm[3] += int(params.get('tzhour', 0))
2886 | tm[4] += int(params.get('tzmin', 0))
2887 | elif tz[0] == '+':
2888 | tm[3] -= int(params.get('tzhour', 0))
2889 | tm[4] -= int(params.get('tzmin', 0))
2890 | else:
2891 | return None
2892 | # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
2893 | # which is guaranteed to normalize d/m/y/h/m/s.
2894 | # Many implementations have bugs, but we'll pretend they don't.
2895 | return time.localtime(time.mktime(tuple(tm)))
2896 | registerDateHandler(_parse_date_iso8601)
2897 |
2898 | # 8-bit date handling routines written by ytrewq1.
2899 | _korean_year = u'\ub144' # b3e2 in euc-kr
2900 | _korean_month = u'\uc6d4' # bff9 in euc-kr
2901 | _korean_day = u'\uc77c' # c0cf in euc-kr
2902 | _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
2903 | _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
2904 |
2905 | _korean_onblog_date_re = \
2906 | re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
2907 | (_korean_year, _korean_month, _korean_day))
2908 | _korean_nate_date_re = \
2909 | re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
2910 | (_korean_am, _korean_pm))
2911 | def _parse_date_onblog(dateString):
2912 | '''Parse a string according to the OnBlog 8-bit date format'''
2913 | m = _korean_onblog_date_re.match(dateString)
2914 | if not m:
2915 | return
2916 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
2917 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
2918 | 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
2919 | 'zonediff': '+09:00'}
2920 | return _parse_date_w3dtf(w3dtfdate)
2921 | registerDateHandler(_parse_date_onblog)
2922 |
2923 | def _parse_date_nate(dateString):
2924 | '''Parse a string according to the Nate 8-bit date format'''
2925 | m = _korean_nate_date_re.match(dateString)
2926 | if not m:
2927 | return
2928 | hour = int(m.group(5))
2929 | ampm = m.group(4)
2930 | if (ampm == _korean_pm):
2931 | hour += 12
2932 | hour = str(hour)
2933 | if len(hour) == 1:
2934 | hour = '0' + hour
2935 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
2936 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
2937 | 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
2938 | 'zonediff': '+09:00'}
2939 | return _parse_date_w3dtf(w3dtfdate)
2940 | registerDateHandler(_parse_date_nate)
2941 |
2942 | # Unicode strings for Greek date strings
2943 | _greek_months = \
2944 | { \
2945 | u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
2946 | u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
2947 | u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
2948 | u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
2949 | u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
2950 | u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
2951 | u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
2952 | u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
2953 | u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
2954 | u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
2955 | u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
2956 | u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
2957 | u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
2958 | u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
2959 | u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
2960 | u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
2961 | u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
2962 | u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
2963 | u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
2964 | }
2965 |
2966 | _greek_wdays = \
2967 | { \
2968 | u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
2969 | u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
2970 | u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
2971 | u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
2972 | u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
2973 | u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
2974 | u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
2975 | }
2976 |
2977 | _greek_date_format_re = \
2978 | re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
2979 |
2980 | def _parse_date_greek(dateString):
2981 | '''Parse a string according to a Greek 8-bit date format.'''
2982 | m = _greek_date_format_re.match(dateString)
2983 | if not m:
2984 | return
2985 | wday = _greek_wdays[m.group(1)]
2986 | month = _greek_months[m.group(3)]
2987 | rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
2988 | {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
2989 | 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
2990 | 'zonediff': m.group(8)}
2991 | return _parse_date_rfc822(rfc822date)
2992 | registerDateHandler(_parse_date_greek)
2993 |
2994 | # Unicode strings for Hungarian date strings
2995 | _hungarian_months = \
2996 | { \
2997 | u'janu\u00e1r': u'01', # e1 in iso-8859-2
2998 | u'febru\u00e1ri': u'02', # e1 in iso-8859-2
2999 | u'm\u00e1rcius': u'03', # e1 in iso-8859-2
3000 | u'\u00e1prilis': u'04', # e1 in iso-8859-2
3001 | u'm\u00e1ujus': u'05', # e1 in iso-8859-2
3002 | u'j\u00fanius': u'06', # fa in iso-8859-2
3003 | u'j\u00falius': u'07', # fa in iso-8859-2
3004 | u'augusztus': u'08',
3005 | u'szeptember': u'09',
3006 | u'okt\u00f3ber': u'10', # f3 in iso-8859-2
3007 | u'november': u'11',
3008 | u'december': u'12',
3009 | }
3010 |
3011 | _hungarian_date_format_re = \
3012 | re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
3013 |
3014 | def _parse_date_hungarian(dateString):
3015 | '''Parse a string according to a Hungarian 8-bit date format.'''
3016 | m = _hungarian_date_format_re.match(dateString)
3017 | if not m or m.group(2) not in _hungarian_months:
3018 | return None
3019 | month = _hungarian_months[m.group(2)]
3020 | day = m.group(3)
3021 | if len(day) == 1:
3022 | day = '0' + day
3023 | hour = m.group(4)
3024 | if len(hour) == 1:
3025 | hour = '0' + hour
3026 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
3027 | {'year': m.group(1), 'month': month, 'day': day,\
3028 | 'hour': hour, 'minute': m.group(5),\
3029 | 'zonediff': m.group(6)}
3030 | return _parse_date_w3dtf(w3dtfdate)
3031 | registerDateHandler(_parse_date_hungarian)
3032 |
3033 | timezonenames = {
3034 | 'ut': 0, 'gmt': 0, 'z': 0,
3035 | 'adt': -3, 'ast': -4, 'at': -4,
3036 | 'edt': -4, 'est': -5, 'et': -5,
3037 | 'cdt': -5, 'cst': -6, 'ct': -6,
3038 | 'mdt': -6, 'mst': -7, 'mt': -7,
3039 | 'pdt': -7, 'pst': -8, 'pt': -8,
3040 | 'a': -1, 'n': 1,
3041 | 'm': -12, 'y': 12,
3042 | }
3043 | # W3 date and time format parser
3044 | # http://www.w3.org/TR/NOTE-datetime
3045 | # Also supports MSSQL-style datetimes as defined at:
3046 | # http://msdn.microsoft.com/en-us/library/ms186724.aspx
3047 | # (basically, allow a space as a date/time/timezone separator)
3048 | def _parse_date_w3dtf(datestr):
3049 | if not datestr.strip():
3050 | return None
3051 | parts = datestr.lower().split('t')
3052 | if len(parts) == 1:
3053 | # This may be a date only, or may be an MSSQL-style date
3054 | parts = parts[0].split()
3055 | if len(parts) == 1:
3056 | # Treat this as a date only
3057 | parts.append('00:00:00z')
3058 | elif len(parts) > 2:
3059 | return None
3060 | date = parts[0].split('-', 2)
3061 | if not date or len(date[0]) != 4:
3062 | return None
3063 | # Ensure that `date` has 3 elements. Using '1' sets the default
3064 | # month to January and the default day to the 1st of the month.
3065 | date.extend(['1'] * (3 - len(date)))
3066 | try:
3067 | year, month, day = [int(i) for i in date]
3068 | except ValueError:
3069 | # `date` may have more than 3 elements or may contain
3070 | # non-integer strings.
3071 | return None
3072 | if parts[1].endswith('z'):
3073 | parts[1] = parts[1][:-1]
3074 | parts.append('z')
3075 | # Append the numeric timezone offset, if any, to parts.
3076 | # If this is an MSSQL-style date then parts[2] already contains
3077 | # the timezone information, so `append()` will not affect it.
3078 | # Add 1 to each value so that if `find()` returns -1 it will be
3079 | # treated as False.
3080 | loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1
3081 | loc = loc - 1
3082 | parts.append(parts[1][loc:])
3083 | parts[1] = parts[1][:loc]
3084 | time = parts[1].split(':', 2)
3085 | # Ensure that time has 3 elements. Using '0' means that the
3086 | # minutes and seconds, if missing, will default to 0.
3087 | time.extend(['0'] * (3 - len(time)))
3088 | tzhour = 0
3089 | tzmin = 0
3090 | if parts[2][:1] in ('-', '+'):
3091 | try:
3092 | tzhour = int(parts[2][1:3])
3093 | tzmin = int(parts[2][4:])
3094 | except ValueError:
3095 | return None
3096 | if parts[2].startswith('-'):
3097 | tzhour = tzhour * -1
3098 | tzmin = tzmin * -1
3099 | else:
3100 | tzhour = timezonenames.get(parts[2], 0)
3101 | try:
3102 | hour, minute, second = [int(float(i)) for i in time]
3103 | except ValueError:
3104 | return None
3105 | # Create the datetime object and timezone delta objects
3106 | try:
3107 | stamp = datetime.datetime(year, month, day, hour, minute, second)
3108 | except ValueError:
3109 | return None
3110 | delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
3111 | # Return the date and timestamp in a UTC 9-tuple
3112 | try:
3113 | return (stamp - delta).utctimetuple()
3114 | except (OverflowError, ValueError):
3115 | # IronPython throws ValueErrors instead of OverflowErrors
3116 | return None
3117 |
3118 | registerDateHandler(_parse_date_w3dtf)
3119 |
3120 | def _parse_date_rfc822(date):
3121 | """Parse RFC 822 dates and times
3122 | http://tools.ietf.org/html/rfc822#section-5
3123 |
3124 | There are some formatting differences that are accounted for:
3125 | 1. Years may be two or four digits.
3126 | 2. The month and day can be swapped.
3127 | 3. Additional timezone names are supported.
3128 | 4. A default time and timezone are assumed if only a date is present.
3129 | """
3130 | daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
3131 | months = {
3132 | 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
3133 | 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
3134 | }
3135 |
3136 | parts = date.lower().split()
3137 | if len(parts) < 5:
3138 | # Assume that the time and timezone are missing
3139 | parts.extend(('00:00:00', '0000'))
3140 | # Remove the day name
3141 | if parts[0][:3] in daynames:
3142 | parts = parts[1:]
3143 | if len(parts) < 5:
3144 | # If there are still fewer than five parts, there's not enough
3145 | # information to interpret this
3146 | return None
3147 | try:
3148 | day = int(parts[0])
3149 | except ValueError:
3150 | # Check if the day and month are swapped
3151 | if months.get(parts[0][:3]):
3152 | try:
3153 | day = int(parts[1])
3154 | except ValueError:
3155 | return None
3156 | else:
3157 | parts[1] = parts[0]
3158 | else:
3159 | return None
3160 | month = months.get(parts[1][:3])
3161 | if not month:
3162 | return None
3163 | try:
3164 | year = int(parts[2])
3165 | except ValueError:
3166 | return None
3167 | # Normalize two-digit years:
3168 | # Anything in the 90's is interpreted as 1990 and on
3169 | # Anything 89 or less is interpreted as 2089 or before
3170 | if len(parts[2]) <= 2:
3171 | year += (1900, 2000)[year < 90]
3172 | timeparts = parts[3].split(':')
3173 | timeparts = timeparts + ([0] * (3 - len(timeparts)))
3174 | try:
3175 | (hour, minute, second) = map(int, timeparts)
3176 | except ValueError:
3177 | return None
3178 | tzhour = 0
3179 | tzmin = 0
3180 | # Strip 'Etc/' from the timezone
3181 | if parts[4].startswith('etc/'):
3182 | parts[4] = parts[4][4:]
3183 | # Normalize timezones that start with 'gmt':
3184 | # GMT-05:00 => -0500
3185 | # GMT => GMT
3186 | if parts[4].startswith('gmt'):
3187 | parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
3188 | # Handle timezones like '-0500', '+0500', and 'EST'
3189 | if parts[4] and parts[4][0] in ('-', '+'):
3190 | try:
3191 | tzhour = int(parts[4][1:3])
3192 | tzmin = int(parts[4][3:])
3193 | except ValueError:
3194 | return None
3195 | if parts[4].startswith('-'):
3196 | tzhour = tzhour * -1
3197 | tzmin = tzmin * -1
3198 | else:
3199 | tzhour = timezonenames.get(parts[4], 0)
3200 | # Create the datetime object and timezone delta objects
3201 | try:
3202 | stamp = datetime.datetime(year, month, day, hour, minute, second)
3203 | except ValueError:
3204 | return None
3205 | delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
3206 | # Return the date and timestamp in a UTC 9-tuple
3207 | try:
3208 | return (stamp - delta).utctimetuple()
3209 | except (OverflowError, ValueError):
3210 | # IronPython throws ValueErrors instead of OverflowErrors
3211 | return None
3212 | registerDateHandler(_parse_date_rfc822)
3213 |
3214 | _months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
3215 | 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
3216 | def _parse_date_asctime(dt):
3217 | """Parse asctime-style dates"""
3218 | dayname, month, day, remainder = dt.split(None, 3)
3219 | # Convert month and day into zero-padded integers
3220 | month = '%02i ' % (_months.index(month.lower()) + 1)
3221 | day = '%02i ' % (int(day),)
3222 | dt = month + day + remainder
3223 | return time.strptime(dt, '%m %d %H:%M:%S %Y')[:-1] + (0, )
3224 | registerDateHandler(_parse_date_asctime)
3225 |
3226 | def _parse_date_perforce(aDateString):
3227 | """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
3228 | # Fri, 2006/09/15 08:19:53 EDT
3229 | _my_date_pattern = re.compile( \
3230 | r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
3231 |
3232 | m = _my_date_pattern.search(aDateString)
3233 | if m is None:
3234 | return None
3235 | dow, year, month, day, hour, minute, second, tz = m.groups()
3236 | months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
3237 | dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
3238 | tm = rfc822.parsedate_tz(dateString)
3239 | if tm:
3240 | return time.gmtime(rfc822.mktime_tz(tm))
3241 | registerDateHandler(_parse_date_perforce)
3242 |
3243 | def _parse_date(dateString):
3244 | '''Parses a variety of date formats into a 9-tuple in GMT'''
3245 | if not dateString:
3246 | return None
3247 | for handler in _date_handlers:
3248 | try:
3249 | date9tuple = handler(dateString)
3250 | except (KeyError, OverflowError, ValueError):
3251 | continue
3252 | if not date9tuple:
3253 | continue
3254 | if len(date9tuple) != 9:
3255 | continue
3256 | return date9tuple
3257 | return None
3258 |
3259 | # Each marker represents some of the characters of the opening XML
3260 | # processing instruction ('
3271 | RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
3272 |
3273 | # Capture the value of the XML processing instruction's encoding attribute.
3274 | # Example:
3275 | RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
3276 |
3277 | def convert_to_utf8(http_headers, data):
3278 | '''Detect and convert the character encoding to UTF-8.
3279 |
3280 | http_headers is a dictionary
3281 | data is a raw string (not Unicode)'''
3282 |
3283 | # This is so much trickier than it sounds, it's not even funny.
3284 | # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
3285 | # is application/xml, application/*+xml,
3286 | # application/xml-external-parsed-entity, or application/xml-dtd,
3287 | # the encoding given in the charset parameter of the HTTP Content-Type
3288 | # takes precedence over the encoding given in the XML prefix within the
3289 | # document, and defaults to 'utf-8' if neither are specified. But, if
3290 | # the HTTP Content-Type is text/xml, text/*+xml, or
3291 | # text/xml-external-parsed-entity, the encoding given in the XML prefix
3292 | # within the document is ALWAYS IGNORED and only the encoding given in
3293 | # the charset parameter of the HTTP Content-Type header should be
3294 | # respected, and it defaults to 'us-ascii' if not specified.
3295 |
3296 | # Furthermore, discussion on the atom-syntax mailing list with the
3297 | # author of RFC 3023 leads me to the conclusion that any document
3298 | # served with a Content-Type of text/* and no charset parameter
3299 | # must be treated as us-ascii. (We now do this.) And also that it
3300 | # must always be flagged as non-well-formed. (We now do this too.)
3301 |
3302 | # If Content-Type is unspecified (input was local file or non-HTTP source)
3303 | # or unrecognized (server just got it totally wrong), then go by the
3304 | # encoding given in the XML prefix of the document and default to
3305 | # 'iso-8859-1' as per the HTTP specification (RFC 2616).
3306 |
3307 | # Then, assuming we didn't find a character encoding in the HTTP headers
3308 | # (and the HTTP Content-type allowed us to look in the body), we need
3309 | # to sniff the first few bytes of the XML data and try to determine
3310 | # whether the encoding is ASCII-compatible. Section F of the XML
3311 | # specification shows the way here:
3312 | # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3313 |
3314 | # If the sniffed encoding is not ASCII-compatible, we need to make it
3315 | # ASCII compatible so that we can sniff further into the XML declaration
3316 | # to find the encoding attribute, which will tell us the true encoding.
3317 |
3318 | # Of course, none of this guarantees that we will be able to parse the
3319 | # feed in the declared character encoding (assuming it was declared
3320 | # correctly, which many are not). iconv_codec can help a lot;
3321 | # you should definitely install it if you can.
3322 | # http://cjkpython.i18n.org/
3323 |
3324 | bom_encoding = u''
3325 | xml_encoding = u''
3326 | rfc3023_encoding = u''
3327 |
3328 | # Look at the first few bytes of the document to guess what
3329 | # its encoding may be. We only need to decode enough of the
3330 | # document that we can use an ASCII-compatible regular
3331 | # expression to search for an XML encoding declaration.
3332 | # The heuristic follows the XML specification, section F:
3333 | # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3334 | # Check for BOMs first.
3335 | if data[:4] == codecs.BOM_UTF32_BE:
3336 | bom_encoding = u'utf-32be'
3337 | data = data[4:]
3338 | elif data[:4] == codecs.BOM_UTF32_LE:
3339 | bom_encoding = u'utf-32le'
3340 | data = data[4:]
3341 | elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
3342 | bom_encoding = u'utf-16be'
3343 | data = data[2:]
3344 | elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
3345 | bom_encoding = u'utf-16le'
3346 | data = data[2:]
3347 | elif data[:3] == codecs.BOM_UTF8:
3348 | bom_encoding = u'utf-8'
3349 | data = data[3:]
3350 | # Check for the characters ''''
3463 | if RE_XML_DECLARATION.search(data):
3464 | data = RE_XML_DECLARATION.sub(new_declaration, data)
3465 | else:
3466 | data = new_declaration + u'\n' + data
3467 | data = data.encode('utf-8')
3468 | break
3469 | # if still no luck, give up
3470 | if not known_encoding:
3471 | error = CharacterEncodingUnknown(
3472 | 'document encoding unknown, I tried ' +
3473 | '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
3474 | (rfc3023_encoding, xml_encoding))
3475 | rfc3023_encoding = u''
3476 | elif proposed_encoding != rfc3023_encoding:
3477 | error = CharacterEncodingOverride(
3478 | 'document declared as %s, but parsed as %s' %
3479 | (rfc3023_encoding, proposed_encoding))
3480 | rfc3023_encoding = proposed_encoding
3481 |
3482 | return data, rfc3023_encoding, error
3483 |
3484 | # Match XML entity declarations.
3485 | # Example:
3486 | RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*]*?)>'), re.MULTILINE)
3487 |
3488 | # Match XML DOCTYPE declarations.
3489 | # Example:
3490 | RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*]*?)>'), re.MULTILINE)
3491 |
3492 | # Match safe entity declarations.
3493 | # This will allow hexadecimal character references through,
3494 | # as well as text, but not arbitrary nested entities.
3495 | # Example: cubed "³"
3496 | # Example: copyright "(C)"
3497 | # Forbidden: explode1 "&explode2;&explode2;"
3498 | RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(\w+;|[^&"]*)"'))
3499 |
3500 | def replace_doctype(data):
3501 | '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
3502 |
3503 | rss_version may be 'rss091n' or None
3504 | stripped_data is the same XML document with a replaced DOCTYPE
3505 | '''
3506 |
3507 | # Divide the document into two groups by finding the location
3508 | # of the first element that doesn't begin with '' or '\n\n]>')
3534 | data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
3535 |
3536 | # Precompute the safe entities for the loose parser.
3537 | safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
3538 | for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
3539 | return version, data, safe_entities
3540 |
3541 |
3542 | # GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates'
3543 | # items, or None in the case of a parsing error.
3544 |
3545 | def _parse_poslist(value, geom_type, swap=True, dims=2):
3546 | if geom_type == 'linestring':
3547 | return _parse_georss_line(value, swap, dims)
3548 | elif geom_type == 'polygon':
3549 | ring = _parse_georss_line(value, swap, dims)
3550 | return {'type': u'Polygon', 'coordinates': (ring['coordinates'],)}
3551 | else:
3552 | return None
3553 |
3554 | def _gen_georss_coords(value, swap=True, dims=2):
3555 | # A generator of (lon, lat) pairs from a string of encoded GeoRSS
3556 | # coordinates. Converts to floats and swaps order.
3557 | latlons = itertools.imap(float, value.strip().replace(',', ' ').split())
3558 | nxt = latlons.next
3559 | while True:
3560 | t = [nxt(), nxt()][::swap and -1 or 1]
3561 | if dims == 3:
3562 | t.append(nxt())
3563 | yield tuple(t)
3564 |
3565 | def _parse_georss_point(value, swap=True, dims=2):
3566 | # A point contains a single latitude-longitude pair, separated by
3567 | # whitespace. We'll also handle comma separators.
3568 | try:
3569 | coords = list(_gen_georss_coords(value, swap, dims))
3570 | return {u'type': u'Point', u'coordinates': coords[0]}
3571 | except (IndexError, ValueError):
3572 | return None
3573 |
3574 | def _parse_georss_line(value, swap=True, dims=2):
3575 | # A line contains a space separated list of latitude-longitude pairs in
3576 | # WGS84 coordinate reference system, with each pair separated by
3577 | # whitespace. There must be at least two pairs.
3578 | try:
3579 | coords = list(_gen_georss_coords(value, swap, dims))
3580 | return {u'type': u'LineString', u'coordinates': coords}
3581 | except (IndexError, ValueError):
3582 | return None
3583 |
3584 | def _parse_georss_polygon(value, swap=True, dims=2):
3585 | # A polygon contains a space separated list of latitude-longitude pairs,
3586 | # with each pair separated by whitespace. There must be at least four
3587 | # pairs, with the last being identical to the first (so a polygon has a
3588 | # minimum of three actual points).
3589 | try:
3590 | ring = list(_gen_georss_coords(value, swap, dims))
3591 | except (IndexError, ValueError):
3592 | return None
3593 | if len(ring) < 4:
3594 | return None
3595 | return {u'type': u'Polygon', u'coordinates': (ring,)}
3596 |
3597 | def _parse_georss_box(value, swap=True, dims=2):
3598 | # A bounding box is a rectangular region, often used to define the extents
3599 | # of a map or a rough area of interest. A box contains two space seperate
3600 | # latitude-longitude pairs, with each pair separated by whitespace. The
3601 | # first pair is the lower corner, the second is the upper corner.
3602 | try:
3603 | coords = list(_gen_georss_coords(value, swap, dims))
3604 | return {u'type': u'Box', u'coordinates': tuple(coords)}
3605 | except (IndexError, ValueError):
3606 | return None
3607 |
3608 | # end geospatial parsers
3609 |
3610 |
3611 | def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
3612 | '''Parse a feed from a URL, file, stream, or string.
3613 |
3614 | request_headers, if given, is a dict from http header name to value to add
3615 | to the request; this overrides internally generated values.
3616 | '''
3617 |
3618 | if handlers is None:
3619 | handlers = []
3620 | if request_headers is None:
3621 | request_headers = {}
3622 | if response_headers is None:
3623 | response_headers = {}
3624 |
3625 | result = FeedParserDict()
3626 | result['feed'] = FeedParserDict()
3627 | result['entries'] = []
3628 | result['bozo'] = 0
3629 | if not isinstance(handlers, list):
3630 | handlers = [handlers]
3631 | try:
3632 | f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
3633 | data = f.read()
3634 | except Exception, e:
3635 | result['bozo'] = 1
3636 | result['bozo_exception'] = e
3637 | data = None
3638 | f = None
3639 |
3640 | if hasattr(f, 'headers'):
3641 | result['headers'] = dict(f.headers)
3642 | # overwrite existing headers using response_headers
3643 | if 'headers' in result:
3644 | result['headers'].update(response_headers)
3645 | elif response_headers:
3646 | result['headers'] = copy.deepcopy(response_headers)
3647 |
3648 | # lowercase all of the HTTP headers for comparisons per RFC 2616
3649 | if 'headers' in result:
3650 | http_headers = dict((k.lower(), v) for k, v in result['headers'].items())
3651 | else:
3652 | http_headers = {}
3653 |
3654 | # if feed is gzip-compressed, decompress it
3655 | if f and data and http_headers:
3656 | if gzip and 'gzip' in http_headers.get('content-encoding', ''):
3657 | try:
3658 | data = gzip.GzipFile(fileobj=_StringIO(data)).read()
3659 | except (IOError, struct.error), e:
3660 | # IOError can occur if the gzip header is bad.
3661 | # struct.error can occur if the data is damaged.
3662 | result['bozo'] = 1
3663 | result['bozo_exception'] = e
3664 | if isinstance(e, struct.error):
3665 | # A gzip header was found but the data is corrupt.
3666 | # Ideally, we should re-request the feed without the
3667 | # 'Accept-encoding: gzip' header, but we don't.
3668 | data = None
3669 | elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
3670 | try:
3671 | data = zlib.decompress(data)
3672 | except zlib.error, e:
3673 | try:
3674 | # The data may have no headers and no checksum.
3675 | data = zlib.decompress(data, -15)
3676 | except zlib.error, e:
3677 | result['bozo'] = 1
3678 | result['bozo_exception'] = e
3679 |
3680 | # save HTTP headers
3681 | if http_headers:
3682 | if 'etag' in http_headers:
3683 | etag = http_headers.get('etag', u'')
3684 | if not isinstance(etag, unicode):
3685 | etag = etag.decode('utf-8', 'ignore')
3686 | if etag:
3687 | result['etag'] = etag
3688 | if 'last-modified' in http_headers:
3689 | modified = http_headers.get('last-modified', u'')
3690 | if modified:
3691 | result['modified'] = modified
3692 | result['modified_parsed'] = _parse_date(modified)
3693 | if hasattr(f, 'url'):
3694 | if not isinstance(f.url, unicode):
3695 | result['href'] = f.url.decode('utf-8', 'ignore')
3696 | else:
3697 | result['href'] = f.url
3698 | result['status'] = 200
3699 | if hasattr(f, 'status'):
3700 | result['status'] = f.status
3701 | if hasattr(f, 'close'):
3702 | f.close()
3703 |
3704 | if data is None:
3705 | return result
3706 |
3707 | # Stop processing if the server sent HTTP 304 Not Modified.
3708 | if getattr(f, 'code', 0) == 304:
3709 | result['version'] = u''
3710 | result['debug_message'] = 'The feed has not changed since you last checked, ' + \
3711 | 'so the server sent no data. This is a feature, not a bug!'
3712 | return result
3713 |
3714 | data, result['encoding'], error = convert_to_utf8(http_headers, data)
3715 | use_strict_parser = result['encoding'] and True or False
3716 | if error is not None:
3717 | result['bozo'] = 1
3718 | result['bozo_exception'] = error
3719 |
3720 | result['version'], data, entities = replace_doctype(data)
3721 |
3722 | # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
3723 | contentloc = http_headers.get('content-location', u'')
3724 | href = result.get('href', u'')
3725 | baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
3726 |
3727 | baselang = http_headers.get('content-language', None)
3728 | if not isinstance(baselang, unicode) and baselang is not None:
3729 | baselang = baselang.decode('utf-8', 'ignore')
3730 |
3731 | if not _XML_AVAILABLE:
3732 | use_strict_parser = 0
3733 | if use_strict_parser:
3734 | # initialize the SAX parser
3735 | feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
3736 | saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
3737 | saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
3738 | try:
3739 | # disable downloading external doctype references, if possible
3740 | saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
3741 | except xml.sax.SAXNotSupportedException:
3742 | pass
3743 | saxparser.setContentHandler(feedparser)
3744 | saxparser.setErrorHandler(feedparser)
3745 | source = xml.sax.xmlreader.InputSource()
3746 | source.setByteStream(_StringIO(data))
3747 | try:
3748 | saxparser.parse(source)
3749 | except xml.sax.SAXException, e:
3750 | result['bozo'] = 1
3751 | result['bozo_exception'] = feedparser.exc or e
3752 | use_strict_parser = 0
3753 | if not use_strict_parser and _SGML_AVAILABLE:
3754 | feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
3755 | feedparser.feed(data.decode('utf-8', 'replace'))
3756 | result['feed'] = feedparser.feeddata
3757 | result['entries'] = feedparser.entries
3758 | result['version'] = result['version'] or feedparser.version
3759 | result['namespaces'] = feedparser.namespacesInUse
3760 | return result
3761 |
3762 | # The list of EPSG codes for geographic (latitude/longitude) coordinate
3763 | # systems to support decoding of GeoRSS GML profiles.
3764 | _geogCS = [
3765 | 3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008,
3766 | 4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022,
3767 | 4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036,
3768 | 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081,
3769 | 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132,
3770 | 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145,
3771 | 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158,
3772 | 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171,
3773 | 4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185,
3774 | 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200,
3775 | 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213,
3776 | 4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227,
3777 | 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240,
3778 | 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253,
3779 | 4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266,
3780 | 4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279,
3781 | 4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293,
3782 | 4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307,
3783 | 4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322,
3784 | 4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603,
3785 | 4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616,
3786 | 4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629,
3787 | 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642,
3788 | 4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665,
3789 | 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678,
3790 | 4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691,
3791 | 4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704,
3792 | 4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717,
3793 | 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730,
3794 | 4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743,
3795 | 4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756,
3796 | 4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804,
3797 | 4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818,
3798 | 4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979 ]
--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
1 | indexes:
2 |
3 | # AUTOGENERATED
4 |
5 | # This index.yaml is automatically updated whenever the dev_appserver
6 | # detects that a new type of query is run. If you want to manage the
7 | # index.yaml file manually, remove the above marker line (the line
8 | # saying "# AUTOGENERATED"). If you want to manage some indexes
9 | # manually, move them above the marker line. The index.yaml file is
10 | # automatically uploaded to the admin console when you next deploy
11 | # your application using appcfg.py.
12 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2007 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | import os
19 | import logging
20 | import webapp2
21 | import json
22 |
23 | from google.appengine.api import memcache
24 | from google.appengine.api import urlfetch
25 | from google.appengine.ext.webapp import template
26 |
27 | import extractlinks
28 | from extractlinks import LinkExtractor
29 | import feedparser
30 | import re
31 | import urlparse
32 |
33 | class MainHandler(webapp2.RequestHandler):
34 |
35 | def render_json(self, obj):
36 | self.response.headers["Content-Type"] = 'text/javascript'
37 | if self.request.get("callback"):
38 | self.response.write(self.request.get("callback") + "(" + json.dumps(obj) + ")")
39 | else:
40 | self.response.write(json.dumps(obj))
41 |
42 | # Correct feed urls with rel="self" and add hubs
43 | def extend_feed(self, feed, links):
44 | feed_self = next((l for l in links if l['rel'] == 'self'), None)
45 | if feed_self is not None:
46 | feed['href'] = feed_self['href']
47 | feed['type'] = feed_self['type']
48 | feed['hubs'] = [l for l in links if l['rel'] == 'hub']
49 |
50 | def get(self):
51 | # We need to clean up the url first and remove any fragment
52 | site_url = urlparse.urldefrag(self.request.get("url"))[0]
53 | force = (self.request.get("force").lower()) in ['true', '1']
54 | extend = (self.request.get("extend").lower()) in ['true', '1']
55 | feeds = [] # default value
56 |
57 | if site_url:
58 | feeds = memcache.get(site_url + "." + str(extend))
59 | if feeds is not None and not force:
60 | # good
61 | logging.debug("Memcache hit.")
62 | self.render_json(feeds)
63 | else:
64 | logging.debug("Memcache miss.")
65 | try:
66 | result = urlfetch.fetch(url=site_url, deadline=10)
67 | parser = LinkExtractor()
68 | parser.set_base_url(site_url)
69 | parser.feed(result.content)
70 | if parser.links:
71 | feeds = parser.links
72 | else:
73 | feeds = []
74 |
75 | if not feeds:
76 | # Let's check if by any chance this is actually not a feed?
77 | data = feedparser.parse(result.content)
78 | if data.bozo == 0:
79 | feed = {'title': data.feed.get('title', ''), 'rel': 'self', 'type': 'application/atom+xml', 'href': site_url}
80 | links = data.feed.get('links', [])
81 | if extend:
82 | self.extend_feed(feed, links)
83 | feeds = [feed]
84 | else:
85 | if extend:
86 | for f in feeds:
87 | data = feedparser.parse(f['href'])
88 | links = data.feed.get('links', [])
89 | self.extend_feed(f, links)
90 |
91 | except:
92 | feeds = []
93 |
94 | if not memcache.set(site_url + "." + str(extend), feeds, 86400):
95 | logging.error("Memcache set failed.")
96 | else:
97 | logging.debug("Memcache set.")
98 | self.render_json(feeds)
99 |
100 | else:
101 | self.response.write(template.render(os.path.join(os.path.dirname(__file__), 'templates', "index.html"), {}))
102 |
103 | app = webapp2.WSGIApplication([('/', MainHandler)], debug=True)
104 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
6 | Feediscovery
7 |
8 |
9 |
10 | Feediscovery
11 |
12 | Let the music play!
13 |
14 | Query:
15 | GET http://feediscovery.appspot.com/?url=http://blog.superfeedr.com
16 | You can as well add a callback parameter.
17 | Response:
18 |
19 | [{"href":"http://blog.superfeedr.com/atom.xml","title":"Superfeedr' thoughts","rel":"alternate","type":"application/atom+xml"}]
20 |
21 |
22 | Brought to you by Superfeedr
|
23 | Learn more | Make it better!
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------