tag within
tag within
FooBar *
* should pop to 'p', not 'b'. 1220 |
Foo
| * | * should pop to 'tr', not the first 'td'
1226 | """
1227 |
1228 | nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1229 | isNestable = nestingResetTriggers != None
1230 | isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1231 | popTo = None
1232 | inclusive = True
1233 | for i in range(len(self.tagStack)-1, 0, -1):
1234 | p = self.tagStack[i]
1235 | if (not p or p.name == name) and not isNestable:
1236 | #Non-nestable tags get popped to the top or to their
1237 | #last occurance.
1238 | popTo = name
1239 | break
1240 | if (nestingResetTriggers != None
1241 | and p.name in nestingResetTriggers) \
1242 | or (nestingResetTriggers == None and isResetNesting
1243 | and self.RESET_NESTING_TAGS.has_key(p.name)):
1244 |
1245 | #If we encounter one of the nesting reset triggers
1246 | #peculiar to this tag, or we encounter another tag
1247 | #that causes nesting to reset, pop up to but not
1248 | #including that tag.
1249 | popTo = p.name
1250 | inclusive = False
1251 | break
1252 | p = p.parent
1253 | if popTo:
1254 | self._popToTag(popTo, inclusive)
1255 |
def unknown_starttag(self, name, attrs, selfClosing=0):
    """Handle a start tag, normally by adding a new Tag to the tree.

    name -- the tag name.
    attrs -- a sequence of (attribute name, value) pairs.
    selfClosing -- if true, the tag is popped again as soon as it
      has been pushed.

    While a quote tag is open (self.quoteStack is non-empty) the tag
    is not treated as markup: its text is rebuilt from name and attrs
    and passed to handle_data instead.

    Returns the new Tag. Returns None when the tag was re-emitted as
    text or was rejected by self.parseOnlyThese."""
    if self.quoteStack:
        #This is not a real tag; reproduce it as character data.
        attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
        self.handle_data('<%s%s>' % (name, attrs))
        return
    self.endData()

    if not self.isSelfClosingTag(name) and not selfClosing:
        self._smartPop(name)

    #The parseOnlyThese filter is only consulted while the tag stack
    #holds at most one entry.
    if self.parseOnlyThese and len(self.tagStack) <= 1 \
           and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
        return

    tag = Tag(self, name, attrs, self.currentTag, self.previous)
    #Link the new tag after the previously created element.
    if self.previous:
        self.previous.next = tag
    self.previous = tag
    self.pushTag(tag)
    if selfClosing or self.isSelfClosingTag(name):
        self.popTag()
    if name in self.QUOTE_TAGS:
        #From here on, tags are passed through as text until the
        #matching end tag is seen (see the quoteStack check above).
        self.quoteStack.append(name)
        self.literal = 1
    return tag
1285 |
def unknown_endtag(self, name):
    """Handle an end tag.

    name -- the tag name.

    While a quote tag is open and this end tag does not match the
    innermost one, the end tag is not real markup: it is passed to
    handle_data as the text '</name>'. Otherwise pending data is
    flushed with endData, the tag stack is popped back to the tag
    called name, and a matching entry is removed from quoteStack."""
    if self.quoteStack and self.quoteStack[-1] != name:
        #This is not a real end tag. Reproduce it in full, including
        #the leading '</', mirroring what unknown_starttag does for
        #fake start tags.
        self.handle_data('</%s>' % name)
        return
    self.endData()
    self._popToTag(name)
    if self.quoteStack and self.quoteStack[-1] == name:
        self.quoteStack.pop()
        #Stay in literal mode only while some quote tag is still open.
        self.literal = (len(self.quoteStack) > 0)
1298 |
def handle_data(self, data):
    """Append a piece of character data to self.currentData."""
    self.currentData.append(data)
1301 |
def _toStringSubclass(self, text, subclass):
    """Adds a certain piece of text to the tree as a NavigableString
    subclass.

    text -- the text to add.
    subclass -- passed to the final endData call."""
    #Flush whatever data is pending so that text ends up on its own.
    self.endData()
    self.handle_data(text)
    self.endData(subclass)
1308 |
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later.

    text -- the text of the processing instruction. If it starts
      with "xml" it is replaced wholesale by a fixed XML declaration
      containing the %SOUP-ENCODING% placeholder."""
    if text[:3] == "xml":
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
1316 |
def handle_comment(self, text):
    "Handle comments as Comment objects."
    self._toStringSubclass(text, Comment)
1320 |
def handle_charref(self, ref):
    """Handle character references as data.

    ref -- the reference without its '&#' prefix and ';' suffix.

    If self.convertEntities is set, ref is read as a decimal code
    point and the corresponding character is passed to handle_data.
    Otherwise the reference is passed through as '&#ref;'."""
    if self.convertEntities:
        data = unichr(int(ref))
    else:
        #Rebuild the complete reference, including the '&#' prefix.
        data = '&#%s;' % ref
    self.handle_data(data)
1328 |
def handle_entityref(self, ref):
    """Handle entity references as data, possibly converting known
    HTML and/or XML entity references to the corresponding Unicode
    characters.

    ref -- the entity name without the leading '&' or trailing ';'.

    Exactly one string is passed to handle_data: the converted
    character, '&amp;' followed by ref for a name unknown to both
    tables while HTML conversion is on, or '&ref;' unchanged."""
    data = None
    if self.convertHTMLEntities:
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            pass

    if not data and self.convertXMLEntities:
        data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

    if not data and self.convertHTMLEntities and \
        not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
        # TODO: We've got a problem here. We're told this is
        # an entity reference, but it's not an XML entity
        # reference or an HTML entity reference. Nonetheless,
        # the logical thing to do is to pass it through as an
        # unrecognized entity reference.
        #
        # Except: when the input is "&carol;" this function
        # will be called with input "carol". When the input is
        # "AT&T", this function will be called with input
        # "T". We have no way of knowing whether a semicolon
        # was present originally, so we don't know whether
        # this is an unknown entity or just a misplaced
        # ampersand.
        #
        # The more common case is a misplaced ampersand, so I
        # escape the ampersand and omit the trailing semicolon.
        data = "&amp;%s" % ref
    if not data:
        # This case is different from the one above, because we
        # haven't already gone through a supposedly comprehensive
        # mapping of entities to Unicode characters. We might not
        # have gone through any mapping at all. So the chances are
        # very high that this is a real entity, and not a
        # misplaced ampersand.
        data = "&%s;" % ref
    self.handle_data(data)
1371 |
def handle_decl(self, data):
    "Handle DOCTYPEs and the like as Declaration objects."
    self._toStringSubclass(data, Declaration)
1375 |
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object.

    i -- index into self.rawdata at which the declaration starts.

    Returns the index at which parsing should resume."""
    j = None
    if self.rawdata[i:i+9] == '<![CDATA[':
        k = self.rawdata.find(']]>', i)
        if k == -1:
            #Unterminated CDATA section: take everything up to the
            #end of the buffer.
            k = len(self.rawdata)
        #Skip the 9-character '<![CDATA[' opener.
        data = self.rawdata[i+9:k]
        #Resume three characters past k, i.e. after ']]>' when the
        #terminator was found.
        j = k+3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            #The base parser rejected the declaration; pass the rest
            #of the buffer through as plain data.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
1395 |
1396 | class BeautifulSoup(BeautifulStoneSoup):
1397 |
1398 | """This parser knows the following facts about HTML:
1399 |
1400 | * Some tags have no closing tag and should be interpreted as being
1401 | closed as soon as they are encountered.
1402 |
1403 | * The text inside some tags (ie. 'script') may contain tags which
1404 | are not really part of the document and which should be parsed
1405 | as text, not tags. If you want to parse the text as tags, you can
1406 | always fetch it and parse it explicitly.
1407 |
1408 | * Tag nesting rules:
1409 |
1410 | Most tags can't be nested at all. For instance, the occurance of
1411 | a tag should implicitly close the previous tag. 1412 | 1413 | Para1 Para2 1414 | should be transformed into: 1415 | Para1 Para2 1416 | 1417 | Some tags can be nested arbitrarily. For instance, the occurance 1418 | of a tag should _not_ implicitly close the previous 1419 |tag. 1420 | 1421 | Alice said:Bob said:Blah 1422 | should NOT be transformed into: 1423 | Alice said:Bob said:Blah 1424 | 1425 | Some tags can be nested, but the nesting is reset by the 1426 | interposition of other tags. For instance, a |